In [1]:
# Enable the autoreload extension in mode 2, so imported modules are reloaded
# before each cell executes and edits to project code are picked up without
# restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import os
import sys
# Put the notebook's working directory on the import path (once), so that
# packages living next to the notebook can be imported.
module_path = os.path.abspath(os.curdir)
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

In [3]:
# Publication window of the articles to load, as ISO-8601 timestamps.
# The query shown in the load cell's output filters with
# `published_at >= from_date AND published_at < to_date`, so the end bound
# is exclusive.
from_date = '2018-08-14T00:00:00'
to_date = '2018-08-15T00:00:00'

In [4]:
import pandas as pd
from datetime import datetime
from sklearn.neighbors import NearestNeighbors
from sklearn.feature_extraction.text import TfidfVectorizer

from models.article import Article

In [5]:
# Load the articles published inside the [from_date, to_date) window from the
# database (the cell output shows the SELECT that is issued).
articles = Article.load_articles_from_db(from_date, to_date)
# Bare last expression: displays how many articles were loaded.
len(articles)

SELECT * FROM articles WHERE published_at >= '2018-08-14T00:00:00' AND published_at < '2018-08-15T00:00:00'


1864

In [6]:
# Find the stored articles most similar to a test article by comparing
# TF-IDF vectors built from each article's named entities.
test_url = 'http://www.dailymail.co.uk/news/article-6058937/Huge-section-highway-bridge-collapses-Genoa.html'
# NOTE(review): only the URL is filled in here; the three empty strings and
# the timestamp look like placeholders -- confirm against Article's constructor.
test_article = Article(test_url, '', '', '', datetime.now())

# List of named entities: one space-joined string per article. The test
# article is appended last, so its row index equals len(articles).
named_entities_list = [' '.join(article.named_entities) for article in articles]
named_entities_list.append(' '.join(test_article.named_entities))
test_idx = len(articles)

# TF-IDF matrix (fitted on the stored articles plus the test article)
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(named_entities_list)

# Fit KNN. The test row is part of the fitted data and therefore comes back
# as one of its own neighbours, so one extra neighbour is requested. The
# count is capped at the number of rows so a small corpus does not raise.
n_similar = 8
nbrs = NearestNeighbors(n_neighbors=min(n_similar + 1, len(named_entities_list)))
nbrs.fit(tfidf_matrix)

# Predict
test_row = tfidf_matrix.getrow(test_idx)
distances, indices = nbrs.kneighbors(test_row)

# Format predictions.
# Drop the test article by index rather than by position: when a stored
# article has identical named entities (distance 0), the test row is not
# guaranteed to be the first neighbour, and indexing `articles` with
# `test_idx` would be out of range.
is_other_article = indices.flatten() != test_idx
neighbor_indices = indices.flatten()[is_other_article][:n_similar]
neighbor_distances = distances.flatten()[is_other_article][:n_similar]
similar_articles = [articles[idx] for idx in neighbor_indices]

df = pd.DataFrame({
    'distance': neighbor_distances,
    'titles': [article.title for article in similar_articles],
    'named_entities': [article.named_entities for article in similar_articles],
    'url': [article.url for article in similar_articles],
})
# None disables column-width truncation; the former -1 sentinel was
# deprecated in pandas 1.0 and is rejected by current releases.
pd.set_option('display.max_colwidth', None)
print(df)

   distance  \
0  0.728980   
1  0.861310   
2  0.864373   
3  0.867383   
4  0.905327   
5  0.925451   
6  0.925451   
7  0.925451   

                                                                   titles  \
0  The Latest: Vehicles involved in highway collapse in Italy               
1  Motorway bridge collapses in Italy                                       
2  Italian motorway bridge collapses near Genoa                             
3  Reports: Raised highway collapses in Italian city of Genoa               
4  Genoa bridge collapse: What might be responsible?                        
5  Italy motorway bridge collapses near Genoa, so far no victims reported   
6  Italy motorway bridge collapses near Genoa, so far no victims reported   
7  Italy motorway bridge collapses over Genoa, "dozens" feared dead         

                                                                                                                                                                          