In [7]:
# Enable IPython's autoreload extension. Mode 2 reloads all imported modules
# before each cell executes, so edits to project modules are picked up
# without restarting the kernel.
# (Comments are kept on their own lines: anything after a line magic is
# passed to the magic as its argument.)
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [4]:
import os
import sys
# Make the parent of the current working directory importable so that
# project packages located there can be imported from this notebook.
# Guarded so that re-running the cell does not add duplicate entries.
module_path = os.path.abspath(os.pardir)
if module_path not in sys.path:
    sys.path.append(module_path)

In [5]:
# Publication window used when loading articles, as ISO-8601 timestamps.
# The lower bound is inclusive and the upper bound exclusive (see the
# `>=` / `<` comparison in the SQL echoed by the loading cell).
from_date, to_date = '2018-08-14T00:00:00', '2018-08-15T00:00:00'

In [6]:
import pandas as pd
from datetime import datetime
from sklearn.neighbors import NearestNeighbors
from sklearn.feature_extraction.text import TfidfVectorizer

from models.article import Article

In [11]:
# Load the articles published inside the [from_date, to_date) window; the
# query echoed in the cell output filters on `published_at`.
articles = Article.load_articles_from_db(from_date, to_date)
# Trailing expression: display how many articles were loaded.
len(articles)

SELECT * FROM articles WHERE published_at >= '2018-08-14T00:00:00' AND published_at < '2018-08-15T00:00:00'


1864

In [18]:
# Find the loaded articles most similar to a test article by comparing TF-IDF
# vectors built from each article's named entities.
test_url = 'https://www.bbc.co.uk/news/uk-politics-45208358'
test_article = Article(test_url, '', '', '', datetime.now())

# One whitespace-joined "document" of named entities per article. The test
# article is appended last so it is vectorised with the same vocabulary.
named_entities_list = [' '.join(article.named_entities) for article in articles]
named_entities_list.append(' '.join(test_article.named_entities))
test_idx = len(named_entities_list) - 1  # row of the test article in the matrix

# TF-IDF matrix
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(named_entities_list)

# Fit KNN. Never ask for more neighbours than there are fitted rows, which
# scikit-learn rejects with a ValueError.
nbrs = NearestNeighbors(n_neighbors=min(10, tfidf_matrix.shape[0]))
nbrs.fit(tfidf_matrix)

# Predict
test_row = tfidf_matrix.getrow(test_idx)
distances, indices = nbrs.kneighbors(test_row)

# Format predictions. The query row is part of the fitted matrix, so it is
# returned as one of its own neighbours. Drop it by index rather than by
# position: on a distance tie (e.g. another article with an identical vector)
# it is not guaranteed to come back first, and its index has no counterpart
# in `articles`, so `articles[test_idx]` would raise IndexError.
neighbour_indices = indices.flatten()
is_other_article = neighbour_indices != test_idx
similar_articles = [articles[idx] for idx in neighbour_indices[is_other_article]]

df = pd.DataFrame({
    'distance': distances.flatten()[is_other_article],
    'titles': [article.title for article in similar_articles],
    'named_entities': [article.named_entities for article in similar_articles],
    'url': [article.url for article in similar_articles],
})

# `None` disables column truncation. The former sentinel -1 was deprecated in
# pandas 1.0 and is rejected by later releases.
pd.set_option('display.max_colwidth', None)
print(df)

   distance  \
0  0.982914   
1  0.988990   
2  1.054206   
3  1.072476   
4  1.089114   
5  1.133798   
6  1.133871   
7  1.216241   
8  1.216255   

                                                                                                          titles  \
0  Labour frustrated as message drowned out by Corbyn wreath row                                                   
1  Jeremy Corbyn will not apologise over Tunisia wreath row                                                        
2  To put antisemitism claims behind him, Jeremy Corbyn needs to state clearly his position on Israel              
3  The Corbyn wreath ‘scandal’ is just an exercise in hypocrisy | Owen Jones                                       
4  No, this Netanyahu row won"t destroy Corbyn – it will only make him stronger                                    
5  Tory peer admits he was also at Palestinian conference at centre of Jeremy Corbyn"s wreath-laying controversy   
6  Netanyahu is a brutal bully – but 