In [1]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# re-imported before each cell executes and edits to project code take effect
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import os
import sys

# Make the notebook's parent directory importable (presumably where the
# project's `models` package lives -- confirm against the repo layout).
# The membership check keeps re-running this cell from adding duplicates.
module_path = os.path.abspath(os.pardir)
if module_path not in sys.path:
    sys.path.append(module_path)

In [3]:
# Identifies which article dump to analyse: this string is passed to
# load_article_from_datasets, which interpolates it into the CSV file name.
article_date = '2018-12-21'

In [4]:
import ast
import warnings
from datetime import datetime

import pandas as pd
from sherlockml import datasets
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors

from models.article import Article

In [30]:
def load_article_from_datasets(foo):
    """Load one article dump and turn each CSV row into an ``Article``.

    Opens ``/input/article_content/{foo}.csv`` via ``datasets.open`` and
    parses it as tab-separated UTF-8 text, then builds one ``Article`` per
    row. Loading is best-effort: a row that cannot be converted (for example
    because ``named_entities`` is not a valid Python literal, or a column is
    missing) is skipped. Skips are no longer silent -- one summary warning
    reports how many rows were dropped and the first error encountered.

    Args:
        foo: File stem of the CSV to read; this notebook passes a date string
            such as ``'2018-12-21'``.

    Returns:
        list: The ``Article`` objects built from the rows that converted
        successfully, in file order.
    """
    path = f'/input/article_content/{foo}.csv'
    with datasets.open(path) as f:
        df = pd.read_csv(f, sep='\t', encoding='utf-8')

    articles = []
    skipped = 0
    first_error = None
    for _, row in df.iterrows():
        try:
            articles.append(Article(
                row['article_url'],
                row['article_title'],
                row['article_description'],
                row['source_id'],
                row['published_at'],
                row['article_uuid'],
                # The column holds the repr of a Python literal; literal_eval
                # parses it without executing arbitrary code.
                ast.literal_eval(row['named_entities']),
                # This positional slot is intentionally left empty; its
                # meaning is defined by Article, which is not visible here.
                None,
                row['raw_content']
            ))
        except Exception as exc:
            # Catch Exception rather than using a bare except so that
            # KeyboardInterrupt / SystemExit still propagate.
            skipped += 1
            if first_error is None:
                first_error = exc

    if skipped:
        warnings.warn(
            f'Skipped {skipped} of {len(df)} rows in {path}; '
            f'first error: {first_error!r}',
            stacklevel=2,
        )
    return articles

In [31]:
# Build the corpus for the selected dump. Rows the loader cannot convert into
# an Article are skipped, so this list may be shorter than the CSV.
articles = load_article_from_datasets(article_date)

In [33]:
# Find the corpus articles whose named entities are closest (in TF-IDF space)
# to those of a single query article.
test_url = 'https://www.bbc.co.uk/news/world-us-canada-46657393'
# Only the URL and a timestamp are supplied; the code below relies on
# test_article.named_entities being populated by Article itself
# (presumably from the URL -- confirm in models.article).
test_article = Article(test_url, '', '', '', datetime.now())

# One "document" per article: its named entities joined by spaces. The query
# article is appended last so that its row index is known.
named_entities_list = [' '.join(article.named_entities) for article in articles]
named_entities_list.append(' '.join(test_article.named_entities))
test_idx = len(named_entities_list) - 1

# TF-IDF matrix (fitted on corpus + query together)
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(named_entities_list)

# Fit KNN. Ask for 10 neighbours (9 results plus the query itself), but never
# more than there are documents, which kneighbors() would reject.
n_neighbors = min(10, len(named_entities_list))
nbrs = NearestNeighbors(n_neighbors=n_neighbors)
nbrs.fit(tfidf_matrix)

# Predict
test_row = tfidf_matrix.getrow(test_idx)
distances, indices = nbrs.kneighbors(test_row)

# Format predictions.
# Drop the query by its index rather than by position: when the corpus holds
# an article with identical entities (distance 0), the query is not guaranteed
# to be the first neighbour, and its index is out of range for `articles`.
neighbours = [
    (dist, idx)
    for dist, idx in zip(distances.flatten(), indices.flatten())
    if idx != test_idx
][:n_neighbors - 1]
similar_articles = [articles[idx] for _, idx in neighbours]

df = pd.DataFrame({
    'distance': [dist for dist, _ in neighbours],
    'titles': [article.title for article in similar_articles],
    'named_entities': [article.named_entities for article in similar_articles],
    'url': [article.url for article in similar_articles],
})
# None means "do not truncate cell contents"; the older -1 sentinel was
# deprecated in pandas 1.0 and is rejected by later releases.
pd.set_option('display.max_colwidth', None)
print(df)

938
   distance  \
0  0.000000   
1  0.776288   
2  0.776288   
3  0.776288   
4  0.807353   
5  0.846096   
6  0.846096   
7  0.899571   
8  0.947355   

                                                                           titles  \
0  US government shutdown looms over border wall row                                
1  UPDATE 1-U.S. government partially shut down in fight over Trump's border wall   
2  Government partially shut down in fight over Trump's border wall                 
3  U.S. government partially shut down in fight over Trump's border wall            
4  Trump and Democrats play blame game over government shutdown                     
5  'TWAS THE SHUTDOWN BEFORE CHRISTMAS...                                           
6  The Latest: Negotiations expected to resume over shutdown                        
7  U.S. government partially shut down in fight over Trump's border wall            
8  America’s government shuts down, once again                                  