In [1]:
from app.models import Session, Headline, Article, Agency
from app.queries import Queries
# Open a session via the project's Session factory
# (presumably a database/ORM session from app.models — TODO confirm).
# NOTE(review): the session is never closed in the visible cells.
s = Session()

# Run the project's "today's articles" query against the session and
# materialise it with .all() (presumably a list of Article objects — verify).
articles = Queries.get_todays_articles(s).all()
# Bare last expression: the notebook renders the whole result as the cell output.
articles


In [2]:
import pandas as pd
# One row per article: latest headline title, last-access timestamp, agency name.
df = pd.DataFrame(
    data=[
        [article.most_recent_headline().title, article.last_accessed, article.agency.name]
        for article in articles
    ],
    columns=['headline', 'last_accessed', 'agency'],
)
# Preview the first few rows only.
df.head()

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer
from app.analysis.pipelines import Pipelines, prepare, trem, tnorm, STOPWORDS

# Text-cleaning steps applied, in order, to each headline by `prepare`.
# Case handling lives here (not in the vectorizer) so that the camel-case
# splitter still sees the original capitalisation.
pipeline = [
    # NOTE(review): assumes split_camelcase keys on upper-case letters, which is
    # why it has to run before lower-casing — confirm against app.analysis.pipelines.
    Pipelines.split_camelcase,
    str.lower,
    tnorm.hyphenated_words,
    tnorm.quotation_marks,
    tnorm.unicode,
    tnorm.whitespace,
    trem.accents,
    trem.brackets,
    trem.punctuation,
    Pipelines.tokenize,
    Pipelines.decontract,
    lambda x: Pipelines.remove_stop(x, STOPWORDS),
    # No final ' '.join(...): a TfidfVectorizer tokenizer is expected to return
    # the tokens themselves, not a re-joined string.
]

tfidf = TfidfVectorizer(
    tokenizer=lambda x: prepare(x, pipeline),
    # The vectorizer's built-in preprocessor lower-cases text *before* the
    # tokenizer is called; disable it so `pipeline` receives the raw headline
    # and performs the lower-casing itself at the right point.
    lowercase=False,
    # token_pattern is ignored when a custom tokenizer is supplied; passing None
    # avoids scikit-learn's "token_pattern will not be used" UserWarning.
    token_pattern=None,
    # Cap the vocabulary at the 1000 most frequent terms across the corpus.
    max_features=1000,
)
# Learn vocabulary + IDF weights and build the sparse document-term matrix
# (one row per headline).
dt = tfidf.fit_transform(df['headline'])

In [5]:
from sklearn.metrics.pairwise import cosine_similarity

In [6]:
# Pairwise cosine similarity between every pair of headline TF-IDF vectors.
# With a single argument, scikit-learn compares the matrix against itself.
sim = cosine_similarity(dt)
sim

In [8]:
# Label both axes of the similarity matrix with the headline text so that
# entry (row, column) reads as "similarity of row headline to column headline".
simdf = pd.DataFrame(
    data=sim,
    index=df['headline'],
    columns=df['headline'],
)
simdf

In [14]:
# For each headline, record the most similar *other* headline in a new column.
#
# The self-match is excluded by position (the matrix diagonal) rather than by
# testing `< 1.0`: floating-point rounding can leave the diagonal slightly
# below 1.0 (the headline would then match itself), and genuinely identical
# headlines from different articles score 1.0 and must not be discarded.
# Working from `sim` instead of `simdf` also keeps this cell safe to re-run,
# because the string column added below never takes part in the comparison.
off_diagonal = sim.copy()
positions = list(range(off_diagonal.shape[0]))
off_diagonal[positions, positions] = float('-inf')  # a headline can never win against itself

# Column position of the best match for every row, mapped back to headline text.
# (With a single headline there is no other candidate, so it maps to itself.)
nearest = off_diagonal.argmax(axis=1)
simdf['most_similar'] = df['headline'].to_numpy()[nearest]

# 'headline' is the index of simdf, not a column, so selecting it with
# simdf[['headline', ...]] raises KeyError; move the index into a column instead.
simdf['most_similar'].rename_axis('headline').reset_index()