In [81]:
from app.models import Session, Headline, Article, Agency
from app.queries import Queries
# Open a session and load today's articles through the project's query helper.
# NOTE(review): the session is never closed in the visible cells — presumably it has to
# stay open for later attribute access on the loaded articles; confirm.
s = Session()

articles = Queries.get_todays_articles(s).all()
# NOTE(review): a bare `articles` renders the whole list; a slice may be enough here.
articles


In [82]:
import pandas as pd
# One row per article: title of its most recent headline, last access time, agency name.
df = pd.DataFrame(
    [
        [
            article.most_recent_headline().title,
            article.last_accessed,
            article.agency.name,
        ]
        for article in articles
    ],
    columns=['headline', 'last_accessed', 'agency'],
)
df.head()

In [83]:
# Reference headline: the row at position 2 among the headlines that contain 'Musk'
# (raises IndexError when fewer than three headlines match).
repr_sample = df.loc[df['headline'].str.contains('Musk'), 'headline'].iloc[2]
repr_sample

In [84]:
from sklearn.feature_extraction.text import TfidfVectorizer
from app.pipelines import Pipelines, prepare, trem, tnorm, STOPWORDS

def _drop_stopwords(value):
    """Pass `value` through Pipelines.remove_stop using the shared STOPWORDS."""
    return Pipelines.remove_stop(value, STOPWORDS)


def _tokenize_headline(text):
    """Tokenizer hook for the vectorizer: run `prepare` with the global `pipeline`."""
    return prepare(text, pipeline)


# Steps handed to `prepare`, in the order listed.
# NOTE(review): `str.lower` runs before `Pipelines.split_camelcase`; if that helper keys
# on upper/lower transitions it would have nothing left to split — confirm the order.
pipeline = [
    str.lower,
    Pipelines.split_camelcase,
    tnorm.hyphenated_words,
    tnorm.quotation_marks,
    tnorm.unicode,
    tnorm.whitespace,
    trem.accents,
    trem.brackets,
    trem.punctuation,
    Pipelines.tokenize,
    Pipelines.decontract,
    _drop_stopwords,
    # lambda x: ' '.join(x)
]

# NOTE(review): TfidfVectorizer defaults to lowercase=True, which lower-cases the text
# before the tokenizer is called — confirm that is intended alongside the pipeline.
tfidf = TfidfVectorizer(tokenizer=_tokenize_headline, max_features=1000)
# Fit on every headline; rows of `dt` follow the row order `df` has at this point.
dt = tfidf.fit_transform(df['headline'])

In [85]:
# Vectorise the single reference headline with the already-fitted TF-IDF model.
rep = tfidf.transform([repr_sample])

In [86]:
from sklearn.metrics.pairwise import cosine_similarity
# Display the cosine similarity of every row of `dt` to the reference vector `rep`.
cosine_similarity(dt, rep)

In [87]:
# Score each headline against the reference vector `rep`.
# The headlines are vectorised from the frame's *current* row order rather than reusing
# `dt`: this cell re-sorts `df` below, so on a second run the rows of `dt` (built before
# the sort) would no longer line up with the rows of `df` and the scores would be
# attached to the wrong headlines.
# `cosine_similarity` returns an (n_rows, 1) array; `ravel()` makes it one score per row.
df['similarity'] = cosine_similarity(tfidf.transform(df['headline']), rep).ravel()
# Most similar headlines first.
df = df.sort_values('similarity', ascending=False)

In [98]:
# Number of headlines whose similarity to the reference exceeds 0.5.
int((df['similarity'] > 0.5).sum())

In [99]:
# Histogram of similarity scores restricted to headlines that contain 'Musk'.
df.loc[df['headline'].str.contains('Musk'), 'similarity'].hist()

In [105]:
# Similarity threshold under evaluation.
cutoff = 0.4

# Build each boolean mask once instead of re-filtering the frame for every statistic.
above_cutoff = df['similarity'] > cutoff
mentions_musk = df['headline'].str.contains('Musk')

print("Full len:", int(above_cutoff.sum()))
contains = int((above_cutoff & mentions_musk).sum())
doesnt = int((above_cutoff & ~mentions_musk).sum())
print("Contains:", contains)
print("Doesn't:", doesnt)

# Share of above-cutoff headlines that mention 'Musk'. When nothing clears the cutoff
# the share is undefined, so report NaN instead of raising ZeroDivisionError.
total = contains + doesnt
ratio = contains / total if total else float('nan')
print("Ratio:", ratio)

# Lowest-scoring rows that still clear the cutoff. Kept as the cell's last expression
# so the notebook actually renders it (mid-cell expressions are discarded).
df[above_cutoff].sort_values('similarity', ascending=True).head(10)


In [51]:
import numpy as np
# Display the feature names (vocabulary terms) of the fitted TF-IDF vectorizer.
tfidf.get_feature_names_out()

In [116]:
from sklearn.feature_extraction.text import CountVectorizer

def xkeyscore(representations, x, max_features=20):
    """Score each document in `x` by how often it uses the key n-grams of `representations`.

    A CountVectorizer (1- to 3-grams, tokenised with `prepare` and the global
    `pipeline`) is fitted on `representations` only, keeping at most
    `max_features` n-grams. Each document in `x` is then scored with the total
    number of occurrences of those n-grams.

    Parameters
    ----------
    representations : iterable of str
        Example documents that define the vocabulary of key n-grams.
    x : iterable of str
        Documents to score.
    max_features : int or None, default 20
        Vocabulary cap forwarded to CountVectorizer.

    Returns
    -------
    numpy.ndarray of shape (len(x),)
        One integer score per document, in the iteration order of `x`.
    """
    cv = CountVectorizer(
        # Named `text` so the lambda no longer shadows the `x` parameter.
        tokenizer=lambda text: prepare(text, pipeline),
        max_features=max_features,
        ngram_range=(1, 3),
        lowercase=False
    ).fit(representations)
    # Rows are documents of `x`, columns are the fitted key n-grams.
    counts = cv.transform(x).toarray()
    # The vocabulary is already capped at `max_features`, so every column is a key
    # n-gram: summing across columns (axis=1) gives one score per document. Counts
    # are non-negative, so no `> 0` filter is needed.
    return counts.sum(axis=1)

# def calculate_xkeyscore(df):
#     n_features = 1000
#     df['prepared'] = df['title'].apply(lambda x: prepare(x, pipeline=pipeline))
#     dense = CountVectorizer(max_features=n_features, ngram_range=(1, 3), lowercase=False).fit_transform(
#         df['prepared']
#     ).todense()
#     top_indices = np.argsort(np.sum(dense, axis=0).A1)[-n_features:]
#     df['score'] = [sum(doc[0, i] for i in top_indices if doc[0, i] > 0) for doc in dense]
#     df = df.sort_values(by='score', ascending=False)
#     df.drop('prepared', axis=1, inplace=True)
#     return df

In [117]:
# First five headlines containing 'Musk', taken in the frame's current row order.
samples = df.loc[df['headline'].str.contains('Musk'), 'headline'].head(5)
samples

In [118]:
# Store the result of xkeyscore (vocabulary from `samples`, applied to all headlines)
# in a new column, then sort by it in descending order and preview the top ten rows.
df['xkeyscore'] = xkeyscore(samples, df['headline'])
df = df.sort_values('xkeyscore', ascending=False)
df.head(10)

In [115]:
# Render the frame as it currently stands.
# NOTE(review): this displays the whole frame; a `.head()` preview may be enough.
df