In [1]:
import pandas as pd
from app.models import Session, Headline, Article, Agency, Country
# Session instance that the query cell below uses via s.query(...).
# NOTE(review): the session is never closed in the visible cells — confirm whether that matters.
s = Session()

In [3]:
from datetime import datetime

# Only headlines first accessed after this fixed cutoff are kept.
# The date is hard-coded; it is not computed relative to the current day.
cutoff = datetime(2023, 2, 20)

# Select (title, agency name, agency bias), joining headline -> article -> agency,
# restricted to agencies whose country equals Country.us.
us_recent = (
    s.query(Headline.title, Agency.name, Agency._bias)
    .join(Headline.article)
    .join(Article.agency)
    .filter(
        Agency._country == Country.us.value,
        Headline.first_accessed > cutoff,
    )
)
headlines = us_recent.all()

# One row per returned tuple, with friendlier column names.
raw = pd.DataFrame(headlines, columns=['title', 'agency', 'bias'])
print(raw.shape)
raw.head()

In [9]:
# NOTE(review): remove_stop is imported but not used in the visible cells.
from app.pipelines import prepare, default_pipeline, remove_stop
# Show the pipeline steps exactly as imported, before this cell extends them.
print(default_pipeline)
# spaCy's English stop-word collection, aliased to `stopwords` for strip_stop.
from spacy.lang.en.stop_words import STOP_WORDS as stopwords
def strip_stop(text):
    """Return the items of `text` that are not in `stopwords`, order preserved.

    `text` is iterated element by element, so it is presumably a sequence of
    tokens rather than a raw string — TODO confirm against `prepare`.
    """
    kept = []
    for word in text:
        if word in stopwords:
            continue
        kept.append(word)
    return kept
# Build a new list instead of using `+=`: the in-place extend modified the
# object imported from app.pipelines, so every re-run of this cell stacked one
# more strip_stop step onto it. Rebinding the notebook-level name keeps later
# cells that pass `default_pipeline` working and makes this cell re-runnable.
default_pipeline = [*default_pipeline, strip_stop]

In [24]:
# Derive a new frame so `raw` is left untouched; the previous `df = raw` alias
# wrote the helper column onto `raw` itself.
df = raw.assign(lower=raw.title.str.lower())

has_biden = df.lower.str.contains('biden')
has_trump = df.lower.str.contains('trump')
biden = df[has_biden]
trump = df[has_trump]

# A single combined mask keeps each headline once and in its original order;
# concatenating the two subsets duplicated every headline naming both people.
reduced = df[has_biden | has_trump]
# Printed explicitly: a bare expression in the middle of a cell shows nothing.
print(len(reduced))

# Drop headlines whose lower-cased title contains the substring 'item'.
df = reduced[~reduced.lower.str.contains('item')]

In [10]:
# NOTE(review): this restarts from `raw`, so any filtering applied to `df` in an
# earlier cell is not carried forward — confirm that is intended.
# Copy first so the derived columns are not written onto `raw` through an alias.
df = raw.copy()
# Run each title through the preparation pipeline; `clean` holds the pipeline's
# output and `cleaned` the same items joined with single spaces.
df['clean'] = df.title.apply(prepare, pipeline=default_pipeline)
df['cleaned'] = df.clean.map(' '.join)
df.head()

In [11]:
# Row count of the working frame.
length = df.shape[0]
length

In [12]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
# grid search cv
from sklearn.model_selection import GridSearchCV

# Bag-of-n-grams counts (unigrams to trigrams) over the cleaned titles.
cv = CountVectorizer(min_df=5, max_df=0.7, ngram_range=(1, 3))
count_vector = cv.fit_transform(df.cleaned)

# Candidate topic counts are expressed as fractions of the number of rows.
portions = [0.3, 0.5, 0.75]
search_params = dict(
    n_components=[int(share * length) for share in portions],
    learning_decay=[.5, .7, .9],
)

In [13]:
def display_topics(model, feature_names, no_top_words):
    """Print the highest-weighted features of every topic in `model`.

    For each row of ``model.components_`` a ``Topic NN`` header is printed,
    followed by up to ``no_top_words`` lines of ``feature: share``, where
    share is the feature's weight as a percentage of that row's total.

    Args:
        model: object exposing ``components_``, iterated one row per topic.
        feature_names: sequence indexed by column position of ``components_``.
        no_top_words: maximum number of features to list per topic. Values
            larger than the number of features list every feature instead of
            raising IndexError; values below zero list nothing.

    Returns:
        None. Output goes to stdout.
    """
    limit = max(no_top_words, 0)
    for topic, word_vector in enumerate(model.components_):
        total = word_vector.sum()
        # Indices sorted by descending weight; slicing (rather than indexing
        # up to no_top_words) cannot run past the end of the row.
        top = word_vector.argsort()[::-1][:limit]
        print("Topic %02d" % topic)
        for idx in top:
            print("    %s: %.2f" % (feature_names[idx], word_vector[idx] / total * 100.0))
           

In [27]:
# Template estimator for the search; the grid in search_params supplies its own
# n_components values in place of the 5 given here.
lda = LatentDirichletAllocation(n_components=5, random_state=42)

# fit() returns the search object itself, so construction and fitting can chain.
search = GridSearchCV(estimator=lda, param_grid=search_params, n_jobs=-1).fit(count_vector)
best_lda = search.best_estimator_

topic_terms = cv.get_feature_names_out()
display_topics(best_lda, topic_terms, 10)

In [29]:
def show_articles(model, feature_names, no_top_words, articles=None):
    """Print each topic's top features interleaved with titles assigned to it.

    For each row of ``model.components_`` a ``Topic NN`` header is printed.
    Then, for rank 0, 1, ... up to ``no_top_words``, the rank-th highest
    weighted feature is printed as ``feature: share`` (share = percentage of
    the row total), followed by the rank-th title among the rows whose
    ``topic`` equals this topic, when such a row exists.

    Args:
        model: object exposing ``components_``, iterated one row per topic.
        feature_names: sequence indexed by column position of ``components_``.
        no_top_words: maximum number of feature/title pairs per topic.
        articles: DataFrame with ``title`` and ``topic`` columns. Defaults to
            the notebook-level ``df`` when omitted, as before.

    Returns:
        None. Output goes to stdout.
    """
    if articles is None:
        articles = df  # notebook-level frame, matching the previous behaviour

    for topic, word_vector in enumerate(model.components_):
        total = word_vector.sum()
        largest = word_vector.argsort()[::-1]
        # Filter once per topic instead of once per printed line.
        titles = articles.loc[articles['topic'] == topic, 'title']
        print("Topic %02d" % topic)
        # Bounded by the number of features so the index cannot overrun.
        for rank in range(min(no_top_words, len(largest))):
            idx = largest[rank]
            print("    %s: %.2f" % (feature_names[idx], word_vector[idx] / total * 100.0))
            # A topic may hold fewer rows than no_top_words; skip the title
            # line then rather than raising IndexError.
            if rank < len(titles):
                print("    ", titles.iloc[rank])

In [31]:
# show_articles reads the `topic` column of `df`, which is otherwise assigned
# only in a later cell, so on a fresh top-to-bottom run this call failed.
# Label each row here with its highest-scoring topic under the same model.
df['topic'] = best_lda.transform(count_vector).argmax(axis=1)
show_articles(best_lda, cv.get_feature_names_out(), 10)

In [16]:
# `lda` is only the template handed to GridSearchCV, which fits clones of it,
# so `lda` itself has no components_; display the fitted best estimator.
display_topics(best_lda, cv.get_feature_names_out(), 10)

In [17]:
# how to see topic assignments
# Transform with the fitted best estimator: `lda` itself is never fitted
# (GridSearchCV fits clones), and show_articles is called with best_lda, so the
# per-row topic scores must come from that same model.
topic_assignments = best_lda.transform(count_vector)

In [18]:
# Label each row with the column index of its largest value in topic_assignments.
df['topic'] = topic_assignments.argmax(axis=1)

In [19]:
# Preview the first rows of the frame, now including the `topic` column.
df.head()