In [1]:
import pandas as pd
from app.models import Session, Headline, Article, Agency, Country
s = Session()

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/mas/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package punkt to /Users/mas/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/mas/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /Users/mas/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [3]:
from datetime import datetime
# filter only from yesterday
headlines = s.query(Headline.title, Agency.name, Agency._bias)\
    .join(Headline.article).join(Article.agency)\
    .filter(Agency._country==Country.us.value,
            Headline.first_accessed > datetime(2023, 2, 20)).all()

raw = pd.DataFrame(headlines, columns=['title', 'agency', 'bias'])
print(raw.shape)
raw.head()

(36619, 3)


Unnamed: 0,title,agency,bias
0,Schweizer’s ‘Blood Money’ Rockets to #1 on Ama...,Breitbart,3
1,Who Can Win a Nobel Prize? by Yasmin Nair,Current Affairs,-2
2,"To Russia, With Love, From GOP",Crooks and Liars,-2
3,Al Capone's Miami property is on the market fo...,Business Insider,-1
4,Legal Analyst: It’s ‘Game Over’ if Fani Willis...,Breitbart,3


In [9]:
from app.pipelines import prepare, default_pipeline, remove_stop
print(default_pipeline)
# import stopwords
from spacy.lang.en.stop_words import STOP_WORDS as stopwords
def strip_stop(text):
    return [word for word in text if word not in stopwords]
default_pipeline += [strip_stop]

[<function split_camelcase at 0x2883c1c60>, <function hyphenated_words at 0x2883a28e0>, <function quotation_marks at 0x2883c0cc0>, <function unicode at 0x2883c0e00>, <function whitespace at 0x2883c0ea0>, <function accents at 0x2883c1080>, <function brackets at 0x2883c1120>, <function punctuation at 0x2883c1260>, <method 'lower' of 'str' objects>, <function tokenize at 0x2883c19e0>, <function decontract at 0x2883c1a80>, functools.partial(<function remove_stop at 0x173632480>, stopwords='abcdefghijklmnopqrstuvwxyz'), <function strip_stop at 0x28836f6a0>, <function strip_stop at 0x174d4bba0>, <function strip_stop at 0x28862fb00>, <function strip_stop at 0x107943060>]


In [24]:
df = raw
df['lower'] = df.title.str.lower()
biden = df[df.lower.str.contains('biden')]
trump = df[df.lower.str.contains('trump')]
reduced = pd.concat([biden, trump])
len(reduced)
df = reduced
# drop lines with word 'item'
df = df[~df.lower.str.contains('item')]

In [10]:
df = raw
df['clean'] = df.title.apply(prepare, pipeline=default_pipeline)
df['cleaned'] = df.clean.map(lambda x: ' '.join(x))
df.head()

Unnamed: 0,title,agency,bias,clean,cleaned
0,Schweizer’s ‘Blood Money’ Rockets to #1 on Ama...,Breitbart,3,"[schweizer, blood, money, rockets, to, on, ama...",schweizer blood money rockets to on amazon mov...
1,Who Can Win a Nobel Prize? by Yasmin Nair,Current Affairs,-2,"[who, can, win, nobel, prize, by, yasmin, nair]",who can win nobel prize by yasmin nair
2,"To Russia, With Love, From GOP",Crooks and Liars,-2,"[to, russia, with, love, from, gop]",to russia with love from gop
3,Al Capone's Miami property is on the market fo...,Business Insider,-1,"[al, capone, miami, property, is, on, the, mar...",al capone miami property is on the market for ...
4,Legal Analyst: It’s ‘Game Over’ if Fani Willis...,Breitbart,3,"[legal, analyst, it, game, over, if, fani, wil...",legal analyst it game over if fani willis lied...


In [11]:
length = len(df)
length

36619

In [12]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(min_df=5, max_df=0.7, ngram_range=(1,3))
count_vector = cv.fit_transform(df.cleaned)
from sklearn.decomposition import LatentDirichletAllocation
# grid search cv
from sklearn.model_selection import GridSearchCV
portions = [0.3, 0.5, 0.75]
search_params = {'n_components': [int(p*length) for p in portions], 'learning_decay': [.5, .7, .9]}

In [13]:
def display_topics(model, feature_names, no_top_words):
    for topic, word_vector in enumerate(model.components_):
        total = word_vector.sum()
        largest = word_vector.argsort()[::-1]
        print("Topic %02d" % topic)
        for i in range(no_top_words):
            print("    %s: %.2f" % (feature_names[largest[i]], word_vector[largest[i]]/total*100.0))
           

In [27]:
lda = LatentDirichletAllocation(n_components=5, random_state=42)
search = GridSearchCV(lda, param_grid=search_params, n_jobs=-1)
search.fit(count_vector)
best_lda = search.best_estimator_
display_topics(best_lda, cv.get_feature_names_out(), 10)

Topic 00
    trump: 4.60
    fraud: 1.84
    million: 1.54
    york: 1.32
    judge: 1.12
    donald: 1.11
    donald trump: 1.11
    pay: 1.06
    case: 0.89
    ruling: 0.80
Topic 01
    biden: 1.41
    trump: 1.25
    russian: 0.62
    fbi: 0.54
    election: 0.53
    president: 0.49
    hunter: 0.49
    hunter biden: 0.48
    donald trump: 0.47
    donald: 0.47
Topic 02
    trump: 4.87
    biden: 2.26
    navalny: 1.52
    haley: 1.02
    president: 0.98
    death: 0.98
    donald: 0.96
    donald trump: 0.95
    putin: 0.94
    russian: 0.70
Topic 03
    biden: 4.63
    trump: 3.73
    president: 1.26
    joe: 0.73
    donald: 0.62
    donald trump: 0.62
    joe biden: 0.62
    great: 0.59
    says: 0.55
    haley: 0.55
Topic 04
    biden: 1.60
    trump: 1.22
    president: 1.13
    haley: 0.62
    year: 0.60
    president biden: 0.60
    train: 0.57
    east: 0.54
    east palestine: 0.52
    palestine: 0.52


In [29]:
def show_articles(model, feature_names, no_top_words):
    for topic, word_vector in enumerate(model.components_):
        total = word_vector.sum()
        largest = word_vector.argsort()[::-1]
        print("Topic %02d" % topic)
        for i in range(no_top_words):
            print("    %s: %.2f" % (feature_names[largest[i]], word_vector[largest[i]]/total*100.0))
            print("    ", df[df.topic==topic].title.iloc[i])

In [31]:
show_articles(best_lda, cv.get_feature_names_out(), 10)

Topic 00
    trump: 4.60


AttributeError: 'DataFrame' object has no attribute 'topic'

In [16]:
display_topics(lda, cv.get_feature_names_out(), 10)

Topic 00
    biden: 3.24
    willis: 0.83
    fani: 0.81
    fani willis: 0.81
    donald: 0.80
    donald trump: 0.79
    new: 0.70
    house: 0.59
    day: 0.56
    not: 0.56
Topic 01
    ago: 1.14
    president: 1.11
    donald trump: 0.98
    donald: 0.98
    biden: 0.86
    million: 0.83
    former: 0.65
    former president: 0.62
    president donald: 0.52
    president donald trump: 0.52
Topic 02
    fraud: 1.99
    new: 1.50
    new york: 1.33
    york: 1.33
    million: 1.08
    ruling: 0.92
    judge: 0.87
    business: 0.86
    civil: 0.85
    pay: 0.85
Topic 03
    biden: 3.73
    president: 1.32
    haley: 0.79
    navalny: 0.79
    donald: 0.77
    donald trump: 0.77
    putin: 0.74
    death: 0.66
    joe: 0.64
    ago: 0.58
Topic 04
    biden: 1.35
    president: 1.05
    ago: 0.99
    russian: 0.91
    navalny: 0.63
    former: 0.54
    friday: 0.49
    derailment: 0.47
    ohio: 0.46
    wisconsin: 0.44


In [17]:
# how to see topic assignments
topic_assignments = lda.transform(count_vector)

In [18]:
df['topic'] = topic_assignments.argmax(axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['topic'] = topic_assignments.argmax(axis=1)


In [19]:
df.head()

Unnamed: 0,title,lower,clean,cleaned,topic
28,Biden Turns the Screws on MAGA-Republicans ove...,biden turns the screws on maga-republicans ove...,"[biden, turns, screws, maga, republicans, pres...",biden turns screws maga republicans press conf...,3
47,"Biden blames businesses for ‘shrinkflation,’ b...","biden blames businesses for ‘shrinkflation,’ b...","[biden, blames, businesses, shrinkflation, for...",biden blames businesses shrinkflation forgetti...,4
55,A stunning indictment could doom Republican's ...,a stunning indictment could doom republican's ...,"[stunning, indictment, could, doom, republican...",stunning indictment could doom republican effo...,2
72,"Putin: Biden Better for Russia Than Trump, 'Mo...","putin: biden better for russia than trump, 'mo...","[putin, biden, better, russia, trump, predicta...",putin biden better russia trump predictable,1
77,An FBI informant is charged with lying about J...,an fbi informant is charged with lying about j...,"[fbi, informant, charged, lying, joe, hunter, ...",fbi informant charged lying joe hunter biden t...,0
