In [1]:
from app.models import Session, Headline, Article, Agency
from app.queries import Queries
s = Session()

articles = Queries.get_todays_articles(s).all()
articles


[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/mas/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package punkt to /Users/mas/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/mas/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /Users/mas/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/mas/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


[Article(id=276879, agency='CNBC', url='https://www.cnbc.com/2024/03/22/nbc-paris-olympics-opening-ceremony-to-play-on-imax.html'),
 Article(id=276883, agency='Fox Business', url='https://www.foxbusiness.com/markets/truth-social-verge-stock-market-approval-potentially-netting-trump-billions'),
 Article(id=276885, agency='Caixin Global', url='https://www.caixinglobal.com/2024-03-22/former-yoozoo-executive-sentenced-to-death-for-fatally-poisoning-chairman-102178384.html'),
 Article(id=276886, agency='NDTV', url='https://www.ndtv.com/opinion/elections-2024-will-parties-walk-the-talk-on-womens-representation-5290579#pfrom=home-ndtv_topstories'),
 Article(id=276894, agency='CNN', url='https://www.cnn.com/2024/03/21/business/frozen-russian-assets-ukraine-war/index.html'),
 Article(id=276911, agency='CNBC', url='https://www.cnbc.com/2024/03/22/op-ed-following-a-routine-makes-it-easier-to-build-wealth-heres-how.html'),
 Article(id=276914, agency='Caixin Global', url='https://www.caixinglobal.c

In [2]:
import pandas as pd
df = pd.DataFrame([[a.most_recent_headline().title, a.last_accessed, a.agency.name] for a in articles], columns=['headline', 'last_accessed', 'agency'])
df.head()

Unnamed: 0,headline,last_accessed,agency
0,NBC's Paris Olympics opening ceremony will pla...,2024-03-23 16:43:13.661055,CNBC
1,Truth Social on verge of stock market approval...,2024-03-23 16:42:50.792606,Fox Business
2,Former Yoozoo Executive Sentenced to Death for...,2024-03-23 16:42:32.932079,Caixin Global
3,Opinion | Elections 2024: Will Parties Walk Th...,2024-03-23 16:43:15.668458,NDTV
4,EU leaders endorse plan to tap frozen Russian ...,2024-03-23 16:43:43.025706,CNN


In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer
from app.analysis.pipelines import Pipelines, prepare, trem, tnorm, STOPWORDS

pipeline = [
    str.lower,
    Pipelines.split_camelcase,
    tnorm.hyphenated_words,
    tnorm.quotation_marks,
    tnorm.unicode,
    tnorm.whitespace,
    trem.accents,
    trem.brackets,
    trem.punctuation,
    Pipelines.tokenize,
    Pipelines.decontract,
    lambda x: Pipelines.remove_stop(x, STOPWORDS),
    # lambda x: ' '.join(x)
]

tfidf = TfidfVectorizer(
    tokenizer=lambda x: prepare(x, pipeline),
    max_features=1000
)
dt = tfidf.fit_transform(df['headline'])



In [5]:
from sklearn.metrics.pairwise import cosine_similarity

In [6]:
sim = cosine_similarity(dt, dt)
sim

array([[1., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 1.]])

In [8]:
simdf = pd.DataFrame(sim, columns=df['headline'], index=df['headline'])
simdf

headline,NBC's Paris Olympics opening ceremony will play on IMAX screens,"Truth Social on verge of stock market approval, potentially netting Trump billions",Former Yoozoo Executive Sentenced to Death for Fatally Poisoning Chairman,Opinion | Elections 2024: Will Parties Walk The Talk On Women's Representation?,"EU leaders endorse plan to tap frozen Russian assets to arm Ukraine Mar 22, 2024",Op-ed: Establish routines that support financial goals. Doing so can help you build wealth,Bank of America’s Staff Move Piles Pressure on Hong Kong Office Market,opinion content. Ukraine and the mind games of nuclear deterrence,Why March Madness is all about Caitlin Clark,Report: Three Ways Letitia James Could Collect $454M from Donald Trump,...,"Travis Kelce and Patrick Mahomes plan to celebrate Chiefs wins at their Kansas City steakhouse next year, reveals the quarterback - as he opens up on going into business with Taylor Swift's boyfriend","Miami Airport is forced into full ground stop amid tornado threat, as Ultra dance music festival is shuttered after Florida was hit by thunder, winds and torrential rain","Larsa Pippen's extreme makeover explained by celebrity injector - and where the Real Housewives of Miami star, 49, has gone WRONG with fillers: 'She's too full!",CCP’s Military Growth ‘Largely Funded’ by US: Ret. Navy Capt.,"Girl, eight, who won lockdown cancer battle after being cheered by Kate during her own fight with the disease is 'deeply saddened' by princess's diagnosis - as her mother says youngster 'will reach out' to royal",The Evanescence of Amber and Lilac,"BA, Virgin Atlantic and Air India cabin crew reveal their secret travel hacks, from doing a jellyfish impression to make turbulence less scary to why you should leave a shoe in a hotel safe. What's YOUR favourite tip? Vote in our poll...",81-Year-Old Mother Speaks Out After Son Targeted by CCP’s Repression of Faith,"Bizarre moment Apple accountant photographed other passenger's boarding passes at airport, then used one to board Delta flight that was full, until he was arrested after hiding in toilet",Royal Family makes sure the show goes on by switching the lights off: Buckingham Palace and Windsor Castle go dark this evening to mark Earth Hour after Kate's cancer news
headline,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
NBC's Paris Olympics opening ceremony will play on IMAX screens,1.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.000000,...,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
"Truth Social on verge of stock market approval, potentially netting Trump billions",0.0,1.0,0.0,0.0,0.0,0.0,0.147375,0.000000,0.0,0.066471,...,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
Former Yoozoo Executive Sentenced to Death for Fatally Poisoning Chairman,0.0,0.0,1.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.000000,...,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
Opinion | Elections 2024: Will Parties Walk The Talk On Women's Representation?,0.0,0.0,0.0,1.0,0.0,0.0,0.000000,0.228026,0.0,0.000000,...,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
"EU leaders endorse plan to tap frozen Russian assets to arm Ukraine Mar 22, 2024",0.0,0.0,0.0,0.0,1.0,0.0,0.000000,0.135474,0.0,0.000000,...,0.126378,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
The Evanescence of Amber and Lilac,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.000000,...,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
"BA, Virgin Atlantic and Air India cabin crew reveal their secret travel hacks, from doing a jellyfish impression to make turbulence less scary to why you should leave a shoe in a hotel safe. What's YOUR favourite tip? Vote in our poll...",0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.000000,...,0.000000,0.000000,0.000000,0.0,0.000000,0.0,1.0,0.0,0.0,0.0
81-Year-Old Mother Speaks Out After Son Targeted by CCP’s Repression of Faith,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.000000,...,0.112039,0.000000,0.000000,0.0,0.192633,0.0,0.0,1.0,0.0,0.0
"Bizarre moment Apple accountant photographed other passenger's boarding passes at airport, then used one to board Delta flight that was full, until he was arrested after hiding in toilet",0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.000000,...,0.000000,0.203293,0.141485,0.0,0.000000,0.0,0.0,0.0,1.0,0.0


In [14]:
# for each headline, put the column name of the most similar headline < 1.0, excluding the identical headline, in a new column
simdf['most_similar'] = simdf.apply(lambda x: x[x < 1.0].idxmax(), axis=1)
simdf[['headline', 'most_similar']]

TypeError: '<' not supported between instances of 'str' and 'float'