In [1]:
import pandas as pd 
from sklearn.feature_extraction.text import CountVectorizer 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import NMF
from sklearn.metrics.pairwise import cosine_similarity
from nltk.corpus import stopwords

### Tried LSA and NMF with count vectorizer and tf-idf, decided on nmf with count vectorizer and four topics. Used that in bush_graphs to make graphs 

In [2]:
df=pd.read_csv('clean_abstract_bush.csv')

In [3]:
df

Unnamed: 0,abstract,date
0,peter marks analysis finds that commercials ru...,2000-01-01 05:00:00+00:00
1,the new york times the internet and political ...,2000-01-01 05:00:00+00:00
2,letter by mike fremont of rivers unlimited on ...,2000-01-02 05:00:00+00:00
3,presidential primary season is the most compet...,2000-01-02 05:00:00+00:00
4,editorial on various campaign proposals for us...,2000-01-02 05:00:00+00:00
...,...,...
40816,republicans have criticized her tweets but dem...,2021-02-25 20:56:39+00:00
40817,the disputes are reminiscent of the fight surr...,2021-02-26 00:12:38+00:00
40818,most presidents leave the white house and adop...,2021-02-27 17:00:07+00:00
40819,democracy an unassuming policy journal with an...,2021-02-28 22:02:50+00:00


#### Stopword list

In [4]:
stopword_list=stopwords.words('english')

In [5]:
stopword_list.extend(['could','many','even','also','make','whether','least','called','keep','said','says', 'say',
                      'george','bush',' w ','bushs','mr','pres','would','president','sen','us','united', 'states','american','americans',
                       'white','house','new','york','way','people','year','years','sec','state','photo'])

In [6]:
stopword_list

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

#### Replace words with 'root'

In [7]:
df['abstract']=df['abstract'].astype(str)

In [8]:
df['abstract']=df['abstract'].apply(lambda x: x.replace('iraqis','iraq'))

In [9]:
df['abstract']=df['abstract'].apply(lambda x: x.replace('iraqi','iraq'))

In [10]:
df['abstract']=df['abstract'].apply(lambda x: x.replace('democrats','democrat'))

In [11]:
df['abstract']=df['abstract'].apply(lambda x: x.replace('republicans','republican'))

In [12]:
df['abstract']=df['abstract'].apply(lambda x: x.replace('israeli','israel'))

In [13]:
df['abstract']=df['abstract'].apply(lambda x: x.replace('al gore', 'gore'))

#### Count Vectorize

In [14]:
vectorizer = CountVectorizer(stop_words=stopword_list, ngram_range=(1, 1))

In [15]:
doc_word = vectorizer.fit_transform(df['abstract'].values.astype('U'))

In [16]:
doc_word.shape

(40821, 43546)

#### LSA

In [153]:
lsa = TruncatedSVD(5)
doc_topic = lsa.fit_transform(doc_word)
lsa.explained_variance_ratio_

array([0.02193266, 0.01377563, 0.00964389, 0.00701274, 0.00644298])

In [155]:
def display_topics(model, feature_names, no_top_words, topic_names=None):
    for ix, topic in enumerate(model.components_):
        if not topic_names or not topic_names[ix]:
            print("\nTopic ", ix)
        else:
            print("\nTopic: '",topic_names[ix],"'")
        print(", ".join([feature_names[i]
                        for i in topic.argsort()[:-no_top_words - 1:-1]]))

In [156]:
display_topics(lsa, vectorizer.get_feature_names(), 15)


Topic  0
iraq, administration, war, republican, government, security, photo, officials, military, democrat, political, congress, campaign, national, one

Topic  1
iraq, war, forces, troops, hussein, saddam, baghdad, weapons, military, government, enemy, iraqs, world, know, im

Topic  2
republican, iraq, democrat, presidential, campaign, democratic, john, senate, party, gov, kerry, mccain, election, gore, war

Topic  3
tax, billion, cuts, budget, spending, congress, cut, federal, percent, bill, plan, taxes, kerry, health, money

Topic  4
israel, palestinian, tax, prime, sharon, min, peace, arafat, plan, palestinians, security, billion, cuts, israels, ariel


#### NMF - model generates topics

In [17]:
nmf_model = NMF(4, max_iter=300)
doc_topic = nmf_model.fit_transform(doc_word)

In [18]:
def display_topics(model, feature_names, no_top_words, topic_names=None):
    for ix, topic in enumerate(model.components_):
        if not topic_names or not topic_names[ix]:
            print("\nTopic ", ix)
        else:
            print("\nTopic: '",topic_names[ix],"'")
        print(", ".join([feature_names[i]
                        for i in topic.argsort()[:-no_top_words - 1:-1]]))

In [19]:
display_topics(nmf_model, vectorizer.get_feature_names(), 25)


Topic  0
iraq, war, government, security, military, troops, forces, hussein, saddam, world, weapons, baghdad, country, help, time, leaders, enemy, support, al, one, terrorists, attacks, know, strategy, iraqs

Topic  1
republican, democrat, presidential, campaign, john, senate, democratic, gov, party, election, gore, bill, political, court, vote, vice, mccain, voters, florida, national, candidates, votes, kerry, one, committee

Topic  2
administration, officials, israel, nuclear, north, military, palestinian, government, nations, security, korea, national, federal, washington, weapons, intelligence, attacks, policy, two, international, program, plan, prime, administrations, first

Topic  3
tax, billion, cuts, budget, spending, percent, congress, plan, federal, cut, bill, security, kerry, social, money, taxes, health, senate, million, next, economic, programs, deficit, government, increase


#### TF-IDF

In [160]:
cv_tfidf = TfidfVectorizer(stop_words=stopword_list)

In [161]:
X_tfidf = cv_tfidf.fit_transform(df['abstract'].values.astype('U'))

#### LSA with TF-IDF

In [162]:
lsa_2 = TruncatedSVD(5)

In [163]:
doc_topic = lsa_2.fit_transform(X_tfidf)

In [164]:
lsa_2.explained_variance_ratio_

array([0.01652293, 0.01034366, 0.00198412, 0.00342278, 0.00263017])

In [165]:
display_topics(lsa_2, cv_tfidf.get_feature_names(), 5)


Topic  0
nan, kimpaik, letter, hayworth, enrons

Topic  1
editor, times, iraq, administration, republican

Topic  2
iraq, republican, administration, war, democrat

Topic  3
iraq, war, military, administration, weapons

Topic  4
iraq, presidential, republican, war, campaign


#### NMF with TF-IDF

In [169]:
nmf_model_2 = NMF(5, max_iter=500)


In [170]:
doc_topic = nmf_model_2.fit_transform(X_tfidf)



In [171]:
display_topics(nmf_model_2, cv_tfidf.get_feature_names(), 5)


Topic  0
nan, kimpaik, hayworth, letter, enrons

Topic  1
editor, times, readers, magazine, timess

Topic  2
republican, presidential, campaign, john, gov

Topic  3
iraq, war, military, administration, weapons

Topic  4
tax, administration, congress, billion, federal
