In [1]:
import pandas as pd 
from sklearn.feature_extraction.text import CountVectorizer 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import NMF
from sklearn.metrics.pairwise import cosine_similarity
from nltk.corpus import stopwords

### Tried LSA and NMF with count vectorizer and tf-idf, decided on nmf with count vectorizer and three topics. Used that in clinton_graphs to make graphs 

In [2]:
df=pd.read_csv('clean_abstract_clinton')

In [4]:
df

Unnamed: 0,abstract,date
0,pres and mrs clinton hold millennium party at ...,2000-01-02 05:00:00+00:00
1,for much of his presidency bill clintons own a...,2000-01-03 05:00:00+00:00
2,as the first of two moving trucks turned onto ...,2000-01-05 05:00:00+00:00
3,it is a reality of modern campaigns that conte...,2000-01-05 05:00:00+00:00
4,clinton pushes peace talks president clinton ...,2000-01-05 05:00:00+00:00
...,...,...
16436,a new gender policy council will look differen...,2021-02-16 18:27:12+00:00
16437,nearly three decades after the white house est...,2021-02-16 18:27:23+00:00
16438,with a following of million and a divisive st...,2021-02-17 17:35:38+00:00
16439,rush limbaugh made the gop the party of misogyny,2021-02-20 11:55:04+00:00


#### Stopword list

In [5]:
stopword_list=stopwords.words('english')

In [6]:
stopword_list.extend(['could','many','even','also','make','whether','least','called','keep','said','says', 'say',
                      'hillary','clinton','clintons','rodham','mr','ms','mrs','would','us','united', 'states',
                      'way','people','year','years','new','two','three','first','day','white', 'house','one',
                     'business','international','national','bulk','receiving','orders','among','photo','photos',
                     'monday','tuesday','wednesday','thursday','friday','saturday','sunday','news',
                      'bookstores','bookstore','indicates','expanded','include','billion','million',
                      'next','since','last','court','time','may','group','week', 'weather', 'editorial', 'yorkregion', 'obituaries', 'editorials'])

In [7]:
stopword_list

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [8]:
df['abstract']=df['abstract'].astype(str)

In [9]:
df['date']=pd.to_datetime(df['date'])

#### Replace words with 'root'

In [10]:
df['abstract']=df['abstract'].apply(lambda x: x.replace('iraqis','iraq'))

In [11]:
df['abstract']=df['abstract'].apply(lambda x: x.replace('iraqi','iraq'))

In [12]:
df['abstract']=df['abstract'].apply(lambda x: x.replace('democrats','democrat'))

In [13]:
df['abstract']=df['abstract'].apply(lambda x: x.replace('democratic','democrat'))

In [14]:
df['abstract']=df['abstract'].apply(lambda x: x.replace('republicans','republican'))

In [15]:
df['abstract']=df['abstract'].apply(lambda x: x.replace('israeli','israel'))

In [16]:
df['abstract']=df['abstract'].apply(lambda x: x.replace('barack obama','obama'))

In [17]:
df['abstract']=df['abstract'].apply(lambda x: x.replace('obamas','obama'))

In [18]:
df['abstract']=df['abstract'].apply(lambda x: x.replace('george w bush','bush'))

In [19]:
df['abstract']=df['abstract'].apply(lambda x: x.replace('president bush','bush'))

In [20]:
df['abstract']=df['abstract'].apply(lambda x: x.replace('york city','yorkcity'))

In [21]:
df['abstract']=df['abstract'].apply(lambda x: x.replace('york state','yorkstate'))

In [22]:
df['abstract']=df['abstract'].apply(lambda x: x.replace('john mccain','mccain'))

In [23]:
df['abstract']=df['abstract'].apply(lambda x: x.replace('bernie sanders','sanders'))

In [24]:
df['abstract']=df['abstract'].apply(lambda x: x.replace(' sen ','senator'))

In [25]:
df['abstract']=df['abstract'].apply(lambda x: x.replace(' pres ','president'))

In [26]:
df['abstract']=df['abstract'].apply(lambda x: x.replace('john kerry','kerry'))

In [27]:
df['abstract']=df['abstract'].apply(lambda x: x.replace('john podesta','podesta'))

In [28]:
df['abstract']=df['abstract'].apply(lambda x: x.replace('candidates','candidate'))

In [29]:
df['abstract']=df['abstract'].apply(lambda x: x.replace('partys','party'))

In [30]:
df['abstract']=df['abstract'].apply(lambda x: x.replace('voters','vote'))

In [31]:
df['abstract']=df['abstract'].apply(lambda x: x.replace('voter','vote'))

In [32]:
df['abstract']=df['abstract'].apply(lambda x: x.replace(' gov ','governor'))

In [33]:
df['abstract']=df['abstract'].apply(lambda x: x.replace('weeks','week'))

In [34]:
df['abstract']=df['abstract'].apply(lambda x: x.replace('rudolph giuliani','giuliani'))

In [35]:
df['abstract']=df['abstract'].apply(lambda x: x.replace('rudolph','giuliani'))

In [36]:
df['abstract']=df['abstract'].apply(lambda x: x.replace('rick lazio','lazio'))

In [37]:
df['abstract']=df['abstract'].apply(lambda x: x.replace(' rep ','representative'))

In [38]:
df['abstract']=df['abstract'].apply(lambda x: x.replace('representatives','representative'))

In [39]:
df['abstract']=df['abstract'].apply(lambda x: x.replace('running','run'))

In [40]:
df['abstract']=df['abstract'].apply(lambda x: x.replace('run','race'))

#### Count Vectorize

In [41]:
vectorizer = CountVectorizer(stop_words=stopword_list, ngram_range=(1, 1))

In [42]:
doc_word = vectorizer.fit_transform(df['abstract'].values.astype('U'))

In [43]:
doc_word.shape

(16441, 26332)

#### LSA

In [112]:
lsa = TruncatedSVD(20)
doc_topic = lsa.fit_transform(doc_word)
lsa.explained_variance_ratio_

array([0.02205605, 0.0143874 , 0.01123751, 0.0094156 , 0.00809186,
       0.00751981, 0.00691936, 0.00655352, 0.00633107, 0.00598747,
       0.00541211, 0.00522674, 0.00479066, 0.00454533, 0.00443562,
       0.00427491, 0.00411352, 0.00401005, 0.00388321, 0.00362212])

In [113]:
def display_topics(model, feature_names, no_top_words, topic_names=None):
    for ix, topic in enumerate(model.components_):
        if not topic_names or not topic_names[ix]:
            print("\nTopic ", ix)
        else:
            print("\nTopic: '",topic_names[ix],"'")
        print(", ".join([feature_names[i]
                        for i in topic.argsort()[:-no_top_words - 1:-1]]))

In [114]:
display_topics(lsa, vectorizer.get_feature_names(), 20)


Topic  0
democrat, senator, republican, campaign, senate, race, bush, president, candidate, iraq, state, york, party, obama, presidential, vote, political, american, former, officials

Topic  1
iraq, bush, american, government, federal, officials, administration, security, war, state, nations, world, oil, president, plan, israel, nuclear, foreign, public, killed

Topic  2
democrat, republican, senate, candidate, party, york, giuliani, lazio, race, mayor, money, pataki, committee, vote, yorkcity, federal, political, city, bush, yorkstate

Topic  3
democrat, iraq, war, american, party, bush, nomination, oil, need, security, convention, troops, world, foreign, face, administration, must, government, interests, policy

Topic  4
senator, iraq, campaign, war, york, american, political, oil, need, government, world, interests, face, presidential, bush, values, must, page, troops, institutions

Topic  5
campaign, president, iraq, presidential, obama, war, candidate, political, american, need,

#### NMF - model generates topics

In [130]:
nmf_model = NMF(3, max_iter=300)
doc_topic = nmf_model.fit_transform(doc_word)

In [131]:
def display_topics(model, feature_names, no_top_words, topic_names=None):
    for ix, topic in enumerate(model.components_):
        if not topic_names or not topic_names[ix]:
            print("\nTopic ", ix)
        else:
            print("\nTopic: '",topic_names[ix],"'")
        print(", ".join([feature_names[i]
                        for i in topic.argsort()[:-no_top_words - 1:-1]]))

In [132]:
display_topics(nmf_model, vectorizer.get_feature_names(), 35)


Topic  0
democrat, republican, candidate, campaign, party, race, senate, presidential, york, vote, political, nomination, giuliani, primary, money, lazio, state, mayor, committee, election, convention, pataki, debate, john, governor, support, pirro, general, attorney, reelection, yorkstate, fundraising, former, leaders, bill

Topic  1
iraq, bush, american, state, federal, officials, president, government, administration, security, war, former, nations, plan, york, political, senate, republican, public, israel, world, vote, oil, secretary, health, attacks, killed, foreign, city, iran, nuclear, congress, military, north, chief

Topic  2
senator, obama, president, campaign, former, presidential, washington, race, mccain, bill, primary, secretary, supporters, state, nomination, sanders, vote, made, night, pennsylvania, fight, support, victory, might, carolina, speech, hampshire, rival, advisers, iowa, vice, primaries, presidentelect, candidacy, convention


In [134]:
doc_topic = pd.DataFrame(doc_topic.round(3),
             columns = ["component_1","component_2","component_3"])
doc_topic

Unnamed: 0,component_1,component_2,component_3
0,0.027,0.009,0.000
1,0.028,0.052,0.024
2,0.000,0.047,0.027
3,0.132,0.148,0.036
4,0.000,0.063,0.030
...,...,...,...
16436,0.000,0.034,0.029
16437,0.003,0.053,0.013
16438,0.000,0.031,0.000
16439,0.022,0.004,0.000


### Look at abstracts to decide on the right topic, concatenate with original abstracts and look at the abstracts with the highest value to decide on toipcs

In [135]:
df_doc_topic=pd.concat([df,doc_topic], axis=1)

#### Abstracts from Topic 1

In [136]:
df_doc_topic['component_1'].sort_values(ascending=False)

1180     0.765
13       0.646
1587     0.580
1858     0.560
532      0.557
         ...  
6549     0.000
6551     0.000
6553     0.000
6554     0.000
16440    0.000
Name: component_1, Length: 16441, dtype: float64

In [137]:
df_doc_topic.iloc[1180]['abstract']

'senate democrat raised more unlimited unregulated soft money than republican did in  election cycle democrat used new fundraising technique called joint fundraising committee to attract soft money donations to support party and its goals rather than candidate such committees enable candidate to raise soft money indirectly  democrat senate candidate who used joint committees raised more than  million in soft money vastly exceeding  million raised by seven republican senate candidate who used committees hillary rodham clinton used joint committee to raise as much as  million in soft money figures are in new report by common cause campaign finance watchdog group figures put democrat into awkward position as senate prepares to take up campaign finance legislation democrat have long supported overhaul that would ban soft money now some are leery of tinkering with success photo'

In [138]:
df_doc_topic.iloc[13]['abstract']

'prominent democrat worry that they will lose white house because of republican huge war chest fear that republican national committee with surplus of unregulated soft money would sponsor relentless blizzard of television commercials condemning democrat programs and democrat presidential nominee money worries are stirring resentment towardpresidentclinton some feel clinton is neglecting his party distracted by his efforts to collect money for his library his wifes senate campaign and his legal defense fund some democrat also worry about their house and senate candidatesenatorrobert g torricelli chairman of democrat senatorial campaign committee claims republican could have three or fourtoone advantage possibly amassing as much as  million in soft money republican officials call that figure overblown'

In [139]:
df_doc_topic.iloc[1587]['abstract']

'growing sense of realignment in voting trends in new yorkstate over last four years is forcing gubernatorial candidate and their advisers to reconsider approach to  campaign incumbent republicangovernorgeorge e pataki and potential democrat opponents h carl mccall and andrew cuomo are seizing on shift to highlight changes they say benefit their party democrat note sharp rise in party registration in recent years and strong democrat showings recently in upstate counties and on long island traditionally republican strongholds pataki aides note his success in picking up support of labor unions hispanic leaders and other democrat figures while contest is still in early stages it is democrat who seem particularly concerned as full picture of scrambling of political allegiances come into focus change is notable compared with six months ago when many party stalwarts voiced confidence over prospects of defeating pataki cheered by backtoback victories in us senate contests photos'

In [140]:
df_doc_topic.iloc[1858]['abstract']

'sen hillary rodham clinton is democrat party single best fundraising draw the only one other than her husband who can pack a room she is acting as source of advice to candidate and partisan cheerleader in what democrat see as increasingly tough election she has campaigned or held fundraisers on behalf of more than  democrat candidate for house and senate who are in close races as well as for three washingtonbased democrat campaign committees she has donated  million from her own political action committee out of  million she has raised to candidate of her choice across nation her washington home has become conveyor belt of fundraising dinners and receptions that democrat candidate clamor to climb aboard photo'

In [141]:
df_doc_topic.iloc[532]['abstract']

'new york primary races generally draw light turnout producing decisive victories for several incumbent members of congress and most incumbents in state legislaturerepresentativemichael p forbes republican turned democrat says his primary race against regina seltzer for his house seat from long island is too close to call interview hillary rodham clinton easily defeats dr mark s mcmahon for democrat nomination for united states senate votes withpresidentclinton in chappaquarepresentativeeliot l engel survives challenge from statesenatorlarry b seabrook in bronxrepresentativemajor r owens withstands challenge from councilwoman una clarke in brooklynrepresentativeanthony d weiner wins resounding victory over councilman noach dear in brooklynrepresentativeedolphus towns wins renomination over barry ford jr steve j israel narrowly wins democrat nomination forrepresentativelazios house seat from long island will face joan b johnson who wins republican primary most incumbents also survive in

#### Abstracts from Topic 2

In [142]:
df_doc_topic['component_2'].sort_values(ascending=False)

3922     3.384
3104     2.073
2880     1.960
3204     1.884
2758     1.761
         ...  
11754    0.000
11761    0.000
11764    0.000
4503     0.000
8220     0.000
Name: component_2, Length: 16441, dtype: float64

In [104]:
df_doc_topic.iloc[3922]['abstract']

'following is the transcript of senator hillary rodham clintons remarks at the council on foreign relations as provided by cq transcriptions incclinton you all know the litany of threats and challenges the metastasizing threat of terrorists networks recruiting troops setting up training camps amassing weapons a regime in north korea openly testing missiles and nuclear weapons an activist expansionist iran pursuing its own nuclear arsenal a resurgent taliban in afghanistanand an emerging civil war in iraq russia and china pursuing their own interests often at odds with such global imperatives as nuclear nonproliferation and ending genocide in darfur oil has never been more important in funding unstable anti american governments and yet we have failed to make the investments necessary to move more rapidly to alternative fuels a policy that is now as important to our national security and our mideast strategy as to our economy and environment the lost opportunities of the years since sept

In [105]:
df_doc_topic.iloc[3104]['abstract']

'international  a    israel gives ultimatum on palestinian vote  prime minister ariel sharon of israel vowed to withhold israel cooperation from palestinian legislative elections scheduled for january if candidate from the militant group hamas take part  a    katrina stirs up iraq concerns  with hurricane katrina already costing the federal government tens of billions of dollars more than  in  americans are very or somewhat concerned that the  billion being spent each month on the war in iraq is draining away money that could be used in the united states according to the latest new york timescbs news poll  a    the rash of car bombings in baghdad this week has underscored how the loosely knit and elusive networks of abu musab alzarqawi and other extremists can still recruit discontented iraq and foreign fighters to launch wellcoordinated attacks  a    insurgents staged a series of suicide bombings and ambushes that left at least  people dead across iraq including an attack on a crowd o

In [143]:
df_doc_topic.iloc[2880]['abstract']

'international   a    reeducation campspose problem for china  a vast penal system in china that is separate from the judicial system and is a relic of the mao era is presenting a dilemma for a modernday communist party that remains obsessed with security and political control   a    sunni refuses cabinet post  one of four sunni arabs picked over the weekend to join iraqs new shiitecontrolled cabinet abruptly rejected the job saying that he learned of his selection from a television news report and adding that he felt it would further a quota system for sunnis that would only make sectarian problems worse   a    insurgents in iraq are drawing on dozens of stockpiled bombrigged cars and groups of foreign fighters smuggled into the country in recent week to carry out most of the suicide attacks that have killed about  people in the last  days senior american officers say   a    bush meets with putin  bush met with president vladimir v putin of russia in what was expected to be a tense en

In [144]:
df_doc_topic.iloc[3204]['abstract']

'international   a    uskyrgyzstan fuel deal under scrutiny by fbi  two businesses from which the united states bought jet fuel after the american invasion of afghanistan in  are under scrutiny by prosecutors in kyrgyzstan and fbi agents who are looking into whether the kyrgyz president at the time pocketed hundreds of millions of dollars partly from pentagon fuel contracts before he was ousted this year   a    a possible motive in jordan  jordanian investigators said the iraq woman they said had taken part in the deadly amman hotel terror attacks had volunteered to become a suicide bomber because three of her brothers were killed during operations in iraq   a    marines killed in iraq offensive  two marines were killed and at least nine wounded in ambushes and fierce street battles as thousands of american and iraq troops stormed ubaydi a riverside town near the syrian border that american commanders say has become a haven for foreign jihadists   a    prime minister tony blair said it

#### Abstracts from Topic 3

In [145]:
df_doc_topic['component_3'].sort_values(ascending=False)

3913    0.499
6568    0.475
7613    0.466
8050    0.451
3248    0.445
        ...  
9117    0.000
9118    0.000
9120    0.000
9121    0.000
0       0.000
Name: component_3, Length: 16441, dtype: float64

In [146]:
df_doc_topic.iloc[3913]['abstract']

'i might    the news  will he or wont he his answer has always been a seemingly decided wont but last sunday obama said he would consider considering a race for the presidency in     behind the news  speculation has tracked the freshman senator from illinois since his speech at the democrat national convention in  mr obama has routinely deflected such speculation saying that he would finish his term before acting on any further ambitions but on the nbc program meet the press last week he said i dont want to be coy about this given the responses that ive been getting over the last several months i have thought about the possibility but i have not thought about it with the seriousness and depth that i think is required  the shift puts mr obama squarely in the path of senator hillary rodham clinton of new york    i didnt    the news  the iraq prime minister nuri kamal almaliki said on wednesday that he had not agreed to a timetable for stabilizing iraq    behind the news  a day earlier th

In [147]:
df_doc_topic.iloc[6568]['abstract']

'dallas  senator hillary rodham clinton escalated her attack on senator obama qualifications to be president on saturday arguing that both she and the presumptive republican nominee senator mccain could offer vote a lifetime of experience while obama could only hold out his  speech opposing the iraq war'

In [148]:
df_doc_topic.iloc[7613]['abstract']

'bristol virginia  senator hillary rodham clinton on thursday disavowed a campaign by some of her supporters to press senator obama into selecting her as the democrat party vice presidential candidate clinton said the supporters were acting on their own and that the choice of a race mate was senator obama and his alone'

In [149]:
df_doc_topic.iloc[8050]['abstract']

'st paul  senator obama will increasingly lean on prominent democrat women to undercut governor sarah palin and senator mccain dispatching senator hillary rodham clinton to florida on monday and creating a rapidresponse team to deploy female surrogates to battleground states obama advisers said on thursday'

In [124]:
df_doc_topic.iloc[3913]['abstract']

'i might    the news  will he or wont he his answer has always been a seemingly decided wont but last sunday obama said he would consider considering a race for the presidency in     behind the news  speculation has tracked the freshman senator from illinois since his speech at the democrat national convention in  mr obama has routinely deflected such speculation saying that he would finish his term before acting on any further ambitions but on the nbc program meet the press last week he said i dont want to be coy about this given the responses that ive been getting over the last several months i have thought about the possibility but i have not thought about it with the seriousness and depth that i think is required  the shift puts mr obama squarely in the path of senator hillary rodham clinton of new york    i didnt    the news  the iraq prime minister nuri kamal almaliki said on wednesday that he had not agreed to a timetable for stabilizing iraq    behind the news  a day earlier th

In [125]:
df_doc_topic['component_4'].sort_values(ascending=False)

3239     1.276
2908     1.075
3238     1.025
319      0.748
91       0.721
         ...  
5451     0.000
10560    0.000
10559    0.000
5455     0.000
8220     0.000
Name: component_4, Length: 16441, dtype: float64

In [126]:
df_doc_topic.iloc[3239]['abstract']

'jeanine f pirro the westchester county district attorney has been race for the united states senate since august but her candidacy has been dogged by setbacks    may  ms pirro announces she will not seek reelection as district attorney a move that raises speculation about whether she will race for governor or for the senate seat held by hillary rodham clinton    aug  prodded by new york republican leaders and encouraged by some white house officials ms pirro announces her senate bid    aug  officially opening her campaign with news conferences throughout the state ms pirro stumbles pausing awkwardly on live television she also fails to answer specific policy questions and refuses to say whether she will take campaign money from her husband who spent time in prison on a tax conviction    oct  the pirro campaign trying to raise  million to challenge reports that it has collected slightly more than  since ms pirro announced her candidacy    nov  state senator joseph l braceo the most pow

In [127]:
df_doc_topic.iloc[2908]['abstract']

'international   a    syria ending cooperation with us its envoy says  syria has halted military and intelligence cooperation with the united states its ambassador to washington said in an interview in a sign of growing strains between the two countries over the insurgency in iraq   a    karzai talks with bush  afghanistans president hamid karzai raised with bush the treatment of afghan prisoners held by the united states but mr   bush did not commit to when he would be willing to give the kabul government control over prisoners taken by the american military   a    car bombs kill  in iraq  insurgents carried out three car bomb attacks against iraq shiites killing at least  people and wounding  in what appeared to be the latest in a wave of violence intended to exploit the sectarian divisions that have damaged the country   a    election order in iran disputed  ayatollah ali khamenei irans supreme religious leader ordered a watchdog group dominated by his hardline allies to review its 

In [128]:
df_doc_topic.iloc[3238]['abstract']

'new york republican party leaders reverse course urging jeanine f pirro to quit her bid to unseatsenatorhillary rodham clinton and race for state attorney general pirro immediately rejects suggestion shift by party leaders six months after they recruited pirro to race for senate signals mounting disarray of state party as it faces possible loss of governors mansion and its majority in state senate in  if as expected pirro does pull out of senate race party will be left without highprofile challenger to clinton as she seeks commanding reelection victory decision to urge pirro to withdraw from senate race comes after public bickering over her mistakeridden campaign party leaders hope she can save her career and reputation by race for state attorney general other republican leaders warn that they would consider other possible candidate for attorney general if she does not switch races by end of year advisers to clinton are delighted that race so far has centered on pirros foibles and gaf

In [129]:
df_doc_topic.iloc[319]['abstract']

'new york timescbs news poll finds despite change of opponent steady diet of television advertising and vigorous schedule of trips around new yorkstate hillary rodham clintons campaign for us senate appears largely stuck in place finds clinton locked in close race with republican repr lazio opponent so unknown that most new yorkers are unable to offer any opinion of him finds his replacement for mayor giuliani as republican candidate has scrambled dynamics of race clinton is swamping lazio in new yorkcity as notable number of giuliani supporters have switched reverting to historical party voting habits conversely lazio who is more conventional statewide republican candidate than giuliani seems to be blunting clintons usually competitive position in upstate new york which usually favors republican by large numbers findings of poll detailed charts'

#### TF-IDF

In [1073]:
cv_tfidf = TfidfVectorizer(stop_words=stopword_list)

In [1074]:
X_tfidf = cv_tfidf.fit_transform(df['abstract'].values.astype('U'))

#### LSA with TF-IDF

In [1076]:
lsa_2 = TruncatedSVD(2)

In [1077]:
doc_topic = lsa_2.fit_transform(X_tfidf)

In [1078]:
lsa_2.explained_variance_ratio_

array([0.00272199, 0.00706746])

In [1079]:
display_topics(lsa_2, cv_tfidf.get_feature_names(), 10)


Topic  0
senator, democrat, obama, campaign, presidential, candidate, president, republican, state, york

Topic  1
editor, marthas, readers, times, magazine, timess, reporters, political, cheney, autism


#### NMF with TF-IDF

In [1080]:
nmf_model_2 = NMF(2)

In [1081]:
doc_topic = nmf_model_2.fit_transform(X_tfidf)

In [1082]:
display_topics(nmf_model_2, cv_tfidf.get_feature_names(), 10)


Topic  0
senator, democrat, obama, campaign, presidential, candidate, president, republican, state, york

Topic  1
editor, marthas, political, times, readers, magazine, reporters, timess, public, cheney
