In [110]:
import pandas as pd 
from time import time  # To time our operations
from collections import defaultdict  # For word frequency
import logging  # Used to surface library log messages (original note: to monitor gensim)
# Configure the root logger to emit records at INFO level and above,
# formatted as "LEVEL - HH:MM:SS: message".
logging.basicConfig(format="%(levelname)s - %(asctime)s: %(message)s", datefmt= '%H:%M:%S', level=logging.INFO)
from sklearn.feature_extraction.text import CountVectorizer 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import NMF
from sklearn.metrics.pairwise import cosine_similarity
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer 
import nltk
# One-time setup: uncomment the next line to download the WordNet corpus
# that WordNetLemmatizer relies on.
#nltk.download('wordnet')

### Read the cleaned Clinton and Bush abstracts, combine them, then clean further

In [111]:
# Load the previously cleaned Clinton abstracts.
# NOTE(review): this path has no ".csv" extension, unlike the Bush file — confirm
# that this is the intended filename on disk.
df_clinton = pd.read_csv(filepath_or_buffer='clean_abstract_clinton')

In [112]:
# Load the previously cleaned Bush abstracts.
df_bush = pd.read_csv(filepath_or_buffer='clean_abstract_bush.csv')

In [113]:
# Render the Clinton frame to sanity-check the load.
df_clinton

Unnamed: 0,abstract,date
0,pres and mrs clinton hold millennium party at ...,2000-01-02 05:00:00+00:00
1,for much of his presidency bill clintons own a...,2000-01-03 05:00:00+00:00
2,as the first of two moving trucks turned onto ...,2000-01-05 05:00:00+00:00
3,it is a reality of modern campaigns that conte...,2000-01-05 05:00:00+00:00
4,clinton pushes peace talks president clinton ...,2000-01-05 05:00:00+00:00
...,...,...
16436,a new gender policy council will look differen...,2021-02-16 18:27:12+00:00
16437,nearly three decades after the white house est...,2021-02-16 18:27:23+00:00
16438,with a following of million and a divisive st...,2021-02-17 17:35:38+00:00
16439,rush limbaugh made the gop the party of misogyny,2021-02-20 11:55:04+00:00


In [114]:
# Render the Bush frame to sanity-check the load.
df_bush

Unnamed: 0,abstract,date
0,peter marks analysis finds that commercials ru...,2000-01-01 05:00:00+00:00
1,the new york times the internet and political ...,2000-01-01 05:00:00+00:00
2,letter by mike fremont of rivers unlimited on ...,2000-01-02 05:00:00+00:00
3,presidential primary season is the most compet...,2000-01-02 05:00:00+00:00
4,editorial on various campaign proposals for us...,2000-01-02 05:00:00+00:00
...,...,...
40816,republicans have criticized her tweets but dem...,2021-02-25 20:56:39+00:00
40817,the disputes are reminiscent of the fight surr...,2021-02-26 00:12:38+00:00
40818,most presidents leave the white house and adop...,2021-02-27 17:00:07+00:00
40819,democracy an unassuming policy journal with an...,2021-02-28 22:02:50+00:00


In [115]:
# Stack the Bush rows followed by the Clinton rows into one frame and renumber
# the index from 0. DataFrame.append was deprecated in pandas 1.4 and removed
# in pandas 2.0; pd.concat is the supported equivalent and yields the same frame.
df = pd.concat([df_bush, df_clinton], ignore_index=True)

In [116]:
# Render the combined frame (Bush rows first, then Clinton rows).
df

Unnamed: 0,abstract,date
0,peter marks analysis finds that commercials ru...,2000-01-01 05:00:00+00:00
1,the new york times the internet and political ...,2000-01-01 05:00:00+00:00
2,letter by mike fremont of rivers unlimited on ...,2000-01-02 05:00:00+00:00
3,presidential primary season is the most compet...,2000-01-02 05:00:00+00:00
4,editorial on various campaign proposals for us...,2000-01-02 05:00:00+00:00
...,...,...
57257,a new gender policy council will look differen...,2021-02-16 18:27:12+00:00
57258,nearly three decades after the white house est...,2021-02-16 18:27:23+00:00
57259,with a following of million and a divisive st...,2021-02-17 17:35:38+00:00
57260,rush limbaugh made the gop the party of misogyny,2021-02-20 11:55:04+00:00


### Convert abstracts to strings (needed after reading from CSV)

In [117]:
# Make every abstract a Python str so the later .split() calls work.
# A missing abstract is loaded from CSV as NaN; casting NaN straight to str
# yields the literal text "nan", which would then survive stopword removal and
# be treated as a real word. Replace missing values with an empty string first.
df['abstract'] = df['abstract'].fillna('').astype(str)

### Create the stopword list and extend it with corpus-specific words

In [118]:
# Start from NLTK's English stopword list; custom entries are added afterwards.
stopword_list = stopwords.words('english')

In [119]:
# Append corpus-specific stopwords in place (same words, same order).
# NOTE(review): the abstracts look like they had apostrophes stripped upstream
# (e.g. "clintons", "im"), so NLTK's contracted entries such as "don't" may never
# match — consider adding their apostrophe-free forms. 'who' and 'whom' already
# appear in the NLTK list, so those two entries are duplicates.
stopword_list += [
    'photo',
    'monday', 'tuesday', 'wednesday', 'thursday', 'friday', 'saturday', 'sunday',
    'rodham',
    'im', 'theyre', 'youre', 'shes',
    'who',
    'wasnt',
    'whom',
]

In [120]:
# Display the stopword list; the custom additions are appended at the end.
stopword_list

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

### Remove stopwords from each abstract

In [121]:
# Drop every whitespace-separated token that is a stopword.
# Membership is tested against a set built once up front: checking a Python list
# costs a linear scan per token, which adds up across every word of every
# abstract. The filtered text is identical to filtering against the list.
stopword_set = set(stopword_list)
df['abstract'] = df['abstract'].apply(
    lambda text: ' '.join(word for word in text.split() if word not in stopword_set)
)

### Lemmatize each word in each abstract 

In [122]:
# Single lemmatizer instance, reused for every abstract in the next step.
lemmatizer = WordNetLemmatizer()

In [123]:
# Replace each whitespace-separated token with its lemma and re-join with single
# spaces. lemmatize() is called without a part-of-speech argument, so the
# library's default applies to every token.
df['abstract'] = df['abstract'].apply(
    lambda text: " ".join(map(lemmatizer.lemmatize, text.split()))
)

### Remove words with length 1 (middle initials)

In [124]:
# Keep only tokens of two or more characters; single-character leftovers
# (such as middle initials) are discarded.
df['abstract'] = df['abstract'].apply(
    lambda text: ' '.join(token for token in text.split() if len(token) >= 2)
)

In [125]:
# Persist the processed frame: column names are written, the row index is not.
df.to_csv(
    'lemma_no_stop_abstract.csv',
    header=True,
    index=False,
)