### Challange: Use the senator speeches in the folder 105-extracted-date and use doc2vec to find whose senator speech is closest to senator Biden. Use sen105kh_fix.csv and/or Wikipedia to validate your findings (i.e., understand if the most similar speeches are senators from the same state and/party). Describe your findings. Compare with the outcome you got/will get using cosine similarity.

In [249]:
import pandas as pd
import os
import glob
from nltk.corpus import stopwords
import string
from nltk.tokenize import word_tokenize
import gensim
from gensim.models.doc2vec import TaggedDocument
import warnings
from collections import Counter
from gensim.models import Word2Vec

from tqdm.notebook import tqdm

warnings.filterwarnings("ignore")
tqdm.pandas()

In [197]:
pwd

'c:\\Users\\99450\\Desktop\\ML-for-NLP-main\\Inputs\\105-extracted-date'

In [198]:
os.chdir('c:\\Users\\99450\\Desktop\\ML-for-NLP-main\\Inputs\\105-extracted-date')

In [199]:
inputdir = "c:\\Users\\99450\\Desktop\\ML-for-NLP-main\\Inputs\\"

In [200]:
def format_congress_text(text_list):
    df = pd.DataFrame(text_list, columns=["text_raw"])

    df["text_raw"] = df["text_raw"].str.split("\n</TEXT>\n</DOC>\n\n<DOC>\n<DOCNO>")

    df = df.explode("text_raw")

    df["text_raw"] = df["text_raw"].str.replace("<DOC>\n<DOCNO>", "")

    df["meta"] = df["text_raw"].str.split("</DOCNO>\n<TEXT>\n").str[0]
    df["congress"] = df["meta"].str[:3]
    df["speaker"] = df["meta"].str.split("-").str[1]
    df["state"] = df["meta"].str.split("-").str[2]
    df["date"] = df["meta"].str.split("-").str[4]

    df["text_raw"] = (
        df["text_raw"]
        .str.split("</DOCNO>\n<TEXT>\n")
        .str[1]
        .str.strip()
        .str.split(".")
        .str[2:]
        .str.join(sep="")
        .str.strip()
    )

    df["text_raw"] = df["text_raw"] + " "

    # now join this back to congress / speaker level

    df = df.groupby(["congress", "speaker"])["text_raw"].sum().reset_index()

    # drop if a congressman did not speek
    df = df.loc[lambda x: x["text_raw"].apply(type) == str]
    return df

In [201]:
droplist = open(inputdir + "droplist.txt", encoding="utf-8", newline="\n").read()
droplist = [i.replace('"', "") for i in droplist.split("\n")]
stop_words = stopwords.words("english")
stopwords_final = droplist + stop_words

In [202]:
text_105 = [
    open(inputdir + "105-extracted-date/" + i, encoding="latin").read()
    for i in os.listdir(inputdir + "105-extracted-date/")
]

df = format_congress_text(text_105)

In [203]:
df

Unnamed: 0,congress,speaker,text_raw
0,105,abraham,"Mr President, during debate on final passage o..."
1,105,akaka,"Mr President, I am pleased that the Senate pas..."
2,105,allard,"Mr President, I rise to make a few remarks con..."
3,105,ashcroft,"Mr President, the Senate is not in order I wou..."
4,105,baucus,I understand that the House has sent the Senat...
...,...,...,...
94,105,thurmond,"Mr President, as the Senate considers HR 2263,..."
95,105,torricelli,"Mr President I thank Senator Snowe, Senator Mc..."
96,105,warner,"During the past two weeks, the Senate Armed Se..."
97,105,wellstone,"Mr President, today, I would like to call atte..."


In [204]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [205]:
def preprocessing_text(text):
    words = word_tokenize(text.lower())
    tokens = [word for word in words if word not in string.punctuation]
    tokens = [token for token in tokens if token not in stopwords_final]
    return tokens

In [206]:
df['text']=0

In [207]:
for text in df['text_raw']:
    a=df[df['text_raw']==text].index.values
    df['text'][a[0]]=preprocessing_text(text)

In [208]:
df

Unnamed: 0,congress,speaker,text_raw,text
0,105,abraham,"Mr President, during debate on final passage o...","[president, debate, final, passage, omnibus, a..."
1,105,akaka,"Mr President, I am pleased that the Senate pas...","[president, pleased, senate, passed, hr, 4110,..."
2,105,allard,"Mr President, I rise to make a few remarks con...","[president, rise, remarks, concerning, auto, c..."
3,105,ashcroft,"Mr President, the Senate is not in order I wou...","[president, senate, hear, senator, senate, hap..."
4,105,baucus,I understand that the House has sent the Senat...,"[understand, house, sent, senate, substitute, ..."
...,...,...,...,...
94,105,thurmond,"Mr President, as the Senate considers HR 2263,...","[president, senate, considers, hr, 2263, bill,..."
95,105,torricelli,"Mr President I thank Senator Snowe, Senator Mc...","[president, thank, senator, senator, majority,..."
96,105,warner,"During the past two weeks, the Senate Armed Se...","[past, weeks, senate, armed, services, committ..."
97,105,wellstone,"Mr President, today, I would like to call atte...","[president, call, attention, issue, addressed,..."


In [209]:
#for the next challange
biden = df[df['speaker']=='biden']

In [210]:
#get the preporcesssed text in list for gensim analysis
text_list=[]
for i in df['text']:
    text_list.append(i)

In [211]:
#get index of biden's speech
a=df[df['speaker']=='biden'].index.values
biden_index=a[0]

In [212]:
model = gensim.models.doc2vec.Doc2Vec(vector_size=50, min_count=2, epochs=40)

In [213]:
text=[]
for i in text_list:
    text.append(TaggedDocument(words=i, tags=[text_list.index(i)]))
    

In [214]:
model.build_vocab(text)

In [215]:
model.train(text, total_examples=model.corpus_count, epochs=model.epochs)

In [216]:
ranks=[]
second_ranks=[]
inferred_vector = model.infer_vector(text[biden_index].words)
sims = model.dv.most_similar([inferred_vector], topn=len(model.dv))
rank = [docid for docid, sim in sims].index(biden_index)
ranks.append(rank)

second_ranks.append(sims[1])

In [217]:
sims

[(6, 0.9980720281600952),
 (74, 0.5982862114906311),
 (68, 0.5097454786300659),
 (17, 0.4732208549976349),
 (15, 0.4730199873447418),
 (71, 0.4651886522769928),
 (96, 0.45944440364837646),
 (90, 0.4591034948825836),
 (58, 0.45534995198249817),
 (42, 0.43597540259361267),
 (33, 0.4340652525424957),
 (27, 0.4004562795162201),
 (20, 0.4002732038497925),
 (11, 0.3913393020629883),
 (34, 0.3821899890899658),
 (48, 0.37609317898750305),
 (45, 0.37391313910484314),
 (80, 0.36804211139678955),
 (12, 0.3647206127643585),
 (41, 0.3628920614719391),
 (93, 0.3581749200820923),
 (31, 0.34977468848228455),
 (61, 0.3490815758705139),
 (35, 0.3472757935523987),
 (1, 0.34714967012405396),
 (9, 0.34450188279151917),
 (38, 0.3418550491333008),
 (0, 0.33925896883010864),
 (66, 0.32945555448532104),
 (83, 0.31640690565109253),
 (81, 0.3118187487125397),
 (47, 0.30910566449165344),
 (4, 0.3058752417564392),
 (94, 0.3039426803588867),
 (13, 0.29458087682724),
 (26, 0.28929343819618225),
 (64, 0.2867645621299

In [218]:
df['speaker'][sims[1][0]]

'moynihan'

In [219]:
print(u'%s %s' %('Most similar speech belongs to Senator', df['speaker'][sims[1][0]].capitalize()))

Most similar speech belongs to Senator Moynihan


When we did the cosine similarity on the first part of the course, we got that the most similar speech is Senator Smith's speech. However, we get the different result when we tried word2vec. Also one should notice that the result is not reproducable. The reason is the randomization that word2vec uses on the core of the process, everytim we set up the model the similarities change. But if you run the code multiple times, you can see that the it is usually the same three-four senator that have the most similar speeches to Biden. 


### Challenge: What are the top ten words mentioned by Biden in the 105 congress (after stopwords removal)? For each most frequent word, find the 10 most simlar words generated using word2vec. Find the most frequent bigrams in the text. Explore some bigrams and figure out if the tokens appear in the list of most similar words.

In [240]:
biden=df.loc[df["speaker"] == "biden", "text_raw"].item()

In [242]:
biden_dict=Counter(biden)

In [247]:
most_used= sorted(biden_dict.items(), key=lambda kv: kv[1], reverse=True)[1:11]

In [257]:
most_used

[('president', 1517),
 ("'s", 1345),
 ('senator', 1003),
 ('nato', 941),
 ('united', 653),
 ('time', 635),
 ("n't", 591),
 ('bill', 570),
 ('amendment', 480),
 ('senate', 476)]

In [260]:
most_similar={}
model = Word2Vec(sentences=most_used, vector_size=100, window=5, min_count=1, workers=4)


In [261]:
for i in most_used:
    vector = model.wv[i]
    most_similar[i[0]]=model.wv.most_similar(i[0], topn=10)
 

{'president': [('nato', 0.31900984048843384),
  (591, 0.16207124292850494),
  (1003, 0.11074147373437881),
  ('time', 0.09731774032115936),
  (941, 0.09673886746168137),
  (653, 0.08635810762643814),
  (476, 0.0024360644165426493),
  ('bill', 0.0007033413276076317),
  (480, 0.00048271374544128776),
  ("'s", -0.0017842642264440656)],
 "'s": [(591, 0.17826788127422333),
  ('senator', 0.13149337470531464),
  (941, 0.07497557997703552),
  ('senate', 0.06797593086957932),
  (1003, 0.04157734662294388),
  ('time', 0.04130810499191284),
  ('amendment', 0.0411943756043911),
  (653, 0.012979976832866669),
  ("n't", 0.0065984539687633514),
  ('president', -0.001784270629286766)],
 'senator': [('time', 0.1669393926858902),
  (1345, 0.1388825923204422),
  ("'s", 0.13149335980415344),
  ('amendment', 0.07171675562858582),
  (1517, 0.06410215049982071),
  (941, 0.06059248372912407),
  (570, 0.04768548533320427),
  ("n't", 0.04410775750875473),
  (653, 0.020004643127322197),
  (1003, 0.01923763565719

I stored the most similar words to a dictionary, where the keys are the most used words and items are the list of 10 most similar words

In [264]:
type(biden)

list

In [278]:
bigrams = zip(biden, biden[1:])
counts = Counter(bigrams)
most_common=counts.most_common
most_common(10)

[(('united', 'nations'), 161),
 (('foreign', 'policy'), 160),
 (('chemical', 'weapons'), 155),
 (('nato', 'enlargement'), 111),
 (('foreign', 'relations'), 110),
 (('yield', 'floor'), 96),
 (('president', "'s"), 92),
 (('relations', 'committee'), 87),
 (('nuclear', 'weapons'), 86),
 (('north', 'carolina'), 80)]

Some of the tokens that appear in the most common bigrams appear in most similar words as well. For example, `Nato`, `president`, `united` etc. But they are not that common.