<a href="https://colab.research.google.com/github/mion158/Language-and-Chatbot/blob/main/US-presidents-speeches.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [19]:
import os
import gensim
import spacy

from nltk.tokenize import PunktSentenceTokenizer
from collections import Counter

In [20]:
# collect the names of the .txt speech files in the working directory, sorted
files = sorted(name for name in os.listdir() if name.endswith('.txt'))

# read each speech file
def read_file(file_name):
  """Return the entire contents of a UTF-8 text file as one string.

  Args:
    file_name: path of the file to read.

  Returns:
    The decoded text of the file.
  """
  # Open read-only: 'r+' also requests write access, which makes the open
  # fail on read-only files even though nothing is ever written here.
  with open(file_name, 'r', encoding='utf-8') as file:
    return file.read()

# load the raw text of every speech, in the same order as `files`
speeches = list(map(read_file, files))

In [21]:
# preprocess each speech
def process_speeches(speeches):
  """Split each speech into sentences and each sentence into normalized words.

  Args:
    speeches: iterable of raw speech strings.

  Returns:
    A list with one entry per speech. Each entry is a list of sentences, and
    each sentence is a list of lowercase word tokens with commas, colons,
    hyphens and surrounding '.', '?' and '!' removed.
  """
  # One translate pass replaces the chained .replace() calls: commas and
  # colons are dropped, hyphens become spaces so hyphenated words split apart.
  punctuation_table = str.maketrans({',': None, '-': ' ', ':': None})
  # The tokenizer is built without training text and does not depend on the
  # speech, so one instance is shared by all speeches.
  sentence_tokenizer = PunktSentenceTokenizer()

  word_tokenized_speeches = []
  for speech in speeches:
    word_tokenized_sentences = []
    for sentence in sentence_tokenizer.tokenize(speech):
      # Strip the sentence-ending marks as a set: stripping them one after
      # another is order-dependent and leaves e.g. "what?!" as "what?".
      stripped_words = (
          word.lower().strip('.?!')
          for word in sentence.translate(punctuation_table).split()
      )
      # Tokens made only of punctuation (such as "...") end up empty; drop
      # them so '' never becomes a vocabulary entry.
      word_tokenized_sentences.append([word for word in stripped_words if word])
    word_tokenized_speeches.append(word_tokenized_sentences)
  return word_tokenized_speeches

# tokenize every loaded speech: one list of sentences per speech, each sentence a list of lowercase words
tokenized_speeches = process_speeches(speeches)

In [22]:
# merge speeches
def merge_speeches(speeches):
  """Flatten a list of tokenized speeches into one list of sentences.

  Sentences keep their original order: all sentences of the first speech,
  then all sentences of the second, and so on.
  """
  return [sentence for speech in speeches for sentence in speech]

# flatten the per-speech sentence lists into one corpus-wide list of sentences
all_sentences = merge_speeches(tokenized_speeches)
print(type(all_sentences))  # sanity check on the container type

<class 'list'>


In [25]:
# view most frequently used words
def most_frequent_words(list_of_sentences):
  """Count every word across the given sentences.

  Returns a list of (word, count) tuples ordered from most to least frequent.
  """
  tally = Counter()
  for sentence in list_of_sentences:
    for word in sentence:
      tally[word] += 1
  return tally.most_common()

# (word, count) pairs for the whole corpus, most frequent first
frequent_words = most_frequent_words(all_sentences)
print(frequent_words)



In [26]:
# create gensim model of all speeches (sg=1 selects skip-gram training)
# NOTE(review): `size` is the gensim 3.x keyword; gensim 4.x renamed it to
# `vector_size` -- confirm against the installed gensim version.
all_presidents_embeddings = gensim.models.Word2Vec(all_sentences, size=96, window=5, min_count=1, workers=2, sg=1)
# view words similar to freedom
# Query through the model's KeyedVectors (`.wv`): calling most_similar on the
# model object itself is deprecated in gensim 3.x and removed in 4.0.
similar_to_freedom = all_presidents_embeddings.wv.most_similar('freedom', topn=20)
similar_to_freedom

  """


[('independence', 0.9841150045394897),
 ('human', 0.9771022796630859),
 ('business', 0.9765868186950684),
 ('increase', 0.9757341146469116),
 ('parts', 0.9752916097640991),
 ('order', 0.9751555919647217),
 ('race', 0.9740780591964722),
 ('individual', 0.9739705324172974),
 ('department', 0.9738984107971191),
 ('defense', 0.973170280456543),
 ('institutions', 0.9729503393173218),
 ('civil', 0.972895085811615),
 ('republican', 0.9728653430938721),
 ('forms', 0.9722763299942017),
 ('citizen', 0.9722399711608887),
 ('international', 0.9721020460128784),
 ('wealth', 0.9720688462257385),
 ('power', 0.9716906547546387),
 ('sovereignty', 0.9713634848594666),
 ('political', 0.9711347818374634)]

In [28]:
# get President Roosevelt sentences
def get_president_sentences(president):
  """Return the tokenized sentences of every speech file matching one president.

  Args:
    president: text looked for, case-insensitively, inside the speech file
      names (for example 'franklin-d-roosevelt').

  Returns:
    A flat list of sentences (each a list of word tokens) taken from all
    matching '.txt' files in the current working directory, in sorted
    file-name order.
  """
  needle = president.lower()
  # Only '.txt' files are speeches (the same filter the corpus-wide file list
  # uses); without it any other file whose name contains the fragment would
  # be read and tokenized as if it were a speech.
  matching_files = sorted(
      name for name in os.listdir()
      if name.endswith('.txt') and needle in name.lower()
  )
  president_speeches = [read_file(name) for name in matching_files]
  return merge_speeches(process_speeches(president_speeches))

# the argument is matched as a substring of the file names; presumably the full
# hyphenated name is used so Theodore Roosevelt's speeches are not picked up too
roosevelt_sentences = get_president_sentences('franklin-d-roosevelt')
roosevelt_sentences

[['i',
  'am',
  'certain',
  'that',
  'my',
  'fellow',
  'americans',
  'expect',
  'that',
  'on',
  'my',
  'induction',
  'into',
  'the',
  'presidency',
  'i',
  'will',
  'address',
  'them',
  'with',
  'a',
  'candor',
  'and',
  'a',
  'decision',
  'which',
  'the',
  'present',
  'situation',
  'of',
  'our',
  'nation',
  'impels'],
 ['this',
  'is',
  'preeminently',
  'the',
  'time',
  'to',
  'speak',
  'the',
  'truth',
  'the',
  'whole',
  'truth',
  'frankly',
  'and',
  'boldly'],
 ['nor',
  'need',
  'we',
  'shrink',
  'from',
  'honestly',
  'facing',
  'conditions',
  'in',
  'our',
  'country',
  'today'],
 ['this',
  'great',
  'nation',
  'will',
  'endure',
  'as',
  'it',
  'has',
  'endured',
  'will',
  'revive',
  'and',
  'will',
  'prosper'],
 ['so',
  'first',
  'of',
  'all',
  'let',
  'me',
  'assert',
  'my',
  'firm',
  'belief',
  'that',
  'the',
  'only',
  'thing',
  'we',
  'have',
  'to',
  'fear',
  'is',
  'fear',
  'itself',
  'namel

In [30]:
# view Franklin D. Roosevelt's most frequently used words as (word, count) pairs, most frequent first
roosevelt_most_frequent_words = most_frequent_words(roosevelt_sentences)
roosevelt_most_frequent_words

[('the', 375),
 ('of', 321),
 ('and', 179),
 ('to', 158),
 ('we', 131),
 ('a', 121),
 ('in', 119),
 ('that', 102),
 ('our', 90),
 ('it', 71),
 ('is', 67),
 ('have', 56),
 ('for', 47),
 ('be', 41),
 ('i', 40),
 ('this', 40),
 ('not', 40),
 ('by', 38),
 ('will', 35),
 ('as', 33),
 ('all', 33),
 ('are', 32),
 ('which', 29),
 ('with', 28),
 ('on', 27),
 ('has', 27),
 ('they', 27),
 ('but', 27),
 ('nation', 26),
 ('people', 25),
 ('their', 25),
 ('government', 23),
 ('can', 23),
 ('us', 20),
 ('shall', 20),
 ('democracy', 20),
 ('from', 19),
 ('an', 19),
 ('men', 18),
 ('its', 18),
 ('must', 17),
 ('who', 17),
 ('been', 16),
 ('know', 16),
 ('life', 15),
 ('spirit', 15),
 ('no', 15),
 ('because', 15),
 ('there', 15),
 ('if', 15),
 ('so', 14),
 ('at', 14),
 ('more', 13),
 ('those', 13),
 ('upon', 13),
 ('national', 12),
 ('years', 12),
 ('may', 12),
 ('new', 12),
 ('world', 12),
 ('my', 11),
 ('every', 11),
 ('these', 11),
 ('through', 11),
 ('states', 11),
 ('way', 11),
 ('good', 11),
 ('or

In [32]:
# train skip-gram (sg=1) embeddings on Franklin D. Roosevelt's sentences only
# NOTE(review): `size` is the gensim 3.x keyword; gensim 4.x renamed it to
# `vector_size` -- confirm against the installed gensim version.
roosevelt_embeddings = gensim.models.Word2Vec(roosevelt_sentences, size=96, window=5, min_count=1, workers=2, sg=1)



In [34]:
# words closest to "freedom" in the Roosevelt-only embedding space
# Query through the model's KeyedVectors (`.wv`): calling most_similar on the
# model object itself is deprecated in gensim 3.x and removed in 4.0.
similar_to_freedom = roosevelt_embeddings.wv.most_similar("freedom", topn=20)
similar_to_freedom

  """Entry point for launching an IPython kernel.


[('on', 0.998755931854248),
 ('who', 0.9987292289733887),
 ('must', 0.9987282156944275),
 ('no', 0.9987032413482666),
 ('and', 0.998690128326416),
 ('or', 0.9986817836761475),
 ('an', 0.9986810088157654),
 ('all', 0.998677134513855),
 ('world', 0.9986758232116699),
 ('be', 0.9986690878868103),
 ('the', 0.9986632466316223),
 ('as', 0.9986631274223328),
 ('has', 0.998652458190918),
 ('government', 0.9986177682876587),
 ('in', 0.9986064434051514),
 ('but', 0.9985953569412231),
 ('shall', 0.9985894560813904),
 ('its', 0.9985809326171875),
 ('a', 0.9985726475715637),
 ('with', 0.9985629320144653)]

In [35]:
# get sentences of multiple presidents
def get_presidents_sentences(presidents):
  """Return the combined tokenized sentences for several presidents.

  Args:
    presidents: iterable of name fragments; each one is looked up exactly as
      get_president_sentences does for a single president.

  Returns:
    One flat list of sentences, grouped in the order the presidents are given.
    An empty iterable yields an empty list.
  """
  all_sentences = []
  for president in presidents:
    # Reuse the single-president helper instead of repeating its file lookup
    # and preprocessing here, so both entry points always agree.
    all_sentences.extend(get_president_sentences(president))
  return all_sentences

# gather the sentences of the four Mount Rushmore presidents; each string is
# matched as a substring of the speech file names
rushmore_presidents_sentences = get_presidents_sentences(["washington","jefferson","lincoln","theodore-roosevelt"])  

In [36]:
# view the Mount Rushmore presidents' most frequently used words as (word, count) pairs, most frequent first
rushmore_most_frequent_words = most_frequent_words(rushmore_presidents_sentences)
rushmore_most_frequent_words

[('the', 650),
 ('of', 418),
 ('and', 341),
 ('to', 333),
 ('in', 168),
 ('that', 144),
 ('be', 130),
 ('a', 124),
 ('it', 111),
 ('is', 102),
 ('by', 93),
 ('which', 91),
 ('our', 84),
 ('i', 84),
 ('with', 82),
 ('all', 81),
 ('not', 80),
 ('as', 73),
 ('have', 71),
 ('we', 71),
 ('this', 58),
 ('will', 57),
 ('for', 55),
 ('their', 52),
 ('or', 51),
 ('but', 50),
 ('from', 49),
 ('them', 49),
 ('no', 49),
 ('are', 46),
 ('on', 44),
 ('they', 42),
 ('can', 39),
 ('government', 37),
 ('shall', 36),
 ('its', 36),
 ('any', 35),
 ('who', 34),
 ('may', 34),
 ('so', 32),
 ('constitution', 32),
 ('you', 32),
 ('us', 31),
 ('if', 31),
 ('people', 28),
 ('should', 27),
 ('union', 27),
 ('my', 25),
 ('has', 25),
 ('citizens', 24),
 ('these', 24),
 ('other', 24),
 ('would', 24),
 ('one', 24),
 ('there', 24),
 ('now', 23),
 ('own', 23),
 ('do', 23),
 ('those', 22),
 ('must', 22),
 ('public', 22),
 ('states', 22),
 ('such', 21),
 ('right', 20),
 ('law', 20),
 ('his', 20),
 ('than', 20),
 ('fellow

In [37]:
# train skip-gram (sg=1) embeddings on the Mount Rushmore presidents' sentences only
# NOTE(review): `size` is the gensim 3.x keyword; gensim 4.x renamed it to
# `vector_size` -- confirm against the installed gensim version.
rushmore_embeddings = gensim.models.Word2Vec(rushmore_presidents_sentences, size=96, window=5, min_count=1, workers=2, sg=1)



In [38]:
# words closest to "freedom" in the Mount Rushmore embedding space
# Query through the model's KeyedVectors (`.wv`): calling most_similar on the
# model object itself is deprecated in gensim 3.x and removed in 4.0.
similar_to_freedom = rushmore_embeddings.wv.most_similar("freedom", topn=20)
similar_to_freedom

  """Entry point for launching an IPython kernel.


[('who', 0.9993373155593872),
 ('while', 0.9993331432342529),
 ('world', 0.9993312954902649),
 ('no', 0.9993220567703247),
 ('on', 0.999314546585083),
 ('you', 0.999308705329895),
 ('such', 0.9993037581443787),
 ('less', 0.9993025660514832),
 ('but', 0.9992976188659668),
 ('so', 0.9992945194244385),
 ('must', 0.999290406703949),
 ('government', 0.9992878437042236),
 ('its', 0.9992873668670654),
 ('an', 0.9992815852165222),
 ('or', 0.9992759227752686),
 ('should', 0.9992732405662537),
 ('all', 0.9992726445198059),
 ('shall', 0.9992674589157104),
 ('only', 0.9992610216140747),
 ('has', 0.9992598295211792)]