# INSTRUCTIONS
This Jupyter notebook is for the A4 data-collection assignment.

I used the full text of the speeches that Donald Trump gave at 35 of his rallies up to 2021.


In [20]:
# Import spacy
import spacy

# Import os to upload documents and metadata
import os

# Load spaCy visualizer
from spacy import displacy

# Import pandas DataFrame packages
import pandas as pd
pd.options.mode.chained_assignment = None  # default='warn'

# Import graphing package
import plotly.graph_objects as go
import plotly.express as px

In [21]:
# Create empty lists for file names and contents
texts = []
file_names = []

# Iterate through each entry in the 'archive' folder (in the order os.listdir returns them)
for _file_name in os.listdir('archive'):
    # Look for only text files
    if _file_name.endswith('.txt'):
        # Read the file inside a context manager so the handle is closed as soon
        # as the contents are read, instead of being left open
        with open('archive' + '/' + _file_name, 'r', encoding='utf-8') as _speech_file:
            texts.append(_speech_file.read())
        # Append name of each file to file name list (same position as its text)
        file_names.append(_file_name)

In [22]:
# Pair the two parallel lists under the column labels the DataFrame will use
d = dict(Filename=file_names, Text=texts)

In [23]:
# Build the speech DataFrame: one column per dictionary key
speech_df = pd.DataFrame(data=d)

In [24]:
speech_df.head()

Unnamed: 0,Filename,Text
0,FayettevilleSep9_2019.txt,Thank you everybody. Thank you and Vice Presi...
1,TupeloNov1_2019.txt,"ell, thank you very much. And hello, Tupelo. T..."
2,NewHampshireAug15_2019.txt,Thank you very much everybody. Thank you. Wow...
3,HendersonSep13_2020.txt,"Thank you, thank you. Wow. Wow, and I'm thrill..."
4,OhioSep21_2020.txt,"Wow, that's a big crowd. This is a big crowd. ..."


In [25]:
# Load the 'en_core_web_sm' spaCy pipeline once; later cells call it through `nlp`
nlp = spacy.load(name='en_core_web_sm')

In [26]:
# Thin wrapper so the nlp pipeline can be handed to pandas as a plain function
def process_text(text):
    """Run the module-level `nlp` pipeline on `text` and return its result."""
    parsed = nlp(text)
    return parsed

In [27]:
# Run every speech in the "Text" column through process_text and keep the results in "Doc"
speech_df['Doc'] = speech_df['Text'].map(process_text)

In [28]:
speech_df.head(5)

Unnamed: 0,Filename,Text,Doc
0,FayettevilleSep9_2019.txt,Thank you everybody. Thank you and Vice Presi...,"( , Thank, you, everybody, ., Thank, you, and,..."
1,TupeloNov1_2019.txt,"ell, thank you very much. And hello, Tupelo. T...","(ell, ,, thank, you, very, much, ., And, hello..."
2,NewHampshireAug15_2019.txt,Thank you very much everybody. Thank you. Wow...,"( , Thank, you, very, much, everybody, ., Than..."
3,HendersonSep13_2020.txt,"Thank you, thank you. Wow. Wow, and I'm thrill...","(Thank, you, ,, thank, you, ., Wow, ., Wow, ,,..."
4,OhioSep21_2020.txt,"Wow, that's a big crowd. This is a big crowd. ...","(Wow, ,, that, 's, a, big, crowd, ., This, is,..."


In [29]:
# Collect the surface text of every token in a doc object
def get_token(doc):
    """Return a list holding the `.text` of each token in `doc`, in order."""
    token_texts = []
    for tok in doc:
        token_texts.append(tok.text)
    return token_texts

In [30]:
# Store each speech's token strings in a new "Tokens" column, then preview the first rows
speech_df['Tokens'] = speech_df['Doc'].map(get_token)
speech_df.head(5)

Unnamed: 0,Filename,Text,Doc,Tokens
0,FayettevilleSep9_2019.txt,Thank you everybody. Thank you and Vice Presi...,"( , Thank, you, everybody, ., Thank, you, and,...","[ , Thank, you, everybody, ., Thank, you, and,..."
1,TupeloNov1_2019.txt,"ell, thank you very much. And hello, Tupelo. T...","(ell, ,, thank, you, very, much, ., And, hello...","[ell, ,, thank, you, very, much, ., And, hello..."
2,NewHampshireAug15_2019.txt,Thank you very much everybody. Thank you. Wow...,"( , Thank, you, very, much, everybody, ., Than...","[ , Thank, you, very, much, everybody, ., Than..."
3,HendersonSep13_2020.txt,"Thank you, thank you. Wow. Wow, and I'm thrill...","(Thank, you, ,, thank, you, ., Wow, ., Wow, ,,...","[Thank, you, ,, thank, you, ., Wow, ., Wow, ,,..."
4,OhioSep21_2020.txt,"Wow, that's a big crowd. This is a big crowd. ...","(Wow, ,, that, 's, a, big, crowd, ., This, is,...","[Wow, ,, that, 's, a, big, crowd, ., This, is,..."


In [31]:
# Collect the lemma of every token in a doc object
def get_lemma(doc):
    """Return a list holding the `.lemma_` of each token in `doc`, in order."""
    lemmas = []
    for tok in doc:
        lemmas.append(tok.lemma_)
    return lemmas

# Store each speech's lemmas in a new "Lemmas" column, then preview the first rows
speech_df['Lemmas'] = speech_df['Doc'].map(get_lemma)
speech_df.head()

Unnamed: 0,Filename,Text,Doc,Tokens,Lemmas
0,FayettevilleSep9_2019.txt,Thank you everybody. Thank you and Vice Presi...,"( , Thank, you, everybody, ., Thank, you, and,...","[ , Thank, you, everybody, ., Thank, you, and,...","[ , thank, you, everybody, ., thank, you, and,..."
1,TupeloNov1_2019.txt,"ell, thank you very much. And hello, Tupelo. T...","(ell, ,, thank, you, very, much, ., And, hello...","[ell, ,, thank, you, very, much, ., And, hello...","[ell, ,, thank, you, very, much, ., and, hello..."
2,NewHampshireAug15_2019.txt,Thank you very much everybody. Thank you. Wow...,"( , Thank, you, very, much, everybody, ., Than...","[ , Thank, you, very, much, everybody, ., Than...","[ , thank, you, very, much, everybody, ., than..."
3,HendersonSep13_2020.txt,"Thank you, thank you. Wow. Wow, and I'm thrill...","(Thank, you, ,, thank, you, ., Wow, ., Wow, ,,...","[Thank, you, ,, thank, you, ., Wow, ., Wow, ,,...","[thank, you, ,, thank, you, ., wow, ., wow, ,,..."
4,OhioSep21_2020.txt,"Wow, that's a big crowd. This is a big crowd. ...","(Wow, ,, that, 's, a, big, crowd, ., This, is,...","[Wow, ,, that, 's, a, big, crowd, ., This, is,...","[wow, ,, that, be, a, big, crowd, ., this, be,..."


In [32]:
# Define a function to retrieve part-of-speech information from a doc object
def get_pos(doc):
    """Return a list of `(pos_, tag_)` tuples, one per token in `doc`, in order."""
    pos_pairs = []
    for tok in doc:
        pair = (tok.pos_, tok.tag_)
        pos_pairs.append(pair)
    return pos_pairs

# Run the part-of-speech retrieval function on the doc objects and store the result in "POS"
speech_df['POS'] = speech_df['Doc'].map(get_pos)

In [35]:
speech_df.head(5)

Unnamed: 0,Filename,Text,Doc,Tokens,Lemmas,POS
0,FayettevilleSep9_2019.txt,Thank you everybody. Thank you and Vice Presi...,"( , Thank, you, everybody, ., Thank, you, and,...","[ , Thank, you, everybody, ., Thank, you, and,...","[ , thank, you, everybody, ., thank, you, and,...","[(SPACE, _SP), (VERB, VBP), (PRON, PRP), (PRON..."
1,TupeloNov1_2019.txt,"ell, thank you very much. And hello, Tupelo. T...","(ell, ,, thank, you, very, much, ., And, hello...","[ell, ,, thank, you, very, much, ., And, hello...","[ell, ,, thank, you, very, much, ., and, hello...","[(INTJ, UH), (PUNCT, ,), (VERB, VBP), (PRON, P..."
2,NewHampshireAug15_2019.txt,Thank you very much everybody. Thank you. Wow...,"( , Thank, you, very, much, everybody, ., Than...","[ , Thank, you, very, much, everybody, ., Than...","[ , thank, you, very, much, everybody, ., than...","[(SPACE, _SP), (VERB, VBP), (PRON, PRP), (ADV,..."
3,HendersonSep13_2020.txt,"Thank you, thank you. Wow. Wow, and I'm thrill...","(Thank, you, ,, thank, you, ., Wow, ., Wow, ,,...","[Thank, you, ,, thank, you, ., Wow, ., Wow, ,,...","[thank, you, ,, thank, you, ., wow, ., wow, ,,...","[(VERB, VBP), (PRON, PRP), (PUNCT, ,), (VERB, ..."
4,OhioSep21_2020.txt,"Wow, that's a big crowd. This is a big crowd. ...","(Wow, ,, that, 's, a, big, crowd, ., This, is,...","[Wow, ,, that, 's, a, big, crowd, ., This, is,...","[wow, ,, that, be, a, big, crowd, ., this, be,...","[(INTJ, UH), (PUNCT, ,), (PRON, DT), (AUX, VBZ..."


In [36]:
# Define function to pull the proper nouns out of a Doc object
def extract_proper_nouns(doc):
    """Return the `.text` of every token in `doc` whose `pos_` is 'PROPN', in order."""
    proper_nouns = []
    for tok in doc:
        # Skip anything that is not tagged as a proper noun
        if tok.pos_ != 'PROPN':
            continue
        proper_nouns.append(tok.text)
    return proper_nouns

# Run the proper-noun extractor on the Doc column and keep the lists in a new column
speech_df['Proper_Nouns'] = speech_df['Doc'].map(extract_proper_nouns)

In [40]:
# Show the proper-noun lists for the rows at index labels 3 and 34
speech_df.loc[[3, 34], 'Proper_Nouns'].tolist()

[['Henderson',
  'Nevada',
  'White',
  'House',
  'Los',
  'Angeles',
  'God',
  'America',
  'hell',
  'Las',
  'Vegas',
  'Sleepy',
  'Joe',
  'Biden',
  'Dana',
  'White',
  'Dana',
  'White',
  'Sir',
  'Joe',
  'Biden',
  'Joe',
  'Biden',
  'Biden',
  'Biden',
  'Joe',
  'Biden',
  'top-30',
  'Democrat',
  'Democrat',
  'Democrat',
  'run',
  'Minneapolis',
  'National',
  'Guard',
  'Seattle',
  'Seattle',
  'Portland',
  'US',
  'Marshalls',
  'US',
  'Marshalls',
  'Sleepy',
  'Joe',
  'Bernie',
  'Sleepy',
  'Joe',
  'Joe',
  'President',
  'Xi',
  'China',
  'President',
  'Putin',
  'Russia',
  'China',
  'China',
  'China',
  'China',
  'Europe',
  'Sleepy',
  'Joe',
  'Kim',
  'Jong',
  'un',
  'Kim',
  'Jong',
  'un',
  'Joe',
  'Joe',
  'Joe',
  'New',
  'York',
  'New',
  'York',
  'New',
  'York',
  'New',
  'York',
  'Chicago',
  'Fraternal',
  'Order',
  'Police',
  'Chicago',
  'Oklahoma',
  'Texas',
  'Florida',
  'New',
  'York',
  'Chicago',
  'Ohio',
  'New',

In [42]:
# Define function to extract named-entity labels from doc objects.
# It has its own name because a later cell defines `extract_named_entities`
# with a different return value (the entities themselves); sharing one name
# would silently replace this label extractor once that cell runs.
def extract_entity_labels(doc):
    """Return the `label_` (e.g. 'PERSON', 'GPE') of each entity in `doc.ents`, in order."""
    return [ent.label_ for ent in doc.ents]

# Apply function to Doc column and store resulting entity labels in new column
speech_df['Named_Entities'] = speech_df['Doc'].apply(extract_entity_labels)
speech_df['Named_Entities']

0     [PERSON, GPE, GPE, GPE, ORDINAL, DATE, GPE, OR...
1     [PERSON, TIME, GPE, GPE, PERSON, CARDINAL, DAT...
2     [ORG, GPE, ORDINAL, ORDINAL, CARDINAL, CARDINA...
3     [GPE, CARDINAL, NORP, DATE, GPE, DATE, FAC, DA...
4     [PERSON, GPE, GPE, CARDINAL, NORP, DATE, GPE, ...
5     [GPE, GPE, GPE, CARDINAL, DATE, NORP, GPE, TIM...
6     [PERSON, DATE, GPE, GPE, GPE, CARDINAL, CARDIN...
7     [GPE, GPE, ORG, TIME, TIME, DATE, ORG, GPE, GP...
8     [GPE, GPE, CARDINAL, GPE, PERSON, PERSON, GPE,...
9     [GPE, GPE, GPE, DATE, LOC, CARDINAL, NORP, DAT...
10    [GPE, GPE, PERSON, DATE, GPE, DATE, FAC, PERSO...
11    [GPE, TIME, GPE, PERSON, ORG, NORP, NORP, GPE,...
12    [CARDINAL, GPE, MONEY, GPE, ORG, GPE, GPE, NOR...
13    [GPE, DATE, CARDINAL, CARDINAL, CARDINAL, TIME...
14    [PERSON, PERSON, PERSON, PERSON, QUANTITY, PER...
15    [PERSON, GPE, GPE, DATE, DATE, DATE, ORDINAL, ...
16    [GPE, GPE, CARDINAL, NORP, GPE, DATE, GPE, PER...
17    [GPE, GPE, GPE, CARDINAL, ORG, DATE, DATE,

In [43]:
# Define function to collect the named entities themselves from a doc object
def extract_named_entities(doc):
    """Return the items of `doc.ents` as a list, in order."""
    return list(doc.ents)

# Apply function to Doc column and store the resulting entity objects in new column
speech_df['NE_Words'] = speech_df['Doc'].map(extract_named_entities)
speech_df['NE_Words']

0     [(Mike, Pence), (Fayetteville), (Fayetteville)...
1     [(Tupelo), (tonight), (Mississippi), (Mississi...
2     [(Manchester), (New, Hampshire), (third), (fou...
3     [(Henderson), (Thousands), (American), (51, da...
4     [(Swanton), (Toledo), (Ohio), (thousands), (Am...
5     [(Phoenix), (Phoenix), (Arizona), (thousands),...
6     [(Pence), (Christmas), (Michigan), (Michigan),...
7     [(California), (Pennsylvania), (Trump), (a, fe...
8     [(Houston), (Texas), (one), (America), (Modi, ...
9     [(Colorado), (Colorado), (Colorado, Springs), ...
10    [(Pennsylvania), (Latrobe), (Arnold, Palmer), ...
11    [(Dallas), (tonight), (Texas), (Louie, Vuitton...
12    [(two), (Japan), (three, ,, 40, billion), (Jap...
13    [(Minnesota), (the, day), (20,000, -, plus), (...
14    [(Obama), (Joe, Biden), (Art), (Brandon), (122...
15    [(Mike, Pence), (Toledo), (Toledo), (2016), (a...
16    [(North, Carolina), (Charlotte), (thousands), ...
17    [(North, Carolina), (North, Carolina), (Wi

In [45]:
# Pick the Doc stored under index label 1
# NOTE(review): label 1 is the second row, not the first; use label 0 if the first speech was intended
doc = speech_df.loc[1, 'Doc']

# Render that single speech with its named entities highlighted in the notebook
displacy.render(doc, style='ent', jupyter=True)

In [46]:
# Save the annotated DataFrame as a csv file
# The path is relative, so the file is written to the current working directory
speech_df.to_csv(path_or_buf='Trump_speeches_with_spaCy_tags.csv')

In [47]:
speech_df

Unnamed: 0,Filename,Text,Doc,Tokens,Lemmas,POS,Proper_Nouns,Named_Entities,NE_Words
0,FayettevilleSep9_2019.txt,Thank you everybody. Thank you and Vice Presi...,"( , Thank, you, everybody, ., Thank, you, and,...","[ , Thank, you, everybody, ., Thank, you, and,...","[ , thank, you, everybody, ., thank, you, and,...","[(SPACE, _SP), (VERB, VBP), (PRON, PRP), (PRON...","[Vice, President, Mike, Pence, Fayetteville, H...","[PERSON, GPE, GPE, GPE, ORDINAL, DATE, GPE, OR...","[(Mike, Pence), (Fayetteville), (Fayetteville)..."
1,TupeloNov1_2019.txt,"ell, thank you very much. And hello, Tupelo. T...","(ell, ,, thank, you, very, much, ., And, hello...","[ell, ,, thank, you, very, much, ., And, hello...","[ell, ,, thank, you, very, much, ., and, hello...","[(INTJ, UH), (PUNCT, ,), (VERB, VBP), (PRON, P...","[Tupelo, Mississippi, Mississippi, Elvis, Pres...","[PERSON, TIME, GPE, GPE, PERSON, CARDINAL, DAT...","[(Tupelo), (tonight), (Mississippi), (Mississi..."
2,NewHampshireAug15_2019.txt,Thank you very much everybody. Thank you. Wow...,"( , Thank, you, very, much, everybody, ., Than...","[ , Thank, you, very, much, everybody, ., Than...","[ , thank, you, very, much, everybody, ., than...","[(SPACE, _SP), (VERB, VBP), (PRON, PRP), (ADV,...","[Manchester, New, Hampshire, Patriots, America...","[ORG, GPE, ORDINAL, ORDINAL, CARDINAL, CARDINA...","[(Manchester), (New, Hampshire), (third), (fou..."
3,HendersonSep13_2020.txt,"Thank you, thank you. Wow. Wow, and I'm thrill...","(Thank, you, ,, thank, you, ., Wow, ., Wow, ,,...","[Thank, you, ,, thank, you, ., Wow, ., Wow, ,,...","[thank, you, ,, thank, you, ., wow, ., wow, ,,...","[(VERB, VBP), (PRON, PRP), (PUNCT, ,), (VERB, ...","[Henderson, Nevada, White, House, Los, Angeles...","[GPE, CARDINAL, NORP, DATE, GPE, DATE, FAC, DA...","[(Henderson), (Thousands), (American), (51, da..."
4,OhioSep21_2020.txt,"Wow, that's a big crowd. This is a big crowd. ...","(Wow, ,, that, 's, a, big, crowd, ., This, is,...","[Wow, ,, that, 's, a, big, crowd, ., This, is,...","[wow, ,, that, be, a, big, crowd, ., this, be,...","[(INTJ, UH), (PUNCT, ,), (PRON, DT), (AUX, VBZ...","[Swanton, Toledo, Ohio, Ohio, Bob, Paduchik, B...","[PERSON, GPE, GPE, CARDINAL, NORP, DATE, GPE, ...","[(Swanton), (Toledo), (Ohio), (thousands), (Am..."
5,PhoenixFeb19_2020.txt,"Thank you very much, Phoenix. We love to be b...","( , Thank, you, very, much, ,, Phoenix, ., We,...","[ , Thank, you, very, much, ,, Phoenix, ., We,...","[ , thank, you, very, much, ,, Phoenix, ., we,...","[(SPACE, _SP), (VERB, VBP), (PRON, PRP), (ADV,...","[Phoenix, Phoenix, Arizona, God, November, Dem...","[GPE, GPE, GPE, CARDINAL, DATE, NORP, GPE, TIM...","[(Phoenix), (Phoenix), (Arizona), (thousands),..."
6,BattleCreekDec19_2019.txt,Thank you. Thank you. Thank you to Vice Presid...,"(Thank, you, ., Thank, you, ., Thank, you, to,...","[Thank, you, ., Thank, you, ., Thank, you, to,...","[thank, you, ., thank, you, ., thank, you, to,...","[(VERB, VBP), (PRON, PRP), (PUNCT, .), (VERB, ...","[Vice, President, Pence, Merry, Christmas, Mic...","[PERSON, DATE, GPE, GPE, GPE, CARDINAL, CARDIN...","[(Pence), (Christmas), (Michigan), (Michigan),..."
7,PittsburghSep22_2020.txt,Doesn't have the power. Doesn't have the stayi...,"(Does, n't, have, the, power, ., Does, n't, ha...","[Does, n't, have, the, power, ., Does, n't, ha...","[do, not, have, the, power, ., do, not, have, ...","[(AUX, VBZ), (PART, RB), (VERB, VB), (DET, DT)...","[California, Pennsylvania, Commonwealth, Penns...","[GPE, GPE, ORG, TIME, TIME, DATE, ORG, GPE, GP...","[(California), (Pennsylvania), (Trump), (a, fe..."
8,TexasSep23_2019.txt,"Hello, Houston. I am so thrilled to be here in...","(Hello, ,, Houston, ., I, am, so, thrilled, to...","[Hello, ,, Houston, ., I, am, so, thrilled, to...","[hello, ,, Houston, ., I, be, so, thrilled, to...","[(INTJ, UH), (PUNCT, ,), (PROPN, NNP), (PUNCT,...","[Houston, Texas, America, Prime, Minister, Mod...","[GPE, GPE, CARDINAL, GPE, PERSON, PERSON, GPE,...","[(Houston), (Texas), (one), (America), (Modi, ..."
9,ColoradorSpringsFeb20_2020.txt,"Hello Colorado. We love Colorado, most beautif...","(Hello, Colorado, ., We, love, Colorado, ,, mo...","[Hello, Colorado, ., We, love, Colorado, ,, mo...","[hello, Colorado, ., we, love, Colorado, ,, mo...","[(INTJ, UH), (PROPN, NNP), (PUNCT, .), (PRON, ...","[Colorado, Colorado, Colorado, Springs, Rocky,...","[GPE, GPE, GPE, DATE, LOC, CARDINAL, NORP, DAT...","[(Colorado), (Colorado), (Colorado, Springs), ..."
