# Text analysis of Trump's tweets

Stylometry: to find out the style of writing or frequency of words from Mr. Donald Trump

In [1]:
import pandas as pd

In [2]:
#read the data file we scraped from Donald Trump's twitter account
df = pd.read_csv('Trump(processed).csv', encoding='utf8')
#display the first ten row
df.head(10)

Unnamed: 0,type,id,date,tweet,comments,retweets,likes
0,fake,1326926226888544256,2020-11-12 16:34:00+00:00,“REPORT: DOMINION DELETED 2.7 MILLION TRUMP VO...,17300000.0,24700000.0,61900000.0
1,fake,1326920264203046915,2020-11-12 16:10:18+00:00,.@FoxNews daytime ratings have completely coll...,9000000.0,8600000.0,34800000.0
2,fake,1326884956749127680,2020-11-12 13:50:00+00:00,"“OK, I’ve seen enough. What’s going to happen ...",3600000.0,8000000.0,31800000.0
3,fake,1326679385966047236,2020-11-12 00:13:08+00:00,Nobody wants to report that Pennsylvania and M...,8900000.0,1600000.0,41000000.0
4,fact,1326673766915641345,2020-11-11 23:50:49+00:00,I am pleased to announce that I have given my ...,4100000.0,7100000.0,35800000.0
5,fake,1326673298692972544,2020-11-11 23:48:57+00:00,Everyone is asking why the recent presidential...,5800000.0,8600000.0,49300000.0
6,fact,1326525851752656898,2020-11-11 14:03:03+00:00,"A guy named Al Schmidt, a Philadelphia Commiss...",6700000.0,7400000.0,33400000.0
7,fake,1326519025552265216,2020-11-11 13:35:55+00:00,The Fake Pollsters at @ABC/@washingtonpost pro...,3800000.0,6900000.0,29800000.0
8,fact,1326342742801326083,2020-11-11 01:55:26+00:00,Andrew McCabe was exposed for who he is today ...,2000000.0,6600000.0,29200000.0
9,fake,1326327582074220544,2020-11-11 00:55:12+00:00,"“I don’t care what state you’re in, this compu...",4800000.0,9200000.0,39900000.0


In [3]:
#create data frame for the tweets
df_Trumptweets = pd.DataFrame(df['tweet'], columns=["tweet"])

In [4]:
#display the first ten row
df_Trumptweets.head(10)

Unnamed: 0,tweet
0,“REPORT: DOMINION DELETED 2.7 MILLION TRUMP VO...
1,.@FoxNews daytime ratings have completely coll...
2,"“OK, I’ve seen enough. What’s going to happen ..."
3,Nobody wants to report that Pennsylvania and M...
4,I am pleased to announce that I have given my ...
5,Everyone is asking why the recent presidential...
6,"A guy named Al Schmidt, a Philadelphia Commiss..."
7,The Fake Pollsters at @ABC/@washingtonpost pro...
8,Andrew McCabe was exposed for who he is today ...
9,"“I don’t care what state you’re in, this compu..."


In [5]:
df_Trumptweets['word_count'] = df_Trumptweets['tweet'].apply(lambda x: len(str(x).split(" ")))
# count the word for each column
df_Trumptweets[['tweet', 'word_count']].head(10)
#[['titles', 'word_count']] is used to set the format of the table, the row

Unnamed: 0,tweet,word_count
0,“REPORT: DOMINION DELETED 2.7 MILLION TRUMP VO...,38
1,.@FoxNews daytime ratings have completely coll...,43
2,"“OK, I’ve seen enough. What’s going to happen ...",30
3,Nobody wants to report that Pennsylvania and M...,45
4,I am pleased to announce that I have given my ...,45
5,Everyone is asking why the recent presidential...,27
6,"A guy named Al Schmidt, a Philadelphia Commiss...",49
7,The Fake Pollsters at @ABC/@washingtonpost pro...,46
8,Andrew McCabe was exposed for who he is today ...,52
9,"“I don’t care what state you’re in, this compu...",28


In [6]:
#removing empty values if any
df_Trumptweets['tweet'] = df_Trumptweets['tweet'].fillna("")

# Normalization

In [7]:
#split the title, turn every word into lower case and join them together
df_Trumptweets['tweet'] = df_Trumptweets['tweet'].apply(
    lambda x: " ".join(x.lower() for x in x.split()))
#display the first ten row
df_Trumptweets['tweet'].head(10)

0    “report: dominion deleted 2.7 million trump vo...
1    .@foxnews daytime ratings have completely coll...
2    “ok, i’ve seen enough. what’s going to happen ...
3    nobody wants to report that pennsylvania and m...
4    i am pleased to announce that i have given my ...
5    everyone is asking why the recent presidential...
6    a guy named al schmidt, a philadelphia commiss...
7    the fake pollsters at @abc/@washingtonpost pro...
8    andrew mccabe was exposed for who he is today ...
9    “i don’t care what state you’re in, this compu...
Name: tweet, dtype: object

In [8]:
#removing punctuations
df_Trumptweets['tweet'] = df_Trumptweets['tweet'].str.replace('[^\w\s]', '')
df_Trumptweets['tweet'].head(10)

0    report dominion deleted 27 million trump votes...
1    foxnews daytime ratings have completely collap...
2    ok ive seen enough whats going to happen to th...
3    nobody wants to report that pennsylvania and m...
4    i am pleased to announce that i have given my ...
5    everyone is asking why the recent presidential...
6    a guy named al schmidt a philadelphia commissi...
7    the fake pollsters at abcwashingtonpost produc...
8    andrew mccabe was exposed for who he is today ...
9    i dont care what state youre in this computer ...
Name: tweet, dtype: object

In [9]:
import nltk

In [10]:
from nltk.corpus import stopwords
stop = stopwords.words('english')
#import the stop words list

In [11]:
stop
#A list to know what are stop words

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [12]:
# removing stop words(word has no specific meaning eg: on, the)
df_Trumptweets['tweet'] = df_Trumptweets['tweet'].apply(
    lambda x: " ".join(x for x in x.split() if x not in stop))
df_Trumptweets['tweet'].head(10)

0    report dominion deleted 27 million trump votes...
1    foxnews daytime ratings completely collapsed w...
2    ok ive seen enough whats going happen guys mcc...
3    nobody wants report pennsylvania michigan didn...
4    pleased announce given full support endorsemen...
5    everyone asking recent presidential polls inac...
6    guy named al schmidt philadelphia commissioner...
7    fake pollsters abcwashingtonpost produced poss...
8    andrew mccabe exposed today us senate totally ...
9    dont care state youre computer voting system w...
Name: tweet, dtype: object

In [13]:
import numpy as np
#removing number
df_Trumptweets['tweet'] = df_Trumptweets['tweet'].str.replace('\d+', '')
df_Trumptweets['tweet'].replace(' ', np.nan, inplace=True)

In [14]:
#download textblob package
!pip install -U textblob 
!python -m textblob.download_corpora

Requirement already up-to-date: textblob in d:\anaconda\lib\site-packages (0.15.3)
Finished.


[nltk_data] Downloading package brown to
[nltk_data]     C:\Users\pklok_gaming\AppData\Roaming\nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\pklok_gaming\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\pklok_gaming\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\pklok_gaming\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package conll2000 to
[nltk_data]     C:\Users\pklok_gaming\AppData\Roaming\nltk_data...
[nltk_data]   Package conll2000 is already up-to-date!
[nltk_data] Downloading package movie_reviews to
[nltk_data]     C:\Users\pklok_gaming\AppData\Roaming\nltk_data...
[nltk_data]   Pa

In [15]:
from textblob import Word
#converts the word into its root word
df_Trumptweets['tweet'] = df_Trumptweets['tweet'].apply(
    lambda x: " ".join([Word(word).lemmatize() for word in x.split()]))
df_Trumptweets['tweet'].head(10)

0    report dominion deleted million trump vote nat...
1    foxnews daytime rating completely collapsed we...
2    ok ive seen enough whats going happen guy mcca...
3    nobody want report pennsylvania michigan didnt...
4    pleased announce given full support endorsemen...
5    everyone asking recent presidential poll inacc...
6    guy named al schmidt philadelphia commissioner...
7    fake pollster abcwashingtonpost produced possi...
8    andrew mccabe exposed today u senate totally d...
9    dont care state youre computer voting system w...
Name: tweet, dtype: object

In [16]:
string=' '.join(df_Trumptweets['tweet'])
# join all title into a string

In [17]:
lst=string.split()
#split the word

In [18]:
srs = pd.Series(lst) 
#store the splited word vertically

In [19]:
srs.value_counts()
#word count of each 

vote          18
election      17
ballot        10
state         10
win           10
              ..
leadership     1
matthew        1
learned        1
know           1
tabulation     1
Length: 468, dtype: int64

In [20]:
srs.value_counts()[:15] 
#show the top 15

vote            18
election        17
ballot          10
state           10
win             10
president        8
vaccine          7
u                7
big              7
pennsylvania     7
trump            6
allowed          6
would            6
legal            6
republican       5
dtype: int64

In [21]:
# common words screening 
freq_common = pd.Series(' '.join(df_Trumptweets['tweet']).split()).value_counts()[:15]
freq_common

vote            18
election        17
ballot          10
state           10
win             10
president        8
vaccine          7
u                7
big              7
pennsylvania     7
trump            6
allowed          6
would            6
legal            6
republican       5
dtype: int64

In [22]:
# rare words screening
freq_rare = pd.Series(' '.join(
    df_Trumptweets['tweet']).split()).value_counts()[-15:]
freq_rare

andor                1
perhaps              1
abcwashingtonpost    1
integrity            1
stock                1
schmidt              1
progress             1
produced             1
seen                 1
bad                  1
leadership           1
matthew              1
learned              1
know                 1
tabulation           1
dtype: int64

# TF-IDF

In [23]:
#distinguishing Trump's tweets with others
tf1 = df_Trumptweets['tweet'].apply(lambda x: pd.value_counts(x.split(" "))).sum(
    axis=0).reset_index()

tf1.columns = ['words', 'tf']
tf1.sort_values(['tf'], ascending=False).head(15)

Unnamed: 0,words,tf
1,vote,18.0
27,election,17.0
66,win,10.0
8,state,10.0
176,ballot,10.0
15,president,8.0
229,vaccine,7.0
124,big,7.0
6,pennsylvania,7.0
93,u,7.0
