# Text analysis of Trump's tweets

Stylometry: to find out the style of writing or frequency of words from Mr. Donald Trump

In [1]:
import pandas as pd

In [2]:
#read the data file we scraped from Donald Trump's twitter account
df = pd.read_csv('Trump(processed).csv', encoding='utf8')
df.head(30)

Unnamed: 0,type,id,date,tweet,comments,retweets,likes
0,fake,1326926226888544256,2020-11-12 16:34:00+00:00,“REPORT: DOMINION DELETED 2.7 MILLION TRUMP VO...,17300000,24700000,61900000
1,fake,1326920264203046915,2020-11-12 16:10:18+00:00,.@FoxNews daytime ratings have completely coll...,9000000,8600000,34800000
2,fake,1326884956749127680,2020-11-12 13:50:00+00:00,"“OK, I’ve seen enough. What’s going to happen ...",3600000,8000000,31800000
3,fake,1326679385966047236,2020-11-12 00:13:08+00:00,Nobody wants to report that Pennsylvania and M...,8900000,1600000,41000000
4,fact,1326673766915641345,2020-11-11 23:50:49+00:00,I am pleased to announce that I have given my ...,4100000,7100000,35800000
5,fake,1326673298692972544,2020-11-11 23:48:57+00:00,Everyone is asking why the recent presidential...,5800000,8600000,49300000
6,fact,1326525851752656898,2020-11-11 14:03:03+00:00,"A guy named Al Schmidt, a Philadelphia Commiss...",6700000,7400000,33400000
7,fake,1326519025552265216,2020-11-11 13:35:55+00:00,The Fake Pollsters at @ABC/@washingtonpost pro...,3800000,6900000,29800000
8,fact,1326342742801326083,2020-11-11 01:55:26+00:00,Andrew McCabe was exposed for who he is today ...,2000000,6600000,29200000
9,fake,1326327582074220544,2020-11-11 00:55:12+00:00,"“I don’t care what state you’re in, this compu...",4800000,9200000,39900000


In [23]:
#create data frame for the tweets
df_Trumptweets = pd.DataFrame(df['tweet'], columns=["tweet"])

In [24]:
#display first few
df_Trumptweets.head()

Unnamed: 0,tweet
0,“REPORT: DOMINION DELETED 2.7 MILLION TRUMP VO...
1,.@FoxNews daytime ratings have completely coll...
2,"“OK, I’ve seen enough. What’s going to happen ..."
3,Nobody wants to report that Pennsylvania and M...
4,I am pleased to announce that I have given my ...


In [25]:
df_Trumptweets['word_count'] = df_Trumptweets['tweet'].apply(lambda x: len(str(x).split(" ")))
# count the word for each column
df_Trumptweets[['tweet', 'word_count']].head()
#[['titles', 'word_count']] is used to set the format of the table, the row

Unnamed: 0,tweet,word_count
0,“REPORT: DOMINION DELETED 2.7 MILLION TRUMP VO...,38
1,.@FoxNews daytime ratings have completely coll...,43
2,"“OK, I’ve seen enough. What’s going to happen ...",30
3,Nobody wants to report that Pennsylvania and M...,45
4,I am pleased to announce that I have given my ...,45


In [26]:
#removing empty values if any
df_Trumptweets['tweet'] = df_Trumptweets['tweet'].fillna("")

# Normalization

In [27]:
#split the title, turn every word into lower case and join them together
df_Trumptweets['tweet'] = df_Trumptweets['tweet'].apply(
    lambda x: " ".join(x.lower() for x in x.split()))
#display the first few
df_Trumptweets['tweet'].head()

0    “report: dominion deleted 2.7 million trump vo...
1    .@foxnews daytime ratings have completely coll...
2    “ok, i’ve seen enough. what’s going to happen ...
3    nobody wants to report that pennsylvania and m...
4    i am pleased to announce that i have given my ...
Name: tweet, dtype: object

In [28]:
#removing punctuations
df_Trumptweets['tweet'] = df_Trumptweets['tweet'].str.replace('[^\w\s]', '')
df_Trumptweets['tweet'].head()

0    report dominion deleted 27 million trump votes...
1    foxnews daytime ratings have completely collap...
2    ok ive seen enough whats going to happen to th...
3    nobody wants to report that pennsylvania and m...
4    i am pleased to announce that i have given my ...
Name: tweet, dtype: object

In [29]:
import nltk

In [30]:
from nltk.corpus import stopwords
stop = stopwords.words('english')
#import the stop words list

In [1]:
#stop
#A list to know what are stop words

In [32]:
# removing stop words(word has no specific meaning eg: on, the)
df_Trumptweets['tweet'] = df_Trumptweets['tweet'].apply(
    lambda x: " ".join(x for x in x.split() if x not in stop))
df_Trumptweets['tweet'].head()

0    report dominion deleted 27 million trump votes...
1    foxnews daytime ratings completely collapsed w...
2    ok ive seen enough whats going happen guys mcc...
3    nobody wants report pennsylvania michigan didn...
4    pleased announce given full support endorsemen...
Name: tweet, dtype: object

In [33]:
import numpy as np
#removing number
df_Trumptweets['tweet'] = df_Trumptweets['tweet'].str.replace('\d+', '')
df_Trumptweets['tweet'].replace(' ', np.nan, inplace=True)

In [2]:
!download textblob package
#pip install -U textblob 
!python -m textblob.download_corpora

In [35]:
from textblob import Word
#converts the word into its root word
df_Trumptweets['tweet'] = df_Trumptweets['tweet'].apply(
    lambda x: " ".join([Word(word).lemmatize() for word in x.split()]))
df_Trumptweets['tweet'].head()

0    report dominion deleted million trump vote nat...
1    foxnews daytime rating completely collapsed we...
2    ok ive seen enough whats going happen guy mcca...
3    nobody want report pennsylvania michigan didnt...
4    pleased announce given full support endorsemen...
Name: tweet, dtype: object

In [36]:
string=' '.join(df_Trumptweets['tweet'])
# join all title into a string

In [37]:
lst=string.split()
#split the word

In [38]:
srs = pd.Series(lst) 
#store the splited word vertically

In [39]:
srs.value_counts()
#word count of each 

vote          18
election      17
win           10
state         10
ballot        10
              ..
true           1
secretary      1
stock          1
adamlaxalt     1
open           1
Length: 468, dtype: int64

In [40]:
srs.value_counts()[:15] 
#show the top 15

vote            18
election        17
win             10
state           10
ballot          10
president        8
pennsylvania     7
vaccine          7
big              7
u                7
allowed          6
would            6
trump            6
legal            6
senate           5
dtype: int64

In [41]:
# common words screening 
freq_common = pd.Series(' '.join(df_Trumptweets['tweet']).split()).value_counts()[:15]
freq_common

vote            18
election        17
win             10
state           10
ballot          10
president        8
pennsylvania     7
vaccine          7
big              7
u                7
allowed          6
would            6
trump            6
legal            6
senate           5
dtype: int64

In [51]:
# rare words screening
freq_rare = pd.Series(' '.join(
    df_Trumptweets['tweet']).split()).value_counts()[-15:]
freq_rare

determined        1
thejusticedept    1
another           1
miraculously      1
seanhannity       1
deplorable        1
cast              1
flagrantly        1
ken               1
quinnipiacpoll    1
true              1
secretary         1
stock             1
adamlaxalt        1
open              1
dtype: int64

# TF-IDF

In [43]:
#distinguishing Trump's tweets with others
tf1 = df_Trumptweets['tweet'].apply(lambda x: pd.value_counts(x.split(" "))).sum(
    axis=0).reset_index()

tf1.columns = ['words', 'tf']
tf1.sort_values(['tf'], ascending=False).head(15)

Unnamed: 0,words,tf
0,vote,18.0
24,election,17.0
176,ballot,10.0
68,win,10.0
16,state,10.0
13,president,8.0
6,pennsylvania,7.0
228,vaccine,7.0
123,big,7.0
84,u,7.0
