# Text analysis of Biden's tweets

Stylometry: to find out the style of writing or frequency of words from Mr. Joe Biden

In [1]:
import pandas as pd

In [2]:
#read the data file we scraped from Donald Trump's twitter account
df = pd.read_csv('Biden(processed).csv', encoding='utf8')
#display the first ten row
df.head(10)

Unnamed: 0,type,id,date,tweet,comments,retweets,likes
0,fact,1326968002839908353,2020-11-12 19:20:00+00:00,I extend my deep condolences to the loved ones...,33000,5000,497000
1,fact,1326557341697839106,2020-11-11 16:08:11+00:00,"Today, we honor the service of those who have ...",8700,35000,363000
2,fact,1326344141446373376,2020-11-11 02:01:00+00:00,We are going to build a health care system tha...,24700,4000,437000
3,fact,1326306141056364544,2020-11-10 23:30:00+00:00,"When I’m speaking to foreign leaders, I’m tell...",37700,16500,632200
4,fact,1326284750420643846,2020-11-10 22:05:00+00:00,"Come January, we will work quickly with Congre...",12600,227000,305300
5,fact,1326266378991542272,2020-11-10 20:52:00+00:00,"Beginning on January 20th, Vice President-elec...",15500,33200,444500
6,fact,1325890910635384839,2020-11-09 20:00:01+00:00,"I won't be president until January 20th, but m...",161700,23900,15000
7,fact,1325885871875190784,2020-11-09 19:40:00+00:00,The bottom line: I will spare no effort to tur...,15900,29800,488500
8,fact,1325880083618426881,2020-11-09 19:17:00+00:00,The challenge before us right now is still imm...,7700,14400,248300
9,fact,1325873288711712769,2020-11-09 18:50:00+00:00,My COVID-19 Transition Advisory Board will adv...,4500,14200,197700


In [3]:
#create data frame for the tweets
df_Bidentweets = pd.DataFrame(df['tweet'], columns=["tweet"])

In [4]:
#display the first ten row
df_Bidentweets.head(10)

Unnamed: 0,tweet
0,I extend my deep condolences to the loved ones...
1,"Today, we honor the service of those who have ..."
2,We are going to build a health care system tha...
3,"When I’m speaking to foreign leaders, I’m tell..."
4,"Come January, we will work quickly with Congre..."
5,"Beginning on January 20th, Vice President-elec..."
6,"I won't be president until January 20th, but m..."
7,The bottom line: I will spare no effort to tur...
8,The challenge before us right now is still imm...
9,My COVID-19 Transition Advisory Board will adv...


In [5]:
df_Bidentweets['word_count'] = df_Bidentweets['tweet'].apply(lambda x: len(str(x).split(" ")))
# count the word for each column
df_Bidentweets[['tweet', 'word_count']].head(10)
#[['titles', 'word_count']] is used to set the format of the table, the row

Unnamed: 0,tweet,word_count
0,I extend my deep condolences to the loved ones...,48
1,"Today, we honor the service of those who have ...",49
2,We are going to build a health care system tha...,25
3,"When I’m speaking to foreign leaders, I’m tell...",23
4,"Come January, we will work quickly with Congre...",25
5,"Beginning on January 20th, Vice President-elec...",29
6,"I won't be president until January 20th, but m...",18
7,The bottom line: I will spare no effort to tur...,13
8,The challenge before us right now is still imm...,23
9,My COVID-19 Transition Advisory Board will adv...,29


In [6]:
#removing empty values if any
df_Bidentweets['tweet'] = df_Bidentweets['tweet'].fillna("")

# Normalization

In [7]:
#split the title, turn every word into lower case and join them together
df_Bidentweets['tweet'] = df_Bidentweets['tweet'].apply(
    lambda x: " ".join(x.lower() for x in x.split()))
#display the first ten row
df_Bidentweets['tweet'].head(10)

0    i extend my deep condolences to the loved ones...
1    today, we honor the service of those who have ...
2    we are going to build a health care system tha...
3    when i’m speaking to foreign leaders, i’m tell...
4    come january, we will work quickly with congre...
5    beginning on january 20th, vice president-elec...
6    i won't be president until january 20th, but m...
7    the bottom line: i will spare no effort to tur...
8    the challenge before us right now is still imm...
9    my covid-19 transition advisory board will adv...
Name: tweet, dtype: object

In [8]:
#removing punctuations
df_Bidentweets['tweet'] = df_Bidentweets['tweet'].str.replace('[^\w\s]', '')
df_Bidentweets['tweet'].head(10)

0    i extend my deep condolences to the loved ones...
1    today we honor the service of those who have w...
2    we are going to build a health care system tha...
3    when im speaking to foreign leaders im telling...
4    come january we will work quickly with congres...
5    beginning on january 20th vice presidentelect ...
6    i wont be president until january 20th but my ...
7    the bottom line i will spare no effort to turn...
8    the challenge before us right now is still imm...
9    my covid19 transition advisory board will advi...
Name: tweet, dtype: object

In [9]:
import nltk

In [10]:
from nltk.corpus import stopwords
stop = stopwords.words('english')
#import the stop words list

In [11]:
stop
#A list to know what are stop words

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [12]:
# removing stop words(word has no specific meaning eg: on, the)
df_Bidentweets['tweet'] = df_Bidentweets['tweet'].apply(
    lambda x: " ".join(x for x in x.split() if x not in stop))
df_Bidentweets['tweet'].head(10)

0    extend deep condolences loved ones peacekeeper...
1    today honor service worn uniform armed forces ...
2    going build health care system puts family fir...
3    im speaking foreign leaders im telling america...
4    come january work quickly congress dramaticall...
5    beginning january 20th vice presidentelect har...
6    wont president january 20th message today ever...
7        bottom line spare effort turn pandemic around
8    challenge us right still immense growing need ...
9    covid19 transition advisory board advise detai...
Name: tweet, dtype: object

In [13]:
import numpy as np
#removing number
df_Bidentweets['tweet'] = df_Bidentweets['tweet'].str.replace('\d+', '')
df_Bidentweets['tweet'].replace(' ', np.nan, inplace=True)

In [14]:
!pip install -U textblob 
#download textblob package
!python -m textblob.download_corpora

Requirement already up-to-date: textblob in d:\anaconda\lib\site-packages (0.15.3)
Finished.


[nltk_data] Downloading package brown to
[nltk_data]     C:\Users\pklok_gaming\AppData\Roaming\nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\pklok_gaming\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\pklok_gaming\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\pklok_gaming\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package conll2000 to
[nltk_data]     C:\Users\pklok_gaming\AppData\Roaming\nltk_data...
[nltk_data]   Package conll2000 is already up-to-date!
[nltk_data] Downloading package movie_reviews to
[nltk_data]     C:\Users\pklok_gaming\AppData\Roaming\nltk_data...
[nltk_data]   Pa

In [15]:
from textblob import Word
#converts the word into its root word
df_Bidentweets['tweet'] = df_Bidentweets['tweet'].apply(
    lambda x: " ".join([Word(word).lemmatize() for word in x.split()]))
df_Bidentweets['tweet'].head(10)

0    extend deep condolence loved one peacekeeper i...
1    today honor service worn uniform armed force u...
2    going build health care system put family firs...
3    im speaking foreign leader im telling america ...
4    come january work quickly congress dramaticall...
5    beginning january th vice presidentelect harri...
6    wont president january th message today everyo...
7        bottom line spare effort turn pandemic around
8    challenge u right still immense growing need b...
9    covid transition advisory board advise detaile...
Name: tweet, dtype: object

In [16]:
string=' '.join(df_Bidentweets['tweet'])
# join all title into a string

In [17]:
lst=string.split()
#split the word

In [18]:
srs = pd.Series(lst) 
#store the splited word vertically

In [19]:
srs.value_counts()
#word count of each 

american          10
u                 10
going             10
america            9
time               7
                  ..
endured            1
presidentelect     1
purpose            1
around             1
matter             1
Length: 296, dtype: int64

In [20]:
srs.value_counts()[:15] 
#show the top 15

american    10
u           10
going       10
america      9
time         7
nation       6
health       5
people       5
care         5
force        5
state        4
first        4
united       4
covid        4
one          4
dtype: int64

In [21]:
# common words screening 
freq_common = pd.Series(' '.join(df_Bidentweets['tweet']).split()).value_counts()[:15]
freq_common

american    10
u           10
going       10
america      9
time         7
nation       6
health       5
people       5
care         5
force        5
state        4
first        4
united       4
covid        4
one          4
dtype: int64

In [22]:
# rare words screening
freq_rare = pd.Series(' '.join(
    df_Bidentweets['tweet']).split()).value_counts()[-15:]
freq_rare

like              1
died              1
rhetoric          1
detailed          1
never             1
stood             1
cost              1
story             1
challenge         1
arizona           1
endured           1
presidentelect    1
purpose           1
around            1
matter            1
dtype: int64

# TF-IDF

In [23]:
#distinguishing Biden's tweets with others
tf2 = df_Bidentweets['tweet'].apply(lambda x: pd.value_counts(x.split(" "))).sum(
    axis=0).reset_index()

tf2.columns = ['words', 'tf']
tf2.sort_values(['tf'], ascending=False).head(15)

Unnamed: 0,words,tf
0,american,10.0
47,going,10.0
94,u,10.0
53,america,9.0
181,time,7.0
132,nation,6.0
175,people,5.0
30,force,5.0
46,care,5.0
50,health,5.0
