In [None]:
# NOTE(review): this removes nltk, yet the import cell below imports from
# nltk.corpus and nltk.tokenize. On a fresh "Restart & Run All" those imports
# would presumably fail unless nltk is reinstalled in between — confirm whether
# this cell is still needed. pip may also wait for a confirmation prompt here
# because no `-y` flag is passed — TODO confirm.
!pip uninstall nltk

In [1]:
# import the libraries

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
import pandas as pd 
from textblob import TextBlob
import re
from collections import Counter
import string
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize 


In [2]:
# Load the tweet export (comma-separated, which is the pandas default)

data = pd.read_csv("trumptweets.csv")

In [3]:
# Preview the first rows of the loaded data
data.head()

Unnamed: 0,id,link,content,date,retweets,favorites,mentions,hashtags,geo
0,1698308935,https://twitter.com/realDonaldTrump/status/169...,Be sure to tune in and watch Donald Trump on L...,2009-05-04 20:54:25,500,868,,,
1,1701461182,https://twitter.com/realDonaldTrump/status/170...,Donald Trump will be appearing on The View tom...,2009-05-05 03:00:10,33,273,,,
2,1737479987,https://twitter.com/realDonaldTrump/status/173...,Donald Trump reads Top Ten Financial Tips on L...,2009-05-08 15:38:08,12,18,,,
3,1741160716,https://twitter.com/realDonaldTrump/status/174...,New Blog Post: Celebrity Apprentice Finale and...,2009-05-08 22:40:15,11,24,,,
4,1773561338,https://twitter.com/realDonaldTrump/status/177...,"""My persona will never be that of a wallflower...",2009-05-12 16:07:28,1399,1965,,,


In [4]:
# Count rows whose `content` repeats the text of an earlier row

data.content.duplicated().sum()

213

In [5]:
# Keep only the first occurrence of each tweet text.
# The surviving rows keep their original index labels (the index is not reset).

data = data.drop_duplicates(subset=['content'])

In [6]:
# Re-run the duplicate count to confirm no repeated `content` remains

data.content.duplicated().sum()

0

In [7]:
# (rows, columns) of the dataset after dropping duplicate tweets

data.shape

(40909, 9)

In [8]:
# Copy the tweet text into a new single-column frame with a fresh 0..n-1 index.

df = pd.DataFrame({'Tweets': data['content'].tolist()})
df.shape

(40909, 1)

In [15]:
# Attach the tweet timestamps by position, not by index label.
# `df` was built with a fresh 0..n-1 index, while `data` still carries its
# original labels (with gaps where duplicate tweets were dropped). Assigning
# the Series directly aligns on those labels, which pairs later tweets with
# the wrong date or with NaN. `to_numpy()` drops the index; pandas raises if
# the two lengths ever differ, so a mismatch cannot pass silently.
df['Date'] = data['date'].to_numpy()

In [16]:
# Check the frame's shape after adding the Date column
df.shape

(40909, 2)

In [9]:
# Check whether any rows of df are exact duplicates (difference in row count
# before and after de-duplication)

len(df)-len(df.drop_duplicates())

0

In [10]:
# Remove rows that are exact duplicates of an earlier row

df = df.drop_duplicates()

In [11]:
# Number of missing values in each column

df.isna().sum()

Tweets    0
dtype: int64

In [12]:
# Remove emoji from tweets

def remove_emoji(string):
    """Return ``string`` with emoji and pictographic symbols removed.

    The previous character class contained one range running from U+24C2 to
    U+1F251. That span also covers CJK ideographs, kana, Hangul and many other
    scripts, so ordinary non-Latin text was deleted along with the emoji. The
    class below is restricted to symbol blocks, plus the few emoji that live
    in the CJK symbol area.

    Args:
        string: Text to clean. The parameter name shadows the imported
            ``string`` module inside this function only; it is kept so that
            existing keyword callers continue to work.

    Returns:
        The text with every run of matched characters replaced by ``''``.
    """
    emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"  # dingbats
                           u"\U000024C2-\U00002BFF"  # enclosed letters, shapes, misc symbols, arrows
                           u"\U0001F000-\U0001F251"  # tiles, cards, enclosed alphanumerics/ideographs
                           u"\U00003030\U0000303D\U00003297\U00003299"  # emoji located in CJK symbol blocks
                           u"\U0000FE0F"  # emoji variation selector
                           "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', string)

In [13]:
# Clean the text

# Create a function to clean the tweets
def cleanText(text):
    """Strip Twitter-specific noise from one tweet.

    Removes, in this order: @mentions, '#' characters (the hashtag word itself
    is kept), the standalone retweet marker ``RT`` followed by whitespace,
    hyperlinks starting with ``http``, and finally emoji via ``remove_emoji``.

    Args:
        text: Raw tweet text.

    Returns:
        The cleaned text. Surrounding whitespace is not trimmed.
    """
    # Handles may contain underscores; without `_` in the class a mention such
    # as "@Mike_Pence" left "_Pence" behind.
    text= re.sub(r'@[A-Za-z0-9_]+', '', text) #Removed @mentions
    text= re.sub(r'#', '', text) #Removing the '#' symbol
    # `\b` anchors RT at a word start, so upper-case words that merely end in
    # "RT" (e.g. "SMART people") are no longer mangled.
    text= re.sub(r'\bRT[\s]+', '', text) #Removed RT
    text= re.sub(r'http\S+', '', text) #Removed all hyperlink
    text= remove_emoji(text)

    return text

In [14]:
# Replace every raw tweet with its cleaned version

df['Tweets'] = df['Tweets'].map(cleanText)
print('Done')

Done


In [17]:
# Create Subjectivity value and Polarity for each tweet

def sentiment(tweets):
    """Return the TextBlob sentiment of one tweet, or ``None`` if scoring fails.

    Args:
        tweets: Text of a single tweet.

    Returns:
        Whatever ``TextBlob(tweets).sentiment`` evaluates to; the callers in
        this notebook read index 0 as polarity and index 1 as subjectivity.
        ``None`` is returned when TextBlob raises an ordinary exception.
    """
    try:
        return TextBlob(tweets).sentiment
    except Exception:
        # Catch ordinary errors only: a bare ``except`` also swallows
        # KeyboardInterrupt/SystemExit, so a long ``.apply`` run could not be
        # interrupted from the notebook.
        return None

# Score every tweet once and reuse the result for both columns, instead of
# running TextBlob over the whole column twice. `sentiment` returns None when
# scoring fails, so fall back to NaN rather than indexing into None.

tweet_sentiment = df['Tweets'].apply(sentiment)

# Create Polarity value for each tweet (index 0 of the sentiment result)

df['Polarity']= tweet_sentiment.apply(lambda x: x[0] if x is not None else float('nan'))

# Create Subjectivity value for each tweet (index 1 of the sentiment result)

df['Subjectivity'] = tweet_sentiment.apply(lambda x: x[1] if x is not None else float('nan'))

# Test

df.head()

# Source:
# https://medium.com/@himanshu_23732/sentiment-analysis-with-textblob-6bc2eb9ec4ab

Unnamed: 0,Tweets,Date,Polarity,Subjectivity
0,Be sure to tune in and watch Donald Trump on L...,2009-05-04 20:54:25,0.175,0.497222
1,Donald Trump will be appearing on The View tom...,2009-05-05 03:00:10,0.170455,0.454545
2,Donald Trump reads Top Ten Financial Tips on L...,2009-05-08 15:38:08,0.12125,0.42
3,New Blog Post: Celebrity Apprentice Finale and...,2009-05-08 22:40:15,0.136364,0.454545
4,"""My persona will never be that of a wallflower...",2009-05-12 16:07:28,0.0,0.0


In [18]:
# Persist both frames with the default to_csv settings.
# NOTE(review): the only change made to `data` in this notebook is dropping
# duplicate tweets; the cleaned text lives in `df`. The "_cleaned" file name
# may therefore be misleading — confirm the intended contents.
data.to_csv("trumptweets_cleaned.csv")
# df holds the cleaned tweet text together with Date, Polarity and Subjectivity
df.to_csv("tweets_subjectivity_polarity_dataset.csv")

In [None]:
# Concatenate all tweets in one str variable

def to_one_str(text):
    """Concatenate every tweet in ``text`` into a single string.

    Each tweet is prefixed with one space, so a non-empty result starts with
    a space and tweets are separated by single spaces.

    Args:
        text: Iterable of tweet strings.

    Returns:
        The concatenated string, or ``''`` when ``text`` is empty.
    """
    # str.join builds the result in one pass instead of re-creating the
    # accumulated string on every iteration.
    return ''.join(' ' + tweet for tweet in text)
allTweets = to_one_str(df['Tweets'])
print("Done")

In [None]:
# Save the combined text as UTF-16, then show its first 500 characters
with open('allTweets_as_oneString.txt', mode='w', encoding='utf-16') as out_file:
    out_file.write(str(allTweets))
print(allTweets[:500])

In [None]:

# Normalise the text: convert it to lower case and strip punctuation


def modify_str(text):
    """Lower-case ``text`` and delete every character in ``string.punctuation``.

    Only the ASCII punctuation listed in ``string.punctuation`` is removed;
    other characters (for example typographic quotes) pass through unchanged.
    """
    # A translation table that maps each punctuation character to None,
    # which makes str.translate drop it.
    drop_punctuation = str.maketrans(dict.fromkeys(string.punctuation))
    return text.lower().translate(drop_punctuation)

# Replace allTweets with its normalised form, then show the first 500 characters
allTweets = modify_str(allTweets)
allTweets[:500]

In [None]:
# Drop StopWord from AllTweets 

def dropStopWord(text):
    """Tokenise ``text`` and return the tokens that are not English stop words.

    The stop-word list comes from ``stopwords.words('english')`` and the
    tokens from ``word_tokenize``; matching is exact (case-sensitive), and the
    original token order is preserved.
    """
    english_stop_words = frozenset(stopwords.words('english'))
    return [token for token in word_tokenize(text) if token not in english_stop_words]

# List of tokens from the normalised text with the stop words removed
filtered_sentence= dropStopWord(allTweets)

In [None]:
# Re-join the remaining tokens with single spaces
sentences = ' '.join(str(token) for token in filtered_sentence)

# Strip typographic quotes and " …" sequences, in the same order as before
for leftover in ('’', ' …', '‘', '”', '“'):
    sentences = sentences.replace(leftover, '')

with open('allTweets_without_stopword.txt', mode='w', encoding='utf-16') as out_file:
    out_file.write(str(sentences))
print("DONE")

In [None]:
# Count how often each remaining word occurs


def counterOfFreq(text, output_path="word_freq_df.csv"):
    """Count the whitespace-separated words in ``text``, most frequent first.

    Args:
        text: String to analyse; it is split on whitespace.
        output_path: CSV file the table is written to. Defaults to
            ``"word_freq_df.csv"``, the path that used to be hard-coded, so
            existing calls behave as before. Pass ``None`` to skip writing.

    Returns:
        DataFrame with the columns ``Word`` and ``Freq``, ordered by
        descending frequency.
    """
    # most_common() already yields (word, count) pairs in descending order,
    # so the frame can be built from it directly without an intermediate dict.
    word_freq_df = pd.DataFrame(Counter(text.split()).most_common(), columns=['Word', 'Freq'])
    if output_path is not None:
        word_freq_df.to_csv(output_path)
    return word_freq_df
# Word-frequency table for the stop-word-free text (the call also writes word_freq_df.csv)
freq_df= counterOfFreq(sentences)


In [None]:
'Done'