In [1]:
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import nltk
from nltk import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import LancasterStemmer, WordNetLemmatizer, PorterStemmer
from wordcloud import WordCloud, STOPWORDS
from textblob import TextBlob

In [2]:
# Import dataset
tweet_list = pd.read_csv('trumptweets_downloaded.csv')

# understand structure of dataset
print(tweet_list.shape)
print(tweet_list.columns)

(41122, 9)
Index(['id', 'link', 'content', 'date', 'retweets', 'favorites', 'mentions',
       'hashtags', 'geo'],
      dtype='object')


In [3]:
# removing columns which are not important for sentiment analysis
remove_columns = ['id', 'link', 'date', 'retweets', 'favorites', 'mentions', 'hashtags', 'geo']
df = pd.DataFrame(tweet_list.drop(remove_columns, axis=1, inplace=False))

In [4]:
df.shape

(41122, 1)

# Data pre-processing for textual variables

## Lowercasing

In [5]:
## Change the reviews type to string
df['content'] = df['content'].astype(str)

## Before lowercasing 
df['content'][2]

'Donald Trump reads Top Ten Financial Tips on Late Show with David Letterman: http://tinyurl.com/ooafwn - Very funny!'

In [6]:
## Lowercase all reviews
df['content'] = df['content'].apply(lambda x: " ".join(x.lower() for x in x.split()))
df['content'][2] ## to see the difference

'donald trump reads top ten financial tips on late show with david letterman: http://tinyurl.com/ooafwn - very funny!'

## Special Characters

In [7]:
# remove punctuation
df['content'] = df['content'].str.replace('(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)','')
df['content'][2]

'donald trump reads top ten financial tips on late show with david letterman   very funny'

## Stopwords

In [8]:
stop = stopwords.words('english')
df['content'] = df['content'].apply(lambda x: " ".join(x for x in x.split() if x not in stop))
df['content'][2]

'donald trump reads top ten financial tips late show david letterman funny'

## Sentiment Score

In [13]:
## Define a function which can be applied to calculate the score for the whole dataset

def senti(x):
    return TextBlob(x).sentiment  

df['senti_score'] = df['content'].apply(senti)

df.senti_score.head(6)

0     (0.13999999999999999, 0.3977777777777778)
1    (0.13636363636363635, 0.45454545454545453)
2     (0.06666666666666667, 0.3666666666666667)
3     (0.06818181818181818, 0.7272727272727273)
4                                    (0.0, 0.0)
5                                    (0.0, 0.0)
Name: senti_score, dtype: object

In [14]:
df.to_csv('sentiment.csv', index=False, sep=';')

combined_list = pd.merge(tweet_list, df[['content', 'senti_score']], on='content')

In [16]:
combined_list.head(5)

Unnamed: 0,id,link,content,date,retweets,favorites,mentions,hashtags,geo,senti_score
