# Import & Review Data

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn import metrics
import seaborn as sns


plt.style.use('fivethirtyeight')
%matplotlib inline

In [2]:
# Location of the tweet archive export, relative to the notebook.
file = "./data/trumptweets_all.csv"


In [3]:
# Read the archive with 'created_at' as the row index (latin1-encoded file).
trumptweets = pd.read_csv(
    file,
    index_col='created_at',
    encoding='latin1',
)

In [4]:
# Parse the 'created_at' index values into datetimes so rows can be filtered
# and binned by time.
trumptweets.index = pd.to_datetime(trumptweets.index)

In [5]:
# Restrict the analysis to tweets posted after the 2016 Republican National
# Convention opened (midnight on 7/18/2016).
convention_start = '2016-7-18 00:00:00'
after_convention = trumptweets.index > convention_start
trumptweets = trumptweets.loc[after_convention, :]

In [6]:
# (rows, columns) of the filtered frame.
trumptweets.shape

(4675, 6)

# Prepare Data for Analysis

In [7]:
# Add a column holding the hour of day (taken from the datetime index) at
# which each tweet was posted.
hour_of_day = trumptweets.index.hour
trumptweets.loc[:, 'tweet_hour'] = hour_of_day

In [9]:
# How many tweets came from each client application.
trumptweets['source'].value_counts()

Twitter for iPhone     3341
Twitter for Android     861
Twitter Web Client      195
Media Studio            130
Twitter Ads              94
Twitter for iPad         51
TweetDeck                 2
Periscope                 1
Name: source, dtype: int64

In [10]:
# Encode the `source` column as integer codes (one code per client, 8 in all).
# The codes are expanded into dummy variables in a later cell.
# Series.map leaves NaN for any source that is not listed here.
source_codes = {
    'Twitter for iPhone': 0,
    'Twitter for Android': 1,
    'Twitter Web Client': 2,
    'Media Studio': 3,
    'Twitter Ads': 4,
    'Twitter for iPad': 5,
    'TweetDeck': 6,
    'Periscope': 7,
}
trumptweets.loc[:, 'source_num'] = trumptweets.loc[:, 'source'].map(source_codes)

In [11]:
# Preview the first rows, including the new tweet_hour and source_num columns.
trumptweets.head()

Unnamed: 0_level_0,source,text,retweet_count,favorite_count,is_retweet,id_str,tweet_hour,source_num
created_at,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2016-12-30 22:18:00,Twitter for Android,Russians are playing @CNN and @NBCNews for suc...,23213,84254,False,814958820980039681,22,1
2016-12-30 19:46:00,Twitter for iPhone,Join @AmerIcan32 founded by Hall of Fame legen...,7366,25336,False,814920722208296960,19,0
2016-12-30 19:41:00,Twitter for Android,Great move on delay (by V. Putin) - I always k...,34415,97669,False,814919370711461890,19,1
2016-12-29 14:54:00,Twitter for iPhone,My Administration will follow two simple rules...,11330,45609,False,814484710025994241,14,0
2016-12-28 22:06:00,Twitter for iPhone,'Economists say Trump delivered hope' https://...,13919,51857,False,814231064847728640,22,0


In [12]:
# One-hot encode source_num, then drop the code-0 column (Twitter for iPhone)
# so that iPhone acts as the reference category.
source_dummies = pd.get_dummies(trumptweets, columns=['source_num'])
dummy_trumptweets = source_dummies.drop(columns='source_num_0')

In [None]:
# Legend for the integer codes stored in source_num (and therefore for the
# source_num_<code> dummy columns).  Written as a complete dict so the cell is
# valid Python and displays the legend when run; a bare run of `'key': value,`
# lines is a SyntaxError.
source_num_legend = {
    'Twitter for iPhone': 0,
    'Twitter for Android': 1,
    'Twitter Web Client': 2,
    'Media Studio': 3,
    'Twitter Ads': 4,
    'Twitter for iPad': 5,
    'TweetDeck': 6,
    'Periscope': 7,
}
source_num_legend

In [14]:
# Replace the generated source_num_<code> column names with readable ones.
# Code 0 (iPhone) has no entry because its dummy column was dropped.
readable_source_names = {
    'source_num_1': 'src_android',
    'source_num_2': 'src_webclient',
    'source_num_3': 'src_mediastudio',
    'source_num_4': 'src_twitterads',
    'source_num_5': 'src_ipad',
    'source_num_6': 'src_tweetdeck',
    'source_num_7': 'src_periscope',
}
dummy_trumptweets.rename(columns=readable_source_names, inplace=True)

In [16]:
# Target variable y: engagement = retweet_count + favorite_count per tweet.
retweets = dummy_trumptweets.loc[:, 'retweet_count']
favorites = dummy_trumptweets.loc[:, 'favorite_count']
dummy_trumptweets.loc[:, 'engagement'] = retweets + favorites

In [17]:
# Bin each tweet into a time-of-day dummy variable based on US Eastern local time.
# Morning (5am-12pm Eastern) is the default / reference category and gets no column.
#
# The timestamp index is in GMT (UTC).  Subtracting a fixed 5 hours is only
# correct while Eastern *Standard* Time is in effect; during daylight saving
# time the offset is 4 hours.  Converting through the America/New_York time
# zone applies the right offset for every date.
tweet_times = dummy_trumptweets.index
if tweet_times.tz is None:
    # Naive timestamps are taken to be UTC, as stated above.
    tweet_times = tweet_times.tz_localize('UTC')
eastern_hour = tweet_times.tz_convert('America/New_York').hour

# Afternoon: 12pm-6pm Eastern
dummy_trumptweets.loc[:, 'is_afternoon'] = ((eastern_hour >= 12) & (eastern_hour < 18)).astype(int)

# Evening: 6pm-12am Eastern
dummy_trumptweets.loc[:, 'is_evening'] = (eastern_hour >= 18).astype(int)

# Late night: 12am-5am Eastern
dummy_trumptweets.loc[:, 'is_latenight'] = (eastern_hour < 5).astype(int)


In [18]:
# Preparation for modelling log(y) instead of y.
# Caution: a log-transformed target changes how the coefficients are read.
#
# log(0) is not finite, so tweets with zero engagement are set to 1 first.
# For low-engagement tweets the difference between 0 and 1 is treated as
# insignificant.
no_engagement = dummy_trumptweets['engagement'].eq(0)
dummy_trumptweets.loc[no_engagement, 'engagement'] = 1

In [19]:
# Store the natural log of engagement in its own column.
engagement_values = dummy_trumptweets.loc[:, 'engagement']
dummy_trumptweets.loc[:, 'log_engagement'] = np.log(engagement_values)

# Natural Language Processing

In [23]:
from textblob import TextBlob, Word
from nltk.stem.snowball import SnowballStemmer

ModuleNotFoundError: No module named 'textblob'

In [None]:
# Wrap one sample tweet (the row at position 100) in a TextBlob for the NLP demos.
# The frame is indexed by timestamp, so a plain `[100]` on the text Series
# relies on pandas' deprecated integer-position fallback for non-integer
# indexes; `.iloc[100]` is the explicit positional lookup.
tweet = TextBlob(dummy_trumptweets.loc[:, 'text'].iloc[100])

In [None]:
# Convert the sample tweet to all lower case

tweet.lower()

In [None]:
# Snowball stemmer configured for English.
stemmer = SnowballStemmer('english')

In [None]:
# Stem every lower-cased word of the sample tweet (strips endings such as
# 's', 'es', 'ing').
lowercase_words = tweet.words.lower()
[stemmer.stem(word) for word in lowercase_words]

In [None]:
# Lemmatize each lower-cased word - a more advanced way of deriving the root
# than stemming.
[word.lemmatize() for word in tweet.words.lower()]

In [None]:
# Lemmatize again, this time treating every word as a verb (pos='v').
[word.lemmatize(pos='v') for word in tweet.words.lower()]

# Sentiment Analysis

In [None]:
def detect_sentiment(text):
    """Return the TextBlob sentiment polarity of ``text``."""
    blob = TextBlob(text)
    return blob.sentiment.polarity

In [None]:
# Score each tweet's text and keep the polarity in a new 'sentiment' column.
tweet_text = dummy_trumptweets.loc[:, 'text']
dummy_trumptweets.loc[:, 'sentiment'] = tweet_text.apply(detect_sentiment)


In [None]:
# Bin sentiment into dummy variables

In [None]:
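# Left disabled: no dummy is created for highly positive sentiment, so it
# presumably serves as the reference category for the sentiment bins (confirm).
# dummy_trumptweets.loc[:, 'high_positive_sentiment'] = np.where(dummy_trumptweets.loc[:, 'sentiment'] > 0.5, 1, 0)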

In [None]:
# 1 when 0 <= sentiment < 0.5, otherwise 0.
polarity = dummy_trumptweets.loc[:, 'sentiment']
is_moderately_positive = (polarity >= 0) & (polarity < .5)
dummy_trumptweets.loc[:, 'moderately_positive_sentiment'] = np.where(is_moderately_positive, 1, 0)

In [None]:
# 1 when -0.5 <= sentiment < 0, otherwise 0.
polarity = dummy_trumptweets.loc[:, 'sentiment']
is_moderately_negative = (polarity >= -.5) & (polarity < 0)
dummy_trumptweets.loc[:, 'moderately_negative_sentiment'] = np.where(is_moderately_negative, 1, 0)

In [None]:
# 1 when -1 <= sentiment < -0.5, otherwise 0.
polarity = dummy_trumptweets.loc[:, 'sentiment']
is_very_negative = (polarity >= -1) & (polarity < -.5)
dummy_trumptweets.loc[:, 'very_negative_sentiment'] = np.where(is_very_negative, 1, 0)

In [None]:
# Testing polynomial features on sentiment: expand the single sentiment column
# into its degree-1..3 polynomial terms (no bias/constant column).

from sklearn.preprocessing import PolynomialFeatures

sentiment_frame = dummy_trumptweets.loc[:, ['sentiment']]

pf = PolynomialFeatures(degree=3, include_bias=False)
X = pf.fit_transform(sentiment_frame)

In [None]:
# Inspect the expanded polynomial feature matrix.
X

In [None]:
# Fit a linear regression of log(engagement) on the polynomial sentiment terms.
# The target is defined here because no earlier cell creates `y`; without this
# line the cell raises NameError when the notebook is run top to bottom on a
# fresh kernel.
y = dummy_trumptweets.loc[:, 'log_engagement']

lr = LinearRegression()
lr.fit(X, y)

# Fitted values for the same rows, used to draw the fitted curve.
y_fit = lr.predict(X)


In [None]:
# Observed log(engagement) against sentiment, with the fitted polynomial on top.
# The observed target is read straight from the frame so the cell does not
# need a `y` variable created in a later cell.
sentiment_values = dummy_trumptweets.loc[:, 'sentiment'].values
observed_log_engagement = dummy_trumptweets.loc[:, 'log_engagement'].values

# plt.plot joins points in the order given; sorting by sentiment draws the
# fitted curve left to right rather than zig-zagging between rows in file order.
order = np.argsort(sentiment_values)

plt.scatter(sentiment_values, observed_log_engagement)
plt.plot(sentiment_values[order], np.asarray(y_fit)[order], color='r')
plt.xlabel('sentiment')
plt.ylabel('log_engagement');

# Analysis and Visualizations

In [None]:
# Tweet counts per client application in the modelling frame.
dummy_trumptweets['source'].value_counts()

In [None]:
# Summary statistics for every column; include='all' also covers the
# non-numeric columns.

dummy_trumptweets.describe(include='all')

In [None]:
# Box plot of log(engagement).
dummy_trumptweets['log_engagement'].plot.box()

In [None]:
# Distribution of log(engagement), 100 bins.
log_engagement_values = dummy_trumptweets['log_engagement']
log_engagement_values.hist(bins=100)

In [None]:
# Distribution of tweets over tweet_hour, 24 bins.
tweet_hours = dummy_trumptweets['tweet_hour']
tweet_hours.hist(bins=24)

In [None]:
# List the available columns before choosing model features.
dummy_trumptweets.columns

In [None]:



# Full candidate feature list: retweet flag, source dummies, time-of-day
# dummies and sentiment bins.
# The names match the columns this notebook actually creates (src_* names come
# from the rename cell).  Selecting a column that does not exist raises
# KeyError, and there is no src_iphone column because the iPhone dummy was
# dropped as the reference category.
feature_cols = ['is_retweet',
                'src_android',
                'src_webclient',
                'src_mediastudio',
                'src_twitterads',
                'src_ipad',
                'src_tweetdeck',
                'src_periscope',
                'is_afternoon',
                'is_evening',
                'is_latenight',
                'moderately_positive_sentiment',
                'moderately_negative_sentiment',
                'very_negative_sentiment']

In [None]:
# Reduced feature set: retweet flag, time-of-day dummies and sentiment bins
# (no source dummies).
time_of_day_features = ['is_afternoon', 'is_evening', 'is_latenight']
sentiment_features = ['moderately_positive_sentiment',
                      'moderately_negative_sentiment',
                      'very_negative_sentiment']
feature_cols = ['is_retweet'] + time_of_day_features + sentiment_features

In [None]:
# Regression plot of every feature against log(engagement): tiny,
# semi-transparent points with a red fitted line.
scatter_style = {'s': 1, 'alpha': .3}
line_style = {'color': 'red'}

sns.pairplot(
    dummy_trumptweets,
    x_vars=feature_cols,
    y_vars='log_engagement',
    kind='reg',
    plot_kws={'scatter_kws': scatter_style, 'line_kws': line_style},
);

In [None]:
# Correlation matrix shown as a heat map on a fixed -1..1 colour scale.
# Only numeric and boolean columns can be correlated.  The frame also holds
# text columns ('source', 'text'), and recent pandas versions raise on those
# instead of silently skipping them, so the usable columns are selected
# explicitly.
numeric_columns = dummy_trumptweets.select_dtypes(include=['number', 'bool'])

sns.heatmap(numeric_columns.corr(),
            vmin=-1,
            vmax=1,
            cmap=sns.diverging_palette(220, 10, n=21),
            );

In [None]:
# Review scatter plots: one figure per feature, plotted against
# log(engagement), with the feature name printed above each figure.

y = dummy_trumptweets.loc[:, 'log_engagement']

for feature in feature_cols:
    print(feature)
    plt.scatter(dummy_trumptweets.loc[:, feature], y)
    plt.show()

# Linear Regression

In [None]:
# Design matrix (selected features) and target (log engagement) for the model.
X = dummy_trumptweets[feature_cols]
y = dummy_trumptweets['log_engagement']


In [None]:
# Split into train and test sets.
# random_state=1 fixes the shuffle so the split is reproducible; the split
# proportions are left at scikit-learn's defaults.

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [None]:
# Fit the linear regression on the training set only, keeping the test set
# unseen for evaluation.

lr = LinearRegression()
lr.fit(X_train, y_train)

In [None]:
# Predict log(engagement) for the held-out test features.

y_pred = lr.predict(X_test)

In [None]:
# Compare predicted results against the test set using mean squared error
# (in log-engagement units).

# Disabled: the same metric computed on the training set, for comparison.
#print(metrics.mean_squared_error(y_train, lr.predict(X_train)))

print(metrics.mean_squared_error(y_test, y_pred))

In [None]:
# Benchmark ("null model"): predict the training-set mean of y for every test row.
train_mean = y_train.mean()
y_pred_null = np.full(y_test.shape, train_mean)

# MSE of the constant prediction against y_test - the score the model must beat.
metrics.mean_squared_error(y_test, y_pred_null)

In [None]:
# Fitted intercept, then one coefficient per entry of feature_cols (same order).
# The target is log(engagement), so the coefficients are on the log scale.
print(lr.intercept_)
print(lr.coef_)