# <img style="float: left; padding-right: 10px; width: 45px" src="https://raw.githubusercontent.com/Harvard-IACS/2018-CS109A/master/content/styles/iacs.png"> CS109A Introduction to Data Science: 

## Final Project: Trump's Tweets

**Group Members**: Sarah Chin, Maleah Fekete, Mason Watson, Jasper Fu

In [1]:
# RUN THIS CELL FOR FORMAT
import requests
from IPython.core.display import HTML
styles = requests.get("https://raw.githubusercontent.com/Harvard-IACS/2018-CS109A/master/content/styles/cs109.css").text
HTML(styles)

import random
random.seed(112358)

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import cross_val_score
from sklearn.utils import resample
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_extraction.text import CountVectorizer

from datetime import datetime

from sklearn import tree
%matplotlib inline



from gensim import models

In [109]:
# Load the raw tweet export.
# id_str is read as text: tweet IDs are ~19-digit integers, which float64 cannot
# represent exactly, so letting pandas infer a float dtype silently corrupts them.
trump_tweets = pd.read_csv("data/trump-tweets/trump_tweet.csv", dtype={"id_str": str})

# Data Inspection and Cleaning 

In [11]:
# Preview the first rows of the raw data.
trump_tweets.head()

Unnamed: 0,source,text,created_at,retweet_count,favorite_count,is_retweet,id_str
0,Twitter for iPhone,It was my great honor to deliver the keynote a...,10-25-2019 19:51:43,5132.0,16938,False,1.187819e+18
1,Twitter for iPhone,Thank you @robertjeffress! https://t.co/o6mk8o...,10-25-2019 17:57:25,6095.0,22287,False,1.18779e+18
2,Twitter for iPhone,RT @GaryCoby: Today @realDonaldTrump is receiv...,10-25-2019 17:35:13,5810.0,0,True,1.187785e+18
3,Twitter for iPhone,Heading to South Carolina! https://t.co/CORtaP...,10-25-2019 17:30:21,8900.0,40776,False,1.187783e+18
4,Twitter for iPhone,RT @gatewaypundit: Breaking Poll: 52% Say Impe...,10-25-2019 16:22:09,7649.0,0,True,1.187766e+18


In [None]:
# Rows without tweet text carry no information for the analyses performed here,
# so remove them.
trump_tweets = trump_tweets.dropna(subset=["text"])

In [None]:
# Show, one table per field, the rows that are missing that field.
for column in ("retweet_count", "favorite_count", "created_at", "id_str"):
    display(trump_tweets[trump_tweets[column].isnull()])

# The entries with null retweet_count are the same as the entries with
# null favorite_count and null created_at.
# These are also included in the entries with null id_str.
# We will group all of these into a separate dataframe to deal with null entries.

In [None]:
# Split the data on whether the tweet id is present. Explicit copies are taken so
# that later cells can add or convert columns on these frames without writing into
# a view of trump_tweets (which triggers SettingWithCopyWarning).
missing_id = trump_tweets.id_str.isnull()

# Rows with a null id (these also contain the other null metadata fields)
trump_tweets_nulls = trump_tweets[missing_id].copy()

# Main dataframe: every row that has an id
trump_tweets_main = trump_tweets[~missing_id].copy()

# Check. These should all display an empty dataframe
for column in ("retweet_count", "favorite_count", "created_at", "id_str"):
    display(trump_tweets_main[trump_tweets_main[column].isnull()])

In [None]:
# Rows of the loaded data whose tweet id is missing.
display(trump_tweets[trump_tweets.id_str.isnull()])

Just for this EDA we will ignore the 3 rows in which all non-text entries are null and the 3 rows where the categories got mixed up. We will only use `trump_tweets_main`.

In [None]:
# List the dtype of every column in the main dataframe.
trump_tweets_main.dtypes

We notice that `favorite_count` is an object instead of a number, so we need to convert it before continuing with EDA. We will also change `created_at` to a datetime object.

In [None]:
# Shape of the is_retweet column of the main dataframe.
trump_tweets_main.is_retweet.shape

In [None]:
# Print the shape of the subset carrying each retweet label.
for retweet_label in ('true', 'false'):
    labelled_rows = trump_tweets_main[trump_tweets_main['is_retweet'] == retweet_label]
    print(labelled_rows.shape)

In [None]:
# Rows whose is_retweet value is neither 'true' nor 'false'.
has_known_label = trump_tweets_main['is_retweet'].isin(['true', 'false'])
trump_tweets_main[~has_known_label].head()

Looking at the first of these on the actual platform, it is not a retweet, but we will need to do more probabilistic analysis to say with certainty. It does not make sense to just drop these rows: they were all created in a similar time frame, so dropping them could skew results.

# EDA

In [None]:
# Work on an independent copy so the column assignments below never write into a
# view of the original dataframe (avoids SettingWithCopyWarning / lost updates).
trump_tweets_main = trump_tweets_main.copy()

# Parse timestamps (unparseable values become NaT) and make favorite_count numeric.
trump_tweets_main["created_at"] = pd.to_datetime(trump_tweets_main['created_at'], errors='coerce')
trump_tweets_main["favorite_count"] = pd.to_numeric(trump_tweets_main["favorite_count"])

# Rescale both engagement counts to the [0, 1] range.
scaler = MinMaxScaler()
trump_tweets_main[['favorite_count', 'retweet_count']] = scaler.fit_transform(trump_tweets_main[['favorite_count', 'retweet_count']])

# Log of the favorite/retweet ratio. After min-max scaling the smallest value of each
# column is exactly 0, so the ratio can be 0, x/0 or 0/0; those rows would otherwise
# become -inf / inf / NaN with runtime warnings. Mark them all as missing instead.
with np.errstate(divide="ignore", invalid="ignore"):
    log_ratio = np.log(trump_tweets_main["favorite_count"] / trump_tweets_main["retweet_count"])
trump_tweets_main["fbyr"] = log_ratio.replace([np.inf, -np.inf], np.nan)

trump_tweets_main.dtypes

In [None]:
# Separate retweets from original tweets using the is_retweet label.
retweet_label = trump_tweets_main.is_retweet
trump_retweets = trump_tweets_main[retweet_label == 'true']
trump_not_retweets = trump_tweets_main[retweet_label == 'false']

In [None]:
# Summary statistics for the rows labelled as retweets.
trump_retweets.describe()

In [None]:
# Summary statistics for the rows labelled as not retweets.
trump_not_retweets.describe()

We create these for use in future analyses

In [None]:
# One scatter plot per engagement metric, each against the tweet timestamp.
# Each entry is (column to plot, y-axis label, figure title).
time_series_plots = [
    ("retweet_count", "retweet count", "retweet count over time"),
    ("favorite_count", "favorite count", "favorite count over time"),
    ("fbyr", "favorite/retweet ratio (log)", "favorite/retweet ratio (log) count over time"),
]

for metric_column, y_label, plot_title in time_series_plots:
    plt.scatter(trump_tweets_main['created_at'], trump_tweets_main[metric_column])
    plt.xlabel("date of tweet")
    plt.ylabel(y_label)
    plt.title(plot_title)
    plt.show()

# Feature Engineering 

## Extraction of Usable Features from Tweet Data

In [13]:
# Load word vectors stored in binary word2vec format from a local model file.
# NOTE(review): the file name suggests the pretrained 300-dimensional GoogleNews
# vectors; the file must already exist under models/ — confirm provenance.
w = models.KeyedVectors.load_word2vec_format('models/GoogleNews-vectors-negative300.bin', binary=True)

In [107]:
# Timestamps of the first 15000 rows labelled as not retweets.
not_retweet = trump_tweets['is_retweet'] == 'false'
trump_tweets.loc[not_retweet, "created_at"].iloc[:15000]

0        10-25-2019 19:51:43
1        10-25-2019 17:57:25
3        10-25-2019 17:30:21
5        10-25-2019 12:54:24
6        10-25-2019 12:32:06
                ...         
17814    08-21-2015 20:01:32
17815    08-21-2015 16:35:21
17816    08-21-2015 15:35:18
17817    08-21-2015 15:25:01
17818    08-21-2015 14:32:43
Name: created_at, Length: 15000, dtype: object

In [137]:
# Words removed before embedding: a standard English stop-word list plus a few
# tokens that come from URLs or boilerplate in the tweets.
stop_word_list = ["i", "me", "my", "myself", "we", "our", "ours", "ourselves", "you", "your", "yours", "yourself", "yourselves", "he", "him", "his", "himself", "she", "her", "hers", "herself", "it", "its", "itself", "they", "them", "their", "theirs", "themselves", "what", "which", "who", "whom", "this", "that", "these", "those", "am", "is", "are", "was", "were", "be", "been", "being", "have", "has", "had", "having", "do", "does", "did", "doing", "a", "an", "the", "and", "but", "if", "or", "because", "as", "until", "while", "of", "at", "by", "for", "with", "about", "against", "between", "into", "through", "during", "before", "after", "above", "below", "to", "from", "up", "down", "in", "out", "on", "off", "over", "under", "again", "further", "then", "once", "here", "there", "when", "where", "why", "how", "all", "any", "both", "each", "few", "more", "most", "other", "some", "such", "only", "own", "same", "so", "than", "too", "very", "s", "t", "can", "will", "just", "don", "should", "now"]
other_excluders = ["https", "thank", "co"]

# Keep only the first 15000 rows labelled as not retweets.
# NOTE: this rebinds trump_tweets, so cells that need the full data must run before this one.
trump_tweets = trump_tweets[trump_tweets['is_retweet'] == 'false'].iloc[:15000]

# Tokenize tweets and filter stop words using stop word list
vectorizer = CountVectorizer(stop_words=stop_word_list + other_excluders)
text_preprocessor = vectorizer.build_analyzer()
tweet_text = trump_tweets['text'].dropna()

# One row per tweet that actually has text, one column per embedding dimension.
# Sizing from the data keeps the matrix aligned with tweet_text even if dropna()
# removes rows or a different embedding model is loaded.
tweet_vectors = np.zeros((tweet_text.shape[0], w.vector_size))

for i, tweet in enumerate(tweet_text):
    word_vectors = []

    for token in text_preprocessor(tweet):
        try:
            word_vectors.append(w.get_vector(token))
        except KeyError:
            # Token is not in the embedding vocabulary; skip it. Only KeyError is
            # caught so that interrupts and genuine errors still surface.
            continue

    # Represent the tweet as the mean of its word vectors; tweets with no known
    # words keep the all-zero row.
    if word_vectors:
        tweet_vectors[i] = np.vstack(word_vectors).mean(axis=0)

In [144]:
# Display the series of tweet texts.
tweet_text

0        It was my great honor to deliver the keynote a...
1        Thank you @robertjeffress! https://t.co/o6mk8o...
3        Heading to South Carolina! https://t.co/CORtaP...
5        “Donald J. Trump is an absolutely historic Pre...
6        Turkey fully understands not to fire on the Ku...
                               ...                        
17814    Leaving for Mobile Alabama right now - can't b...
17815    Boston incident is terrible. We need energy an...
17816    @AmyMek Every Time I see @realDonaldTrump addr...
17817                       Great! https://t.co/oJ6sqHB3MA
17818    We are going to have a wild time in Alabama to...
Name: text, Length: 15000, dtype: object

In [138]:
from sklearn.cluster import KMeans 
from sklearn import metrics 
from scipy.spatial.distance import cdist

# Elbow analysis: fit K-means for a range of cluster counts and record the inertia
# (within-cluster sum of squared distances) of each fit.
K = range(1, 50, 5)
inertias = []

for k in K:
    # random_state makes the centroid initialisation reproducible; random.seed()
    # only seeds Python's stdlib RNG, which scikit-learn does not use.
    kmeanModel = KMeans(n_clusters=k, random_state=112358).fit(tweet_vectors)
    inertias.append(kmeanModel.inertia_)

plt.plot(K, inertias)
plt.xlabel("number of clusters (k)")
plt.ylabel("inertia")
plt.title("K-means inertia by number of clusters")
plt.show()

KeyboardInterrupt: 

In [139]:
# Fit the final 10-cluster model on the tweet embeddings. A fixed random_state keeps
# the cluster assignments reproducible across kernel restarts.
kmeanModel = KMeans(n_clusters=10, random_state=112358).fit(tweet_vectors)

In [143]:
# Query the loaded word vectors for the 10 entries most similar to 'Russia'.
w.most_similar(positive=['Russia'], topn=10)

[('Ukraine', 0.7918287515640259),
 ('Moscow', 0.7575764656066895),
 ('Russian', 0.746496319770813),
 ('Belarus', 0.7303562760353088),
 ('Kremlin', 0.7048990726470947),
 ('Kazakhstan', 0.6979326009750366),
 ('Russians', 0.677611231803894),
 ('Biologist_Anatoly_Kochnev', 0.6745500564575195),
 ('Azerbaijan', 0.6726992726325989),
 ('Putin', 0.6636874675750732)]

# Look up a single tweet by index label.
# NOTE(review): the tweet_text output shown earlier has index labels ending at 17818,
# so label 20020 may not exist in the current tweet_text — confirm this cell is not stale.
tweet_text[20020]