In [1]:
import pandas as pd 
import numpy as np
import string, re
import nltk
from sklearn.cross_validation import train_test_split
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.cluster import KMeans
from time import time

%matplotlib inline

In [48]:
data = pd.read_csv('Data/tweets.txt', sep = ';~;', engine='python')
emoji_list = pd.read_csv('Data/emoji_table.txt', encoding='utf-8', index_col=0).index.values
SentimentEmoji = pd.read_csv('Data/Emoji_classification.csv', encoding='utf-8').dropna()
SentimentHashtags = pd.read_csv('Data/hashtags.csv', encoding='utf-8').dropna()
data.head(5)

Unnamed: 0,username,date,retweets,favorites,text,geo,mentions,hashtags,id,permalink
0,mitchellvii,2016-10-01 15:51,878,1216,"""Hillary attacked Trump for (allegedly) callin...",,,,"""782352194473459713""",https://twitter.com/mitchellvii/status/7823521...
1,noahsnab,2016-10-01 06:33,10938,20233,"""me: i hate michigan :(( detroit: polls zero p...",,,,"""782211911723151360""",https://twitter.com/noahsnab/status/7822119117...
2,dt_ads,2016-10-01 12:20,29,31,"""If you feel US workers should have jobs befor...",,,#AmericaFirst #dtmag,"""782299159571431424""",https://twitter.com/dt_ads/status/782299159571...
3,MikePenceVP,2016-10-01 13:14,471,415,"""Hillary called Trump Supporters ""deplorable"" ...",,,#BasementDwellers,"""782312789968814080""",https://twitter.com/MikePenceVP/status/7823127...
4,JuddLegum,2016-10-01 08:18,641,1603,"""16. Trump claimed Google was involved in a co...",,,,"""782238361046036480""",https://twitter.com/JuddLegum/status/782238361...


In [49]:
# List of positive and negative tweets
sad = [':‑(', ':(', ':‑c', ':c', ':‑<', ':<', ':‑[' ,':[', ':-||', '>:[', ':{', ':@', '>:(']
Positive = [':‑)',':)', ':-]', ':]',':-3', ':3', ':->', ':>' ,'8-)', '8)',':-}', ':}', ':o)', ':c)', ':^)' ,'=]', '=)'
           ,':‑D', ':D', '8‑D', '8D', 'x‑D', 'xD', 'X‑D', 'XD', '=D', '=3', 'B^D']
SentimentHashtags['HashtagSentiment'] = SentimentHashtags['HashtagSentiment'].map({'Positive':1, 'Negative':0})
SentimentEmoji['Sentiment'] = SentimentEmoji['Sentiment'].map({'Positive':1, 'Negative':-1, 'Neutral':0}).dropna()
SentimentHashtags['Directed'] = SentimentHashtags['Directed'].map({'T':1, 'H':0})

In [50]:
stop_list = nltk.corpus.stopwords.words('english')
lemmatizer = nltk.stem.WordNetLemmatizer()
# rt - stands for retweet
stop_list = stop_list +["rt", 'url']

# regex for capturing tweets
reg = '(\:\w+\:|\<[\/\\]?3|[\(\)\\\D|\*\$][\-\^]?[\:\;\=]|[\:\;\=B8][\-\^]?[3DOPp\@\$\*\\\)\(\/\|])(?=\s|[\!\.\?]|$)'
emoticons = "|".join(map(re.escape, sad + Positive))

emoji_pattern = re.compile(u'('
    u'\ud83c[\udf00-\udfff]|'
    u'\ud83d[\udc00-\ude4f\ude80-\udeff]|'
    u'[\u2600-\u26FF\u2700-\u27BF])+', 
    re.UNICODE)
classifier =[]
# URL_Pat = re.compile(ur'(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:\'".,<>?\xab\xbb\u201c\u201d\u2018\u2019]))')
def preprocess(tweet):
    # only processing if the the value is a string
    if type(tweet)!=type(2.0):
        tweet = tweet.decode('latin-1').encode("utf-8").decode('utf-8').strip()
        tweet = tweet.lower()
        # Removing hashtags
        tweet = " ".join(tweet.split('#'))
        # Removing URLs
        tweet = re.sub('((www\.[^\s]+)|(https://[^\s]+))','',tweet)
        tweet = re.sub('((pic\.[^\s]+)|(https://[^\s]+))','',tweet)
        tweet = re.sub(u'[a-zA-Z0-9./]+\.[a-zA-Z0-9./ ]+.*$','',tweet)
        tweet = re.sub("(http\S+)|(https\S+)", '', tweet)
        # Removing User mentions
        tweet = re.sub('@[^\s]+','',tweet)
        tweet = tweet.strip('\'"')
        # Removing stop words - This can be moved to count vectorization
        tweet  = " ".join([word for word in tweet.split(" ") if word not in stop_list])
        # lemmatizing words 
        tweet = " ".join([lemmatizer.lemmatize(word) for word in tweet.split(" ")])
    else:
        tweet=''
    return tweet

def extractEmoticons(tweet):
    # emoji = emoji_pattern.findall(tweet)
    emoji = []
    for emo in emoji_list:
        if emo in tweet:
            emoji.append(emo)
    
    # these are :) :-) and other stuff
    emoticons = re.findall(reg, tweet)
    return " , ".join(emoji + emoticons)
def removeEmoticons(tweet):
    return re.sub(reg,'',tweet)

# data = data.dropna()
data['processed_text'] = data.text.apply(preprocess)

#getting the emoticons from the cleaned data
data['emoticons'] = data['processed_text'].apply(extractEmoticons)

# Removing emoticons from the text data
data['processed_text'] = data['processed_text'].apply(removeEmoticons)

In [75]:
vectorizer = TfidfVectorizer(stop_words=stop_list)
X = vectorizer.fit_transform(data.processed_text)

0        hillary attacked trump (allegedly) calling wom...
1        me: hate michigan :(( detroit: poll zero perce...
2        feel u worker job foreigners, vote 4 donald tr...
3        hillary called trump supporter "deplorable" be...
4        16. trump claimed google involved conspiracy s...
5        social experiment: go donald trump 's page see...
6        clinton's remark young voter fuel new trump pi...
7        trump get 15,000 hillary 's 800 town, matters....
8        daughterâs heartfelt letter republican dadâ...
9        mom fool , gotta runny nose n i'm sniffling n ...
10       don't want donald trump win time don't want hi...
11       donald trump spent $733,100 "two america immig...
12       one's donald trump !  twitter.com/swear_trek/s...
13       jesse watters usa editorial board endorsement,...
14       raised classy??-->>wasserman schultz: donald t...
15       hillary : " trump pathetic! taking father's in...
16       lying trump say hillary clinton 'gave up' one-.

In [9]:
km = KMeans(n_clusters=3, init='k-means++', max_iter=100, n_init=1,
                verbose=True)

print("Clustering sparse data with %s" % km)
t0 = time()
km.fit(X)
print("done in %0.3fs" % (time() - t0))
print()

order_centroids = km.cluster_centers_.argsort()[:, ::-1]
terms = vectorizer.get_feature_names()
for i in range(3):
    print("Cluster %d:" % i)
    for ind in order_centroids[i, -10:]:
        print(' %s' % terms[ind])
    print()

Clustering sparse data with KMeans(copy_x=True, init='k-means++', max_iter=100, n_clusters=3, n_init=1,
    n_jobs=1, precompute_distances='auto', random_state=None, tol=0.0001,
    verbose=True)
Initialization complete
Iteration  0, inertia 26488.945
Iteration  1, inertia 13590.235
Iteration  2, inertia 13573.981
Iteration  3, inertia 13558.029
Iteration  4, inertia 13547.756
Iteration  5, inertia 13539.660
Iteration  6, inertia 13538.935
Iteration  7, inertia 13538.329
Iteration  8, inertia 13538.170
Iteration  9, inertia 13538.093
Iteration 10, inertia 13538.071
Iteration 11, inertia 13538.060
Iteration 12, inertia 13538.049
Iteration 13, inertia 13538.031
Iteration 14, inertia 13538.024
Iteration 15, inertia 13538.015
Iteration 16, inertia 13537.991
Iteration 17, inertia 13537.987
Iteration 18, inertia 13537.980
Iteration 19, inertia 13537.969
Iteration 20, inertia 13537.963
Iteration 21, inertia 13537.953
Iteration 22, inertia 13537.946
Iteration 23, inertia 13537.920
Iteration 24

In [83]:
HillaryTweets = data[data['processed_text'].str.contains('hillary', case = False)]
DonaldTweets = data[data['processed_text'].str.contains('trump', case = False)]

datasets = [HillaryTweets.copy(), DonaldTweets.copy()]
TrainSets = []
for i, dataset in enumerate(datasets):
    hashtags = datasets[i]['hashtags'].copy().str.split(' ').apply(pd.Series, 1).stack()
    hashtags.index = hashtags.index.droplevel(-1)
    datasets[i].drop('hashtags', axis=1, inplace=True)
    hashtags.name = 'hashtags'
    
    datasets[i] = datasets[i].join(hashtags.str.strip())
    
    emoticons = datasets[i]['emoticons'].copy().str.split(' ').apply(pd.Series, 1).stack()
    emoticons.index = emoticons.index.droplevel(-1)
    datasets[i].drop('emoticons', axis=1, inplace=True)
    emoticons.name = 'emoticons'
    datasets[i] = datasets[i].join(emoticons.str.strip())
    
    datasets[i] = pd.merge(datasets[i], SentimentHashtags[SentimentHashtags['Directed'] == i], on = 'hashtags', how='outer')
    datasets[i] = pd.merge(datasets[i], SentimentEmoji, on = 'emoticons', how='outer')
    datasets[i]['Sentiment'] = datasets[i]['HashtagSentiment'].add(datasets[i]['Sentiment'], fill_value = 0)
    TrainSets.append(datasets[i][['username', 'date', 'processed_text', 'Sentiment']].dropna().groupby(['processed_text', 'Sentiment']).max().reset_index())

In [91]:
datasets[0][datasets[0].Sentiment.isnull()]

Unnamed: 0,username,date,retweets,favorites,text,geo,mentions,id,permalink,processed_text,hashtags,emoticons,Count,HashtagSentiment,Directed,Sentiment
0,mitchellvii,2016-10-01 15:51,878.0,1216.0,"""Hillary attacked Trump for (allegedly) callin...",,,"""782352194473459713""",https://twitter.com/mitchellvii/status/7823521...,hillary attacked trump (allegedly) calling wom...,,,,,,
1,JuddLegum,2016-10-01 08:18,641.0,1603.0,"""16. Trump claimed Google was involved in a co...",,,"""782238361046036480""",https://twitter.com/JuddLegum/status/782238361...,16. trump claimed google involved conspiracy s...,,,,,,
2,mykal57,2016-10-01 16:59,0.0,0.0,"""SOCIAL EXPERIMENT: Go to Donald Trump 's page...",,,"""782369482610122752""",https://twitter.com/mykal57/status/78236948261...,social experiment: go donald trump 's page see...,,,,,,
3,monteromo08,2016-10-01 16:59,0.0,0.0,"""Clinton's remarks on young voters fuel new Tr...",,,"""782369463198887936""",https://twitter.com/monteromo08/status/7823694...,clinton's remark young voter fuel new trump pi...,,,,,,
4,asamjulian,2016-10-01 16:36,609.0,842.0,"""If Trump gets 15,000 to Hillary 's 800 in the...",,,"""782363604200849408""",https://twitter.com/asamjulian/status/78236360...,"trump get 15,000 hillary 's 800 town, matters....",,,,,,
5,JustNana620,2016-10-01 16:59,3.0,3.0,"""A daughter’s heartfelt letter on her Republic...",,,"""782369461810724864""",https://twitter.com/JustNana620/status/7823694...,daughterâs heartfelt letter republican dadâ...,,,,,,
6,halsteadg048,2016-10-01 16:59,17.0,16.0,"""Jesse Watters on USA Editorial Board Endorsem...",,,"""782369397419671552""",https://twitter.com/halsteadg048/status/782369...,"jesse watters usa editorial board endorsement,...",,,,,,
7,Tom_Francois,2016-10-01 16:59,10.0,8.0,"""Hillary : "" Trump is pathetic! Taking his fat...",,,"""782369393829224448""",https://twitter.com/Tom_Francois/status/782369...,"hillary : "" trump pathetic! taking father's in...",,,,,,
8,alicat505,2016-10-01 16:59,1.0,0.0,"""Trump , Again, Tries to Lie About Hillary 's ...",,,"""782369274790752256""",https://twitter.com/alicat505/status/782369274...,"trump , again, try lie hillary 's policies/bel...",,,,,,
9,ChelRBR,2016-10-01 16:58,6.0,20.0,"""@Always_ Trump The number of Hillary shills t...",,@Always_,"""782369225922994177""",https://twitter.com/ChelRBR/status/78236922592...,trump number hillary shill willing commit fra...,,,,,,


In [98]:
datasets[0][datasets[0].Sentiment.isnull()].sample(100).to_csv('Hillary.csv', encoding = 'utf-8');
datasets[1][datasets[1].Sentiment.isnull()].sample(100).to_csv('Trump.csv', encoding = 'utf-8');

In [51]:
data['processed_text'].to_csv('tweetH.csv', encoding = 'utf-8');

In [47]:
reg = u'[a-zA-Z0-9./]+\.[a-zA-Z0-9./ ]+.*$'
re.sub(reg,'','hillary court  blacklivesmatter  trump receives police union endorsement battleground ohio  breitbart.com/2016-president ial-race/2016/10/01/trump-receives-police-union-endorsement-battleground-ohio/ Ã¢Â€Â¦  pjnet  tcot ')

'hillary court  blacklivesmatter  trump receives police union endorsement battleground ohio  '