In [31]:
import pandas as pd 
import numpy as np
import string, re
import nltk
from sklearn.cross_validation import train_test_split
import operator
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer
from sklearn import metrics

from sklearn.cluster import KMeans, MiniBatchKMeans

import logging
from optparse import OptionParser
import sys
from time import time


%matplotlib inline

In [26]:
data = pd.read_csv('data/first-gop-debate-twitter-sentiment/sentiment.csv')
data.head(5)

Unnamed: 0,id,candidate,candidate_confidence,relevant_yn,relevant_yn_confidence,sentiment,sentiment_confidence,subject_matter,subject_matter_confidence,candidate_gold,...,relevant_yn_gold,retweet_count,sentiment_gold,subject_matter_gold,text,tweet_coord,tweet_created,tweet_id,tweet_location,user_timezone
0,1,No candidate mentioned,1.0,yes,1.0,Neutral,0.6578,None of the above,1.0,,...,,5,,,RT @NancyLeeGrahn: How did everyone feel about...,,2015-08-07 09:54:46 -0700,629697200650592256,,Quito
1,2,Scott Walker,1.0,yes,1.0,Positive,0.6333,None of the above,1.0,,...,,26,,,RT @ScottWalker: Didn't catch the full #GOPdeb...,,2015-08-07 09:54:46 -0700,629697199560069120,,
2,3,No candidate mentioned,1.0,yes,1.0,Neutral,0.6629,None of the above,0.6629,,...,,27,,,RT @TJMShow: No mention of Tamir Rice and the ...,,2015-08-07 09:54:46 -0700,629697199312482304,,
3,4,No candidate mentioned,1.0,yes,1.0,Positive,1.0,None of the above,0.7039,,...,,138,,,RT @RobGeorge: That Carly Fiorina is trending ...,,2015-08-07 09:54:45 -0700,629697197118861312,Texas,Central Time (US & Canada)
4,5,Donald Trump,1.0,yes,1.0,Positive,0.7045,None of the above,1.0,,...,,156,,,RT @DanScavino: #GOPDebate w/ @realDonaldTrump...,,2015-08-07 09:54:45 -0700,629697196967903232,,Arizona


In [5]:
# List of positive and negative tweets
sad = [':‑(', ':(', ':‑c', ':c', ':‑<', ':<', ':‑[' ,':[', ':-||', '>:[', ':{', ':@', '>:(']
Positive = [':‑)',':)', ':-]', ':]',':-3', ':3', ':->', ':>' ,'8-)', '8)',':-}', ':}', ':o)', ':c)', ':^)' ,'=]', '=)'
           ,':‑D', ':D', '8‑D', '8D', 'x‑D', 'xD', 'X‑D', 'XD', '=D', '=3', 'B^D']

In [28]:
stop_list = nltk.corpus.stopwords.words('english')
lemmatizer = nltk.stem.WordNetLemmatizer()
# rt - stands for retweet
stop_list = stop_list +["rt", 'url']

# regex for capturing tweets
reg = '(\:\w+\:|\<[\/\\]?3|[\(\)\\\D|\*\$][\-\^]?[\:\;\=]|[\:\;\=B8][\-\^]?[3DOPp\@\$\*\\\)\(\/\|])(?=\s|[\!\.\?]|$)'
classifier =[]
def preprocess(tweet):
    # only processing if the the value is a string
    if type(tweet)!=type(2.0):
        tweet = tweet.decode('utf-8').strip()
        tweet = tweet.lower()
        # Removing hashtags
        tweet = " ".join(tweet.split('#'))
        # Removing URLs
        tweet = re.sub('((www\.[^\s]+)|(https://[^\s]+))','',tweet)
        tweet = re.sub("(http\S+)|(https\S+)", '', tweet)
        # Removing User mentions
        tweet = re.sub('@[^\s]+','',tweet)
        tweet = tweet.strip('\'"')
        # Removing stop words - This can be moved to count vectorization
        tweet  = " ".join([word for word in tweet.split(" ") if word not in stop_list])
        # lemmatizing words 
        tweet = " ".join([lemmatizer.lemmatize(word) for word in tweet.split(" ")])
    else:
        tweet=''
    return tweet

def extractEmoticons(tweet):
    return " , ".join(re.findall(reg, tweet))
def removeEmoticons(tweet):
    return re.sub(reg,'',tweet)

# data = data.dropna()
data['processed_text'] = data.text.apply(preprocess)

#getting the emoticons from the cleaned data
data['emoticons'] = data['processed_text'].apply(extractEmoticons)

# Removing emoticons from the text data
data['processed_text'] = data['processed_text'].apply(removeEmoticons)

In [44]:
vectorizer = TfidfVectorizer(stop_words=stop_list)
X = vectorizer.fit_transform(data.processed_text)

'''
print("Performing dimensionality reduction using LSA")
# Vectorizer results are normalized, which makes KMeans behave as
# spherical k-means for better results. Since LSA/SVD results are
# not normalized, we have to redo the normalization.
svd = TruncatedSVD(100000) # 1000 is randomly chosen
normalizer = Normalizer(copy=False)
lsa = make_pipeline(svd, normalizer)

X = lsa.fit_transform(X)

explained_variance = svd.explained_variance_ratio_.sum()
print("Explained variance of the SVD step: {}%".format(int(explained_variance * 100)))

'''

In [45]:
km = KMeans(n_clusters=3, init='k-means++', max_iter=100, n_init=1,
                verbose=True)

print("Clustering sparse data with %s" % km)
t0 = time()
km.fit(X)
print("done in %0.3fs" % (time() - t0))
print()

order_centroids = km.cluster_centers_.argsort()[:, ::-1]
terms = vectorizer.get_feature_names()
for i in range(3):
    print("Cluster %d:" % i)
    for ind in order_centroids[i, -10:]:
        print(' %s' % terms[ind])
    print()

Clustering sparse data with KMeans(copy_x=True, init='k-means++', max_iter=100, n_clusters=3, n_init=1,
    n_jobs=1, precompute_distances='auto', random_state=None, tol=0.0001,
    verbose=True)
Initialization complete
Iteration  0, inertia 26033.413
Iteration  1, inertia 13526.865
Iteration  2, inertia 13502.945
Iteration  3, inertia 13493.528
Iteration  4, inertia 13482.317
Iteration  5, inertia 13467.732
Iteration  6, inertia 13457.019
Iteration  7, inertia 13454.117
Iteration  8, inertia 13451.514
Iteration  9, inertia 13448.419
Iteration 10, inertia 13444.979
Iteration 11, inertia 13438.806
Iteration 12, inertia 13438.765
Iteration 13, inertia 13434.086
Iteration 14, inertia 13433.955
Iteration 15, inertia 13433.828
Iteration 16, inertia 13433.816
Converged at iteration 16
done in 0.692s
()
Cluster 0:
 rutgers
 annoyed
 sadclown
 fright
 freeman
 catoinstitute
 salty
 foxtv
 anniv
 newlow
()
Cluster 1:
 parakeets
 paragraph
 paradise
 parade
 par
 paper
 pants
 panning
 panic
 00

In [48]:
len(order_centroids[0, :])

11199

In [49]:
len(order_centroids[1, :])

11199

In [50]:
len(order_centroids[2, :])

11199

In [52]:
len(terms)

11199