In [1]:
import pandas as pd 
import numpy as np
import string, re
import nltk
from sklearn.cross_validation import train_test_split
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.cluster import KMeans
from time import time

%matplotlib inline

In [2]:
data = pd.read_csv('tweets.txt', sep = ';~;')
emoji_list = pd.read_csv('emoji_table.txt', encoding='utf-8', index_col=0).index.values
data.head(5)

  if __name__ == '__main__':


Unnamed: 0,username,date,retweets,favorites,text,geo,mentions,hashtags,id,permalink
0,mitchellvii,2016-10-01 15:51,878,1216,"""Hillary attacked Trump for (allegedly) callin...",,,,"""782352194473459713""",https://twitter.com/mitchellvii/status/7823521...
1,noahsnab,2016-10-01 06:33,10938,20233,"""me: i hate michigan :(( detroit: polls zero p...",,,,"""782211911723151360""",https://twitter.com/noahsnab/status/7822119117...
2,dt_ads,2016-10-01 12:20,29,31,"""If you feel US workers should have jobs befor...",,,#AmericaFirst #dtmag,"""782299159571431424""",https://twitter.com/dt_ads/status/782299159571...
3,MikePenceVP,2016-10-01 13:14,471,415,"""Hillary called Trump Supporters ""deplorable"" ...",,,#BasementDwellers,"""782312789968814080""",https://twitter.com/MikePenceVP/status/7823127...
4,JuddLegum,2016-10-01 08:18,641,1603,"""16. Trump claimed Google was involved in a co...",,,,"""782238361046036480""",https://twitter.com/JuddLegum/status/782238361...


In [3]:
# List of positive and negative tweets
sad = [':‑(', ':(', ':‑c', ':c', ':‑<', ':<', ':‑[' ,':[', ':-||', '>:[', ':{', ':@', '>:(']
Positive = [':‑)',':)', ':-]', ':]',':-3', ':3', ':->', ':>' ,'8-)', '8)',':-}', ':}', ':o)', ':c)', ':^)' ,'=]', '=)'
           ,':‑D', ':D', '8‑D', '8D', 'x‑D', 'xD', 'X‑D', 'XD', '=D', '=3', 'B^D']
PHillaryHastags = ['#imwithher','#imwithhur', '#strongertogether']
NHillaryHastags = ['#crookedHillary']
PTrump = ['#TrumpPence', '#Trumptrain']

In [4]:
stop_list = nltk.corpus.stopwords.words('english')
lemmatizer = nltk.stem.WordNetLemmatizer()
# rt - stands for retweet
stop_list = stop_list +["rt", 'url']

# regex for capturing tweets
reg = '(\:\w+\:|\<[\/\\]?3|[\(\)\\\D|\*\$][\-\^]?[\:\;\=]|[\:\;\=B8][\-\^]?[3DOPp\@\$\*\\\)\(\/\|])(?=\s|[\!\.\?]|$)'
emoticons = "|".join(map(re.escape, sad + Positive))

emoji_pattern = re.compile(u'('
    u'\ud83c[\udf00-\udfff]|'
    u'\ud83d[\udc00-\ude4f\ude80-\udeff]|'
    u'[\u2600-\u26FF\u2700-\u27BF])+', 
    re.UNICODE)
classifier =[]
def preprocess(tweet):
    # only processing if the the value is a string
    if type(tweet)!=type(2.0):
        tweet = tweet.decode('latin-1').encode("utf-8").decode('utf-8').strip()
        tweet = tweet.lower()
        # Removing hashtags
        tweet = " ".join(tweet.split('#'))
        # Removing URLs
        tweet = re.sub('((www\.[^\s]+)|(https://[^\s]+))','',tweet)
        tweet = re.sub("(http\S+)|(https\S+)", '', tweet)
        # Removing User mentions
        tweet = re.sub('@[^\s]+','',tweet)
        tweet = tweet.strip('\'"')
        # Removing stop words - This can be moved to count vectorization
        tweet  = " ".join([word for word in tweet.split(" ") if word not in stop_list])
        # lemmatizing words 
        tweet = " ".join([lemmatizer.lemmatize(word) for word in tweet.split(" ")])
    else:
        tweet=''
    return tweet

def extractEmoticons(tweet):
    # emoji = emoji_pattern.findall(tweet)
    emoji = []
    for emo in emoji_list:
        if emo in tweet:
            emoji.append(emo)
    
    # these are :) :-) and other stuff
    emoticons = re.findall(reg, tweet)
    return " , ".join(emoji + emoticons)
def removeEmoticons(tweet):
    return re.sub(reg,'',tweet)

# data = data.dropna()
data['processed_text'] = data.text.apply(preprocess)

#getting the emoticons from the cleaned data
data['emoticons'] = data['processed_text'].apply(extractEmoticons)

# Removing emoticons from the text data
data['processed_text'] = data['processed_text'].apply(removeEmoticons)

In [8]:
vectorizer = TfidfVectorizer(stop_words=stop_list)
X = vectorizer.fit_transform(data.processed_text)

In [9]:
km = KMeans(n_clusters=3, init='k-means++', max_iter=100, n_init=1,
                verbose=True)

print("Clustering sparse data with %s" % km)
t0 = time()
km.fit(X)
print("done in %0.3fs" % (time() - t0))
print()

order_centroids = km.cluster_centers_.argsort()[:, ::-1]
terms = vectorizer.get_feature_names()
for i in range(3):
    print("Cluster %d:" % i)
    for ind in order_centroids[i, -10:]:
        print(' %s' % terms[ind])
    print()

Clustering sparse data with KMeans(copy_x=True, init='k-means++', max_iter=100, n_clusters=3, n_init=1,
    n_jobs=1, precompute_distances='auto', random_state=None, tol=0.0001,
    verbose=True)
Initialization complete
Iteration  0, inertia 26488.945
Iteration  1, inertia 13590.235
Iteration  2, inertia 13573.981
Iteration  3, inertia 13558.029
Iteration  4, inertia 13547.756
Iteration  5, inertia 13539.660
Iteration  6, inertia 13538.935
Iteration  7, inertia 13538.329
Iteration  8, inertia 13538.170
Iteration  9, inertia 13538.093
Iteration 10, inertia 13538.071
Iteration 11, inertia 13538.060
Iteration 12, inertia 13538.049
Iteration 13, inertia 13538.031
Iteration 14, inertia 13538.024
Iteration 15, inertia 13538.015
Iteration 16, inertia 13537.991
Iteration 17, inertia 13537.987
Iteration 18, inertia 13537.980
Iteration 19, inertia 13537.969
Iteration 20, inertia 13537.963
Iteration 21, inertia 13537.953
Iteration 22, inertia 13537.946
Iteration 23, inertia 13537.920
Iteration 24

In [67]:
data[data['emoticons'] != '']

Unnamed: 0,id,candidate,candidate_confidence,relevant_yn,relevant_yn_confidence,sentiment,sentiment_confidence,subject_matter,subject_matter_confidence,candidate_gold,...,sentiment_gold,subject_matter_gold,text,tweet_coord,tweet_created,tweet_id,tweet_location,user_timezone,processed_text,emoticons
61,62,No candidate mentioned,1.0000,yes,1.0000,Negative,1.0000,FOX News or Moderators,1.0000,,...,,,RT @AmyMek: Status 👉 Single \n\nI broke up wi...,,2015-08-07 09:54:23 -0700,629697105838383104,"Florida,, usually",Eastern Time (US & Canada),status 👉 single \n\ni broke last night! g...,👉
125,126,Donald Trump,1.0000,yes,1.0000,Negative,0.6549,FOX News or Moderators,1.0000,,...,,,RT @AmyMek: We all owe @realDonaldTrump a huge...,,2015-08-07 09:53:53 -0700,629696979858292736,,Pacific Time (US & Canada),owe huge thank 4 exposing world👉 chris wall...,👉
140,141,Donald Trump,0.3921,yes,0.6262,Negative,0.6262,None of the above,0.3921,,...,,,"RT @VH1: Honestly, Donald Trump you were ALL t...",,2015-08-07 09:53:47 -0700,629696952985350144,,Cape Verde Is.,"honestly, donald trump type petty last night'...",😒
143,144,No candidate mentioned,0.4510,yes,0.6716,Positive,0.3372,FOX News or Moderators,0.2264,,...,,,RT @peddoc63: Go Carly📢\nGo Carly📢\nGo Carly...,,2015-08-07 09:53:46 -0700,629696947918643200,,,go carly📢\ngo carly📢\ngo carly📢\n carly20...,📢
148,149,Donald Trump,1.0000,yes,1.0000,Negative,0.3385,None of the above,1.0000,,...,,,RT @DrMartyFox: #Trump Wins Drudge \nDebate Po...,,2015-08-07 09:53:44 -0700,629696941769625600,SSCA,Arizona,trump win drudge \ndebate poll \n\n➡️ landsl...,"➡️ , ☑️ , 🇺🇸"
160,161,Donald Trump,0.4347,yes,0.6593,Neutral,0.3626,None of the above,0.4347,,...,,,RT @STEEL5757: 🎀 #DonaldTrump Plaza Casino #T...,,2015-08-07 09:53:39 -0700,629696921444028417,,,🎀 donaldtrump plaza casino teddybear plush...,🎀
184,185,No candidate mentioned,1.0000,yes,1.0000,Negative,1.0000,FOX News or Moderators,0.6629,,...,,,RT @renomarky: ☑ never watch #KellyFile again\...,,2015-08-07 09:53:26 -0700,629696865982902272,,Central Time (US & Canada),☑ never watch kellyfile again\n\n1. gopdeba...,🇺🇸
201,202,No candidate mentioned,1.0000,yes,1.0000,Negative,1.0000,FOX News or Moderators,1.0000,,...,,,"RT @realDonaldTrump: ""@FrankLuntz: I'm getting...",,2015-08-07 09:53:18 -0700,629696830893244416,Earth: Senseless nonsense,Pacific Time (US & Canada),""" i'm getting lot hatemail tonight. 😆 gopd...",😆
365,366,No candidate mentioned,0.3940,yes,0.6277,Neutral,0.6277,None of the above,0.3940,,...,,,Speed Dating For A New President? Interesting ...,,2015-08-07 09:52:03 -0700,629696514932084736,"Las Vegas, NV",Pacific Time (US & Canada),speed dating new president? interesting articl...,:-)
496,497,No candidate mentioned,0.4074,yes,0.6383,Positive,0.3404,None of the above,0.4074,,...,,,Kind of glad I didn't get to watch the #GOPDeb...,,2015-08-07 09:50:52 -0700,629696217660784640,seattle,Pacific Time (US & Canada),kind glad didn't get watch gopdebate yesterda...,😊


In [5]:
emoticons_single = data['emoticons'].copy().str.split(' , ').apply(pd.Series, 1).stack()
emoticons_single.index = emoticons_single.index.droplevel(-1)

emoticons_single.name = 'seggregated_emoticons'
data = data.join(emoticons_single.str.strip())

In [10]:
pd.Series(data[data['seggregated_emoticons'] != ''].seggregated_emoticons).shape

(203,)

In [8]:
data.shape

(45001, 13)

In [9]:
hashtags = data['hashtags'].copy().str.split(' ').apply(pd.Series, 1).stack()
hashtags.index = hashtags.index.droplevel(-1)
pd.to_csv(pd.DataFrame(pd.Series(data.hashtags.unique())), 'hashtags.csv')

Unnamed: 0,0
0,
1,#AmericaFirst #dtmag
2,#BasementDwellers
3,#ncpol
4,#ImWithHer
5,#TwitterAttackTrump
6,#FreeAdvertisements
7,#TrumpPence2016 #MAGA
8,#via
9,#Trump
