In [2]:
import pandas as pd 
import numpy as np
import string, re
import nltk
from sklearn.cross_validation import train_test_split
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

from sklearn.pipeline import make_pipeline
from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestClassifier

from time import time

%matplotlib inline



In [3]:
data = pd.read_csv('Data/tweets.txt', sep = ';~;', engine='python')
emoji_list = pd.read_csv('Data/emoji_table.txt', encoding='utf-8', index_col=0).index.values
SentimentEmoji = pd.read_csv('Data/Emoji_classification.csv', encoding='utf-8').dropna()
SentimentHashtags = pd.read_csv('Data/hashtags.csv', encoding='utf-8').dropna()
hillaryTest = pd.read_csv('Hillary.csv')
data.head(5)

Unnamed: 0,username,date,retweets,favorites,text,geo,mentions,hashtags,id,permalink
0,mitchellvii,2016-10-01 15:51,878,1216,"""Hillary attacked Trump for (allegedly) callin...",,,,"""782352194473459713""",https://twitter.com/mitchellvii/status/7823521...
1,noahsnab,2016-10-01 06:33,10938,20233,"""me: i hate michigan :(( detroit: polls zero p...",,,,"""782211911723151360""",https://twitter.com/noahsnab/status/7822119117...
2,dt_ads,2016-10-01 12:20,29,31,"""If you feel US workers should have jobs befor...",,,#AmericaFirst #dtmag,"""782299159571431424""",https://twitter.com/dt_ads/status/782299159571...
3,MikePenceVP,2016-10-01 13:14,471,415,"""Hillary called Trump Supporters ""deplorable"" ...",,,#BasementDwellers,"""782312789968814080""",https://twitter.com/MikePenceVP/status/7823127...
4,JuddLegum,2016-10-01 08:18,641,1603,"""16. Trump claimed Google was involved in a co...",,,,"""782238361046036480""",https://twitter.com/JuddLegum/status/782238361...


In [4]:
# List of positive and negative tweets
sad = [':‑(', ':(', ':‑c', ':c', ':‑<', ':<', ':‑[' ,':[', ':-||', '>:[', ':{', ':@', '>:(']
Positive = [':‑)',':)', ':-]', ':]',':-3', ':3', ':->', ':>' ,'8-)', '8)',':-}', ':}', ':o)', ':c)', ':^)' ,'=]', '=)'
           ,':‑D', ':D', '8‑D', '8D', 'x‑D', 'xD', 'X‑D', 'XD', '=D', '=3', 'B^D']
SentimentHashtags['HashtagSentiment'] = SentimentHashtags['HashtagSentiment'].map({'Positive':1, 'Negative':-1})
SentimentEmoji['Sentiment'] = SentimentEmoji['Sentiment'].map({'Positive':1, 'Negative':-1, 'Neutral':0}).dropna()
SentimentHashtags['Directed'] = SentimentHashtags['Directed'].map({'T':1, 'H':0})
hillaryTest.Sentiment = hillaryTest.Sentiment.map({'Positive':1, 'Negative':-1, 'Neutral':0})

In [6]:
# removing for word2vec
stop_list = nltk.corpus.stopwords.words('english')
stop_list = stop_list + ["rt"] # Letting this remain so that rt are removed from the tweets

lemmatizer = nltk.stem.WordNetLemmatizer()
# rt - stands for retweet


# regex for capturing tweets
reg = '(\:\w+\:|\<[\/\\]?3|[\(\)\\\D|\*\$][\-\^]?[\:\;\=]|[\:\;\=B8][\-\^]?[3DOPp\@\$\*\\\)\(\/\|])(?=\s|[\!\.\?]|$)'
emoticons = "|".join(map(re.escape, sad + Positive))

emoji_pattern = re.compile(u'('
    u'\ud83c[\udf00-\udfff]|'
    u'\ud83d[\udc00-\ude4f\ude80-\udeff]|'
    u'[\u2600-\u26FF\u2700-\u27BF])+', 
    re.UNICODE)
classifier =[]
# URL_Pat = re.compile(ur'(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:\'".,<>?\xab\xbb\u201c\u201d\u2018\u2019]))')
def preprocess(tweet):
    # only processing if the the value is a string
    if type(tweet)!=type(2.0):
        tweet = tweet.decode('latin-1').encode("utf-8").decode('utf-8').strip()
        tweet = tweet.lower()
        # Removing hashtags
        tweet = " ".join(tweet.split('#'))
        # Removing URLs
        tweet = re.sub('((www\.[^\s]+)|(https://[^\s]+))','',tweet)
        tweet = re.sub('((pic\.[^\s]+)|(https://[^\s]+))','',tweet)
        tweet = re.sub(u'[a-zA-Z0-9./]+\.[a-zA-Z0-9./ ]+.*$','',tweet)
        tweet = re.sub("(http\S+)|(https\S+)", '', tweet)
        # Removing User mentions
        tweet = re.sub('@[^\s]+','',tweet)
        tweet = tweet.strip('\'"')
        # Removing stop words - This can be moved to count vectorization
        # tweet  = " ".join([word for word in tweet.split(" ") if word not in stop_list])
        # lemmatizing words 
        tweet = " ".join([lemmatizer.lemmatize(word) for word in tweet.split(" ")])
    else:
        tweet=''
    return tweet

def extractEmoticons(tweet):
    # emoji = emoji_pattern.findall(tweet)
    emoji = []
    for emo in emoji_list:
        if emo in tweet:
            emoji.append(emo)
    
    # these are :) :-) and other stuff
    emoticons = re.findall(reg, tweet)
    return " , ".join(emoji + emoticons)
def removeEmoticons(tweet):
    return re.sub(reg,'',tweet)

# data = data.dropna()
data['processed_text'] = data.text.apply(preprocess)
hillaryTest['processed_text'] = hillaryTest.processed_text.apply(preprocess)

#getting the emoticons from the cleaned data
data['emoticons'] = data['processed_text'].apply(extractEmoticons)

# Removing emoticons from the text data
data['processed_text'] = data['processed_text'].apply(removeEmoticons)
print 'Completed'

Completed


In [8]:
HillaryTweets = data[data['processed_text'].str.contains('((hil.?ary)|(clinton))', case = False)]
DonaldTweets = data[data['processed_text'].str.contains('trump', case = False)]

datasets = [HillaryTweets.copy(), DonaldTweets.copy()]
TrainSets = []
for i, dataset in enumerate(datasets):
    hashtags = datasets[i]['hashtags'].copy().str.split(' ').apply(pd.Series, 1).stack()
    hashtags.index = hashtags.index.droplevel(-1)
    datasets[i].drop('hashtags', axis=1, inplace=True)
    hashtags.name = 'hashtags'
    
    datasets[i] = datasets[i].join(hashtags.str.strip())
    
    emoticons = datasets[i]['emoticons'].copy().str.split(' ').apply(pd.Series, 1).stack()
    emoticons.index = emoticons.index.droplevel(-1)
    datasets[i].drop('emoticons', axis=1, inplace=True)
    emoticons.name = 'emoticons'
    datasets[i] = datasets[i].join(emoticons.str.strip())
    
    Directed_hashtags = SentimentHashtags[SentimentHashtags['Directed'] == 0]
    Opp_hashtags = SentimentHashtags[SentimentHashtags['Directed'] != 0]
    Opp_hashtags.loc[: ,'HashtagSentiment'] = Opp_hashtags.HashtagSentiment * -1;
    
    Directed_hashtags = Directed_hashtags.append(Opp_hashtags)
    datasets[i] = pd.merge(datasets[i], Directed_hashtags, on = 'hashtags', how='outer')
    datasets[i] = pd.merge(datasets[i], SentimentEmoji, on = 'emoticons', how='outer')
    datasets[i]['Sentiment'] = datasets[i]['HashtagSentiment'].add(datasets[i]['Sentiment'], fill_value = 0)
    TrainSets.append(datasets[i][['username', 'date', 'processed_text', 'Sentiment']].dropna().groupby(['processed_text', 'Sentiment']).max().reset_index())

data_train = datasets[0][['processed_text','Sentiment']].copy().dropna()



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [12]:
vectorizer = CountVectorizer(stop_words=stop_list)
#vectorizer = HashingVectorizer(stop_words=stop_list)
#vectorizer = TfidfVectorizer(stop_words=stop_list)

X = vectorizer.fit_transform(data_train.processed_text.append(hillaryTest.processed_text))
X_train = X[0:data_train.processed_text.shape[0]]
Y_train = data_train['Sentiment']
#X_test = vectorizer.fit(hillaryTest.processed_text)

#model = RandomForestClassifier()
#model.fit(X_train, Y_train)

#preds = model.predict(X_test)

In [97]:
score = 0
tot = 0
for i, pred in enumerate(preds):
    if(hillaryTest.Sentiment[i] == hillaryTest.Sentiment[i]):
        tot+=1
        if(hillaryTest.Sentiment[i] == pred):
            score+=1
print score, tot

41 72


In [9]:
km = KMeans(n_clusters=3, init='k-means++', max_iter=100, n_init=1,
                verbose=True)

print("Clustering sparse data with %s" % km)
t0 = time()
km.fit(X)
print("done in %0.3fs" % (time() - t0))
print()

order_centroids = km.cluster_centers_.argsort()[:, ::-1]
terms = vectorizer.get_feature_names()
for i in range(3):
    print("Cluster %d:" % i)
    for ind in order_centroids[i, -10:]:
        print(' %s' % terms[ind])
    print()

Clustering sparse data with KMeans(copy_x=True, init='k-means++', max_iter=100, n_clusters=3, n_init=1,
    n_jobs=1, precompute_distances='auto', random_state=None, tol=0.0001,
    verbose=True)
Initialization complete
Iteration  0, inertia 26488.945
Iteration  1, inertia 13590.235
Iteration  2, inertia 13573.981
Iteration  3, inertia 13558.029
Iteration  4, inertia 13547.756
Iteration  5, inertia 13539.660
Iteration  6, inertia 13538.935
Iteration  7, inertia 13538.329
Iteration  8, inertia 13538.170
Iteration  9, inertia 13538.093
Iteration 10, inertia 13538.071
Iteration 11, inertia 13538.060
Iteration 12, inertia 13538.049
Iteration 13, inertia 13538.031
Iteration 14, inertia 13538.024
Iteration 15, inertia 13538.015
Iteration 16, inertia 13537.991
Iteration 17, inertia 13537.987
Iteration 18, inertia 13537.980
Iteration 19, inertia 13537.969
Iteration 20, inertia 13537.963
Iteration 21, inertia 13537.953
Iteration 22, inertia 13537.946
Iteration 23, inertia 13537.920
Iteration 24

In [69]:
import logging
from gensim.models import word2vec

def get_words(tweet):
    return tweet.split(' ')
tweets = pd.Series(data['processed_text'].unique()).apply(get_words)

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',\
    level=logging.INFO)

# Set values for various parameters
num_features = 140    # Word vector dimensionality                      
min_word_count = 10   # Minimum word count                        
num_workers = 4       # Number of threads to run in parallel
context = 10          # Context window size                                                                                    
downsampling = 1e-3   # Downsample setting for frequent words

# Initialize and train the model (this will take some time)

print "Training model..."
model = word2vec.Word2Vec(tweets, workers=num_workers, size=num_features, min_count = min_word_count, window = context,
                          sample = downsampling)

# If you don't plan to train the model any further, calling 
# init_sims will make the model much more memory-efficient.
model.init_sims(replace=True)

# It can be helpful to create a meaningful model name and 
# save the model for later use. You can load it later using Word2Vec.load()
model_name = "30features_40minwords_10context"
model.save(model_name)

Training model...
