In [1]:
%matplotlib inline

import pandas as pd
import numpy as np
import HTMLParser as htm
import string
import re

# SK-learn library for splitting data
from sklearn.model_selection import train_test_split

# SK-learn libraries for learning.
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.grid_search import GridSearchCV

# SK-learn libraries for evaluation.
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import classification_report

# SK-learn libraries for feature extraction from text.
from sklearn.feature_extraction.text import *



## Read in data

In [2]:
data = pd.read_csv("tweet_data_1.csv",sep='\t',quoting=3)
data["escape"] = data.apply(lambda row: htm.HTMLParser().unescape(row[1].decode("utf-8")),axis=1)
#data["escape"] = data.apply(lambda row: row[3]*5)
 
data.head()

Unnamed: 0,Id,Tweet,Emotion,Positive,escape
0,138881940341260288:,I got a surprise for all you bitches...pull th...,:: surprise,0,I got a surprise for all you bitches...pull th...
1,144479819843911683:,If I was a thief.. The first thing I would ste...,:: joy,1,If I was a thief.. The first thing I would ste...
2,139110849120972800:,"""&quot;@RevRunWisdom: not afraid of tomorrow, ...",:: fear,0,"""""@RevRunWisdom: not afraid of tomorrow, for I..."
3,141532076791971840:,"""Extreme can neither fight nor fly.&#xA;-- Wil...",:: fear,0,"""Extreme can neither fight nor fly.\n-- Willia..."
4,145353048817012736:,Thinks that @melbahughes had a great 50th birt...,:: surprise,0,Thinks that @melbahughes had a great 50th birt...


## Manually preprocess data

In [3]:
def process_data(data):
    """Converts to lowercase, strips out punctuation,
    removes excess whitespace within a string & leading & trailing whitespace"""
    new_list = []
    table = string.maketrans("","")
    for elem in data:
        elem = "".join(i for i in elem if ord(i)<128)
        elem = str(elem)        
        elem = elem.lower()
        elem = elem.translate(table, string.punctuation)
        
        # Comment these 2 lines out to improve positive
        elem = re.sub(' +',' ', elem)
        elem = elem.strip()
        
        new_list.append(elem)
    return new_list

#train_pol_x = process_data(train_pol_x)
#test_pol_x = process_data(test_pol_x)



## Split data into train & test sets

In [4]:
#process entire data set
data.escape = process_data(data.escape)

In [5]:
# Train and test data frames
train, test = train_test_split(data, test_size = 0.2)

# Train and test target labels
train_pol_y = train.ix[:,3].tolist()
test_pol_y = test.ix[:,3].tolist()

train_pol_x2 = train.ix[:,3:5]


#binarize emo labels
from sklearn import preprocessing
train_emo = train.ix[:,2].tolist()
test_emo = test.ix[:,2].tolist()

emo_bin = preprocessing.LabelBinarizer()

train_emo_y = emo_bin.fit_transform(train_emo)
tests_emo_y = emo_bin.transform(test_emo)
# To get emotionas use emo_bin.inverse_transform(tests_emo_y)

# MC NEW CODE
# Train and test x
train_pol_x = train.ix[:, 4].tolist()
test_pol_x = test.ix[:, 4].tolist()


## Exploratory on entire dataset

In [6]:
print "Dataset size: %s \n" % len(data)

print "Polarity counts"
print data.Positive.value_counts()
print "\n"
#avg number of words

print "Avg Words %s \n" % np.mean([len(s.split(" ")) for s in data.escape])
print "Emotion Counts"
print data.Emotion.value_counts()

Dataset size: 21051 

Polarity counts
0    12811
1     8240
Name: Positive, dtype: int64


Avg Words 15.0530141086 

Emotion Counts
:: joy         8240
:: surprise    3849
:: sadness     3830
:: fear        2816
:: anger       1555
:: disgust      761
Name: Emotion, dtype: int64


In [7]:
#top words
from collections import Counter
holder = Counter()

#process entire dataset
data_ = data.ix[:, 4].tolist()
process_data(data_)

for i in data_:
    for word in i.split(" "):
        holder[word] += 1

In [8]:
from nltk.corpus import stopwords
for i in holder.most_common(10):
    #if i[0].lower() not in stopwords.words("english"):
    print "%s \t %s" % (i[0],i[1])

the 	 9098
i 	 8867
to 	 8101
a 	 6230
my 	 5487
and 	 5395
of 	 4776
in 	 4169
is 	 3780
for 	 3539


## Model with preprocessed data

### With one Classifier

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import svm
from sklearn.metrics import classification_report

#with one model only
vectorizer = TfidfVectorizer(min_df=2,
                             use_idf=True,
                            stop_words='english',
                            )
train_vectors = vectorizer.fit_transform(train_pol_x)
test_vectors = vectorizer.transform(test_pol_x)

base1 = svm.SVC(kernel='linear')
base1.fit(train_vectors, train_emo)
predict_base1 = base1.predict(test_vectors)

print classification_report(test_emo,predict_base1)

             precision    recall  f1-score   support

   :: anger       0.55      0.29      0.38       320
 :: disgust       0.44      0.08      0.13       151
    :: fear       0.73      0.50      0.60       582
     :: joy       0.59      0.84      0.69      1639
 :: sadness       0.43      0.39      0.41       742
:: surprise       0.57      0.46      0.51       777

avg / total       0.57      0.58      0.55      4211



### With 2 Classifiers

In [12]:
#Classifier 1

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import svm
from sklearn.metrics import classification_report

#min_df_options = [1, 2, 5, 10, 25]

#for df in min_df_options:
#print "Minimum DF", df
vectorizer = TfidfVectorizer(min_df=2,
                             use_idf=True,
                            stop_words='english',
                            )
train_vectors = vectorizer.fit_transform(train_pol_x)
test_vectors = vectorizer.transform(test_pol_x)

base1 = svm.SVC(kernel='linear')
base1.fit(train_vectors, train_pol_y)
predict_base1 = base1.predict(test_vectors)

target_names = ["Negative","Positive"]
print classification_report(test_pol_y,predict_base1, target_names = target_names)

             precision    recall  f1-score   support

   Negative       0.78      0.85      0.81      2572
   Positive       0.72      0.63      0.67      1639

avg / total       0.76      0.76      0.76      4211



In [13]:
test_df = pd.DataFrame({'Positive': predict_base1.tolist() , 'escape': test_pol_x})

In [18]:
train_pol_x2 = train.ix[:,3:5]
from sklearn_pandas import DataFrameMapper
mapper = DataFrameMapper([
        (["Positive"],None),
    ("escape",TfidfVectorizer(min_df=2,use_idf=True,stop_words='english'))])

train_m2 = mapper.fit_transform(train_pol_x2,train_emo)
test_m2 = mapper.transform(test_df)


In [19]:
from sklearn.linear_model import LogisticRegression
log = LogisticRegression()

log.fit(train_m2,train_emo)
preds = log.predict(test_m2)

print classification_report(test_emo,preds)

             precision    recall  f1-score   support

   :: anger       0.57      0.25      0.35       320
 :: disgust       0.58      0.05      0.09       151
    :: fear       0.66      0.53      0.59       582
     :: joy       0.72      0.63      0.67      1639
 :: sadness       0.36      0.57      0.44       742
:: surprise       0.46      0.58      0.51       777

avg / total       0.58      0.55      0.55      4211



In [None]:
#base2 = svm.SVC(kernel='linear')
#base2.fit(train_m2,train_emo)
#predict_base2 = base2.predict(test_m2)

In [None]:
#print classification_report(test_emo,predict_base2)

## Model without preprocessed data

### Try different hyperparams

In [9]:
#https://marcobonzanini.com/2015/01/19/sentiment-analysis-with-python-and-scikit-learn/
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import svm
from sklearn.metrics import classification_report

min_df_options = [1, 2, 5]
strip_accents_options = ['ascii', 'unicode']
lowercase_options = [True, False]

for elem in strip_accents_options:
    print "ACCENT:", elem
    for op in lowercase_options:
        print "LOWERCASE OPTION:", str(op)
        for df in min_df_options:
            print "Minimum DF:", df
            vectorizer = TfidfVectorizer(min_df=df,
                                         use_idf=True,
                                        stop_words='english',
                                         strip_accents=elem,
                                         lowercase=op
                                        )
            train_vectors = vectorizer.fit_transform(train.ix[:,4].tolist())
            test_vectors = vectorizer.transform(test.ix[:,4].tolist())

            base1 = svm.SVC(kernel='linear')
            base1.fit(train_vectors, train_pol_y)
            predict_base1 = base1.predict(test_vectors)

            target_names = ["Negative","Positive"]
            print classification_report(test_pol_y,predict_base1, target_names = target_names)


ACCENT: ascii
LOWERCASE OPTION: True
Minimum DF: 1
             precision    recall  f1-score   support

   Negative       0.78      0.86      0.82      2538
   Positive       0.75      0.64      0.69      1673

avg / total       0.77      0.77      0.77      4211

Minimum DF: 2
             precision    recall  f1-score   support

   Negative       0.78      0.86      0.82      2538
   Positive       0.75      0.63      0.69      1673

avg / total       0.77      0.77      0.76      4211

Minimum DF: 5
             precision    recall  f1-score   support

   Negative       0.77      0.86      0.81      2538
   Positive       0.74      0.62      0.68      1673

avg / total       0.76      0.76      0.76      4211

LOWERCASE OPTION: False
Minimum DF: 1
             precision    recall  f1-score   support

   Negative       0.79      0.85      0.82      2538
   Positive       0.75      0.66      0.70      1673

avg / total       0.77      0.77      0.77      4211

Minimum DF: 2
         

KeyboardInterrupt: 

In [None]:
vectorizer = TfidfVectorizer(min_df=2,
                             use_idf=True,
                             stop_words='english')
train_vectors = vectorizer.fit_transform(train.ix[:,4].tolist())
test_vectors = vectorizer.transform(test.ix[:,4].tolist())

base1 = svm.SVC(kernel='linear')
base1.fit(train_vectors, train_pol_y)
predict_base1 = base1.predict(test_vectors)

target_names = ["Negative","Positive"]
print classification_report(test_pol_y,predict_base1, target_names = target_names)

### Try LinearSVC

In [None]:
vectorizer = TfidfVectorizer(min_df=2,
                             use_idf=True,
                             stop_words='english')
train_vectors = vectorizer.fit_transform(train.ix[:,4].tolist())
test_vectors = vectorizer.transform(test.ix[:,4].tolist())

base1 = svm.LinearSVC()

C_options = {'C': np.arange(0.1, 1, 0.1)}
grid = GridSearchCV(base1, C_options)

grid.fit(train_vectors, train_pol_y)
preds = grid.predict(test_vectors)
    
# Output best param
print "Best value for C: %.2f" %grid.best_params_['C']
print "F1 score for Logistic Regression: %.3f" %metrics.f1_score(test_pol_y, preds, average="weighted") + "\n"


target_names = ["Negative","Positive"]
print classification_report(test_pol_y, preds, target_names = target_names)

## Try other models

In [None]:
def P3(train_data, train_labels, dev_data, dev_labels):
### STUDENT START ###

    # Data setup
    vec = TfidfVectorizer(min_df=2,
                             use_idf=True,
                             stop_words='english')
    train_mat = vec.fit_transform(train_data)    
    dev_fit = vec.transform(dev_data)
    
    # K-NN MODEL
#     knn = KNeighborsClassifier()
    
    # Use GridSearchCV to find optimal k value
#     k_options = {'n_neighbors':[i for i in range(1, 3)]}
#     knn_grid = GridSearchCV(knn, k_options)

#     knn_grid.fit(train_mat, train_labels)
#     preds = knn_grid.predict(dev_fit)
    
    # Output best param
#     print "Best value for k: %d" %knn_grid.best_params_['n_neighbors']
#     print "F1 score for K-NN: %.3f" %metrics.f1_score(dev_labels, preds, average="weighted") + "\n"
    
    # MULTINOMIAL NAIVE BAYES
    # Repeat process for multinomial Naive Bayes
    mul = MultinomialNB(alpha=0.5)
    
    alpha_options = {'alpha': np.arange(0.01, 1, 0.01)}
    mul_grid = GridSearchCV(mul, alpha_options)
    
    mul_grid.fit(train_mat, train_labels)
    mul_preds = mul_grid.predict(dev_fit)
    
    # Output best param
    print "Best value for alpha: %.2f" %mul_grid.best_params_['alpha']
    print "F1 score for Multinomial Naive Bayes: %.3f" %metrics.f1_score(dev_labels, mul_preds, average="weighted") + "\n"

    
    # LOGISTIC REGRESSION
    # Repeat process for multinomial Naive Bayes
    log = LogisticRegression()
#     log = LogisticRegression(class_weight='balanced')
    
    C_options = {'C': np.arange(0.1, 1, 0.1)}
    log_grid = GridSearchCV(log, C_options)

    log_grid.fit(train_mat, train_labels)
    log_preds = log_grid.predict(dev_fit)
    
    # Output best param
    print "Best value for C: %.2f" %log_grid.best_params_['C']
    print "F1 score for Logistic Regression: %.3f" %metrics.f1_score(dev_labels, log_preds, average="weighted") + "\n"
    
    # Output sum of squared weights for a series of values of C
#     C_vals = np.arange(0.1, 1, 0.1)
#     for val in C_vals:
#         log_mod = LogisticRegression(C=val)
#         log_mod.fit(train_mat, train_labels)
#         print "Sum of squared weights for C=%.2f:" %val, np.square(log_mod.coef_).sum(axis=1)
    

### STUDENT END ###

P3(train_pol_x, train_pol_y, test_pol_x, test_pol_y)