In [1]:
# Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
# Importing the dataset
# The TSV file has no header row, so the two column names are supplied here.
df = pd.read_csv('sentiment.tsv', delimiter='\t', header=None,
                 names=["Sentiment", "SentimentText"])
# Turn the label into a boolean: True where the raw value is 'pos', False otherwise.
yes_no_cols = ["Sentiment"]
df[yes_no_cols] = df[yes_no_cols] == 'pos'
# Keep only rows that have tweet text and renumber them from 0 in a single step
# (drop=True discards the old index instead of materialising an 'index' column).
df = df[df['SentimentText'].notnull()].reset_index(drop=True)
df.head(5)

Unnamed: 0,Sentiment,SentimentText
0,False,"@jamielewislewis i cant believe it, it really ..."
1,True,having a vodka tonic and looking forward to go...
2,True,@ddlovatofans1neg1 Could you follow me please....
3,True,@jordanknight for once.................. PLEAS...
4,False,Had a dream about a walk in fast food resturau...


In [2]:
# Turn off pandas' chained-assignment (SettingWithCopy) warning for the whole session.
pd.options.mode.chained_assignment = None
import numpy as np # n-dimensional array library; used for the word/tweet vectors below.
from copy import deepcopy
from string import punctuation
from random import shuffle

import gensim
from gensim.models.word2vec import Word2Vec # the word2vec model gensim class
LabeledSentence = gensim.models.doc2vec.LabeledSentence # short alias; labelizeTweets() wraps each tokenised tweet in one

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

In [3]:
from nltk.tokenize import TweetTokenizer # a tweet tokenizer from nltk.
# One shared tokenizer instance, created once and reused by tokenize() for every tweet.
tokenizer = TweetTokenizer()

def tokenize(tweet):
    """Lower-case a tweet and split it into a list of tokens.

    The text is split with the module-level ``tokenizer``. Tokens that start
    with '@', '#' or 'http' are left out of the result.

    Parameters
    ----------
    tweet : str
        Raw tweet text.

    Returns
    -------
    list
        The remaining tokens, in their original order.
    """
    skipped_prefixes = ('@', '#', 'http')
    return [
        token
        for token in tokenizer.tokenize(tweet.lower())
        if not token.startswith(skipped_prefixes)
    ]

In [4]:
def postprocess(data, n=1000000, tokenize_fn=None):
    """Return the first ``n`` rows of ``data`` with an added ``tokens`` column.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a ``SentimentText`` column. It is not modified.
    n : int, optional
        Maximum number of leading rows to keep.
    tokenize_fn : callable, optional
        Function applied to every ``SentimentText`` value to produce its
        token list. Defaults to the notebook's ``tokenize``.

    Returns
    -------
    pandas.DataFrame
        A new frame with the original columns plus ``tokens`` and a fresh
        0..len-1 index.
    """
    if tokenize_fn is None:
        tokenize_fn = tokenize

    # Work on an explicit copy: adding a column to the bare head() slice is what
    # triggers SettingWithCopyWarning when that warning is not silenced.
    data = data.head(n).copy()
    data['tokens'] = data['SentimentText'].map(tokenize_fn)

    # drop=True throws the old index away directly; the previous
    # reset_index() + drop('index') pair raised KeyError for a named index.
    return data.reset_index(drop=True)

# Tokenise the cleaned frame; the cells below read the result through `data`.
data = postprocess(df)

In [5]:
data.head()

Unnamed: 0,Sentiment,SentimentText,tokens
0,False,"@jamielewislewis i cant believe it, it really ...","[i, cant, believe, it, ,, it, really, doesnt, ..."
1,True,having a vodka tonic and looking forward to go...,"[having, a, vodka, tonic, and, looking, forwar..."
2,True,@ddlovatofans1neg1 Could you follow me please....,"[could, you, follow, me, please, ., i, would, ..."
3,True,@jordanknight for once.................. PLEAS...,"[for, once, ..., please, tell, us, why, u, wer..."
4,False,Had a dream about a walk in fast food resturau...,"[had, a, dream, about, a, walk, in, fast, food..."


In [6]:
# Upper bound on the number of rows used; head(n) keeps everything when the frame is shorter.
n=1000000
# 80/20 split of token lists and boolean labels. The fixed random_state makes the
# partition, and therefore every score computed further down, repeatable across runs.
x_train, x_test, y_train, y_test = train_test_split(np.array(data.head(n).tokens),
                                                    np.array(data.head(n).Sentiment),
                                                    test_size=0.2, random_state=7)

In [7]:
def labelizeTweets(tweets, label_type):
    """Wrap every tokenised tweet in a ``LabeledSentence`` carrying one tag.

    Parameters
    ----------
    tweets : iterable
        Token lists, one per tweet.
    label_type : str
        Prefix for the tag; the tag of the i-th tweet is ``'<label_type>_<i>'``.

    Returns
    -------
    list
        One ``LabeledSentence`` per input tweet, in input order.
    """
    return [
        LabeledSentence(words, ['%s_%s' % (label_type, position)])
        for position, words in enumerate(tweets)
    ]

# Rebind x_train / x_test to tagged sentences (tags 'TRAIN_<i>' / 'TEST_<i>').
# NOTE: because the names are reused, running this cell a second time would wrap
# the already-wrapped items again; re-run the split cell first.
x_train = labelizeTweets(x_train, 'TRAIN')
x_test = labelizeTweets(x_test, 'TEST')

In [8]:
x_train[0]

LabeledSentence(words=['im', 'so', 'jealous'], tags=['TRAIN_0'])

In [9]:
# Length of each word vector; buildWordVector() is later called with the same value.
n_dim = 1500
# Token lists of the training tweets, built once and shared by vocabulary building and training.
train_sentences = [sentence.words for sentence in x_train]
tweet_w2v = Word2Vec(size=n_dim, min_count=10)
tweet_w2v.build_vocab(train_sentences)
# Left as the last expression so the cell shows the value returned by train().
tweet_w2v.train(train_sentences, total_examples=tweet_w2v.corpus_count, epochs=tweet_w2v.iter)

46938

In [10]:
print ('building tf-idf matrix ...')
# The tweets are already token lists, so the analyzer returns each one unchanged.
vectorizer = TfidfVectorizer(analyzer=lambda tokens: tokens, min_df=10)
matrix = vectorizer.fit_transform([tweet.words for tweet in x_train])
# Lookup table token -> idf weight, read by buildWordVector().
tfidf = {
    term: weight
    for term, weight in zip(vectorizer.get_feature_names(), vectorizer.idf_)
}
print ('vocab size :', len(tfidf))

building tf-idf matrix ...
vocab size : 310


In [11]:
def buildWordVector(tokens, size, w2v=None, weights=None):
    """Build one tweet vector from weighted word vectors.

    Each token's word vector is multiplied by its weight and the products are
    summed. The sum is then divided by the number of tokens that contributed,
    not by the sum of the weights.

    Parameters
    ----------
    tokens : iterable of str
        Tokens of a single tweet.
    size : int
        Length of each word vector.
    w2v : mapping-like, optional
        Object supporting ``w2v[token]`` that returns a vector of length
        ``size`` and raises ``KeyError`` for unknown tokens. Defaults to the
        notebook-level ``tweet_w2v`` model.
    weights : mapping, optional
        Token -> scalar weight. Defaults to the notebook-level ``tfidf`` dict.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(1, size)``; all zeros when no token is present in
        both ``w2v`` and ``weights``.
    """
    # Fall back to the notebook globals only when nothing was injected, so
    # existing two-argument calls behave exactly as before.
    if w2v is None:
        w2v = tweet_w2v
    if weights is None:
        weights = tfidf

    vec = np.zeros((1, size))
    count = 0
    for word in tokens:
        try:
            weighted = np.asarray(w2v[word]).reshape((1, size)) * weights[word]
        except KeyError:
            # Token missing from the embedding or from the weight table
            # (e.g. unseen at training time): skip it.
            continue
        vec += weighted
        count += 1
    if count:
        vec /= count
    return vec

In [12]:
from sklearn.preprocessing import scale, StandardScaler

# One (1, n_dim) row per tweet, stacked into a 2-D feature matrix.
train_vecs_w2v = np.concatenate([buildWordVector(tweet.words, n_dim) for tweet in x_train])
test_vecs_w2v = np.concatenate([buildWordVector(tweet.words, n_dim) for tweet in x_test])

# Learn mean/variance on the training matrix only and apply the same transform to
# both sets. Standardising the test matrix with its own statistics (scale(test))
# puts it in a different feature space from the one the model is trained on.
w2v_scaler = StandardScaler().fit(train_vecs_w2v)
train_vecs_w2v = w2v_scaler.transform(train_vecs_w2v)
test_vecs_w2v = w2v_scaler.transform(test_vecs_w2v)

In [13]:
from sklearn.metrics import accuracy_score as accuracy
# sklearn.cross_validation is deprecated; the model_selection splitters take
# n_splits and receive the data only when cross_val_score calls split().
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression

# from sklearn.naive_bayes import GaussianNB
# from sklearn.linear_model import SGDClassifier
# from sklearn import linear_model
# from sklearn import tree
# from sklearn import svm
# from sklearn import ensemble
# from sklearn import neighbors

# Feature matrix and labels shared by the model-evaluation cells.
X = train_vecs_w2v
y = y_train

# Seeded so the shuffled folds, and hence the reported score, are repeatable.
kf = KFold(n_splits=10, shuffle=True, random_state=7)
kf2 = StratifiedKFold(n_splits=10, shuffle=True, random_state=7)

# The scorer is ROC AUC, so the printed label names that metric rather than accuracy.
results = cross_val_score(LogisticRegression(), X = X, y = y, scoring = "roc_auc", cv = kf)
print("LogisticRegression ROC AUC: %.3f (%.3f)" % (results.mean(), results.std()))




LogisticRegression Accuracy: 0.737 (0.039)


In [14]:
from sklearn.model_selection import GridSearchCV

# Candidate values for LogisticRegression's C, searched with 10-fold CV on ROC AUC.
param_grid = {'C': [0.001, 0.01, 0.1, 1, 10]}
grid = GridSearchCV(
    estimator=LogisticRegression(),
    param_grid=param_grid,
    scoring="roc_auc",
    cv=10,
)
grid.fit(X, y)
print("Best cross-validation score: ", grid.best_score_)
print("Best parameters: ", grid.best_params_)

Best cross-validation score:  0.734933316527
Best parameters:  {'C': 1}


In [15]:
# from __future__ import print_function
import keras.callbacks as cb
from keras.layers.core import Activation, Dense, Dropout
from keras.models import Sequential

from keras.regularizers import l1, l2
from keras.utils import np_utils

%matplotlib inline
from matplotlib import pyplot as plt
import time

Using TensorFlow backend.


In [16]:
from keras.optimizers import SGD
from keras.optimizers import RMSprop
from keras import metrics

def DefineModel1(input_dim=1500):
    """Build and compile a small feed-forward binary classifier.

    Architecture: Dense(128, relu) -> Dense(32, relu) -> Dense(1, sigmoid),
    compiled with the 'rmsprop' optimizer, binary cross-entropy loss and the
    accuracy metric.

    Parameters
    ----------
    input_dim : int, optional
        Number of input features. The default of 1500 is the value that was
        previously hard-coded, so calling ``DefineModel1()`` with no arguments
        (as KerasClassifier's ``build_fn`` does) is unchanged.

    Returns
    -------
    keras.models.Sequential
        The compiled, untrained model.
    """
    model = Sequential()
    model.add(Dense(128, activation='relu', input_dim=input_dim))
    model.add(Dense(32, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['accuracy'])
    return model

In [17]:
# model = DefineModel1()     
# model.fit(train_vecs_w2v, y_train, epochs=50, batch_size=32, verbose=2)

In [18]:
# sklearn.cross_validation is deprecated; use the model_selection splitters, which
# take n_splits and receive the labels only when cross_val_score calls split().
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score as accuracy
from keras.wrappers.scikit_learn import KerasClassifier
import numpy

# create model
model = KerasClassifier(build_fn=DefineModel1, epochs=50, batch_size=32, verbose=0)
# evaluate using 10-fold cross validation
# kf = KFold(n_splits=10, shuffle=True)
kf2 = StratifiedKFold(n_splits=10, shuffle=True, random_state=7)
results = cross_val_score(model, X, y,scoring = "roc_auc", cv=kf2)
# The scorer is ROC AUC, so the printed label names that metric rather than accuracy.
print("KerasClassifier ROC AUC: %.3f (%.3f)" % (results.mean(), results.std()))

Accuracy: 0.578 (0.038)


# Summary of ROC AUC (mean and standard deviation over 10-fold cross-validation):
- KerasClassifier: 0.578 (0.038)
- LogisticRegression: 0.737 (0.039)