In [None]:
#Import libraries
import numpy as np
import nltk
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt
from pprint import pprint
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn import metrics

sb.set(color_codes=True)
#NLTK resources used further down:
#  wordnet            -> nltk.stem.WordNetLemmatizer
#  punkt / punkt_tab  -> nltk.word_tokenize (which of the two is looked up depends on the NLTK release)
for nltk_resource in ('wordnet', 'punkt', 'punkt_tab'):
    nltk.download(nltk_resource)

In [None]:
#Load the data (tab-separated files expected next to the notebook)
data_train = pd.read_csv("./train.tsv", sep="\t")
data_test = pd.read_csv("./test.tsv", sep="\t")
#Preview the first rows only instead of echoing the whole frame
data_train.head()

In [None]:
#Basic Visualisation of the data
#Fraction of training rows per value of the "Sentiment" column (fractions sum to 1)
dist = data_train.groupby(["Sentiment"]).size()
dist = dist / dist.sum()
fig, ax = plt.subplots(figsize=(12,8))
#x and y must be passed by keyword: seaborn >= 0.12 no longer accepts them positionally
sb.barplot(x=dist.index, y=dist.values, ax=ax)
ax.set(xlabel="Sentiment", ylabel="Fraction of training rows", title="Sentiment distribution (training set)");

In [None]:
#Customized Tokenizer for data preprocessing
def NormalizingTokenizer(doc):
    """Tokenize, lemmatize and negation-mark a document.

    The text is split with ``nltk.word_tokenize`` and every token is passed
    through ``nltk.stem.WordNetLemmatizer().lemmatize``. Tokens up to and
    including the first lemma that is exactly ``'not'`` (case-sensitive
    comparison) are kept unchanged; every lemma after it is prefixed with
    ``'not_'``. If no ``'not'`` occurs, the lemmas are returned unchanged.

    Parameters
    ----------
    doc : str
        Raw text to tokenize.

    Returns
    -------
    list of str
        The lemmatized tokens, negation-marked as described above.
    """
    lemmatizer = nltk.stem.WordNetLemmatizer()
    lemmas = [lemmatizer.lemmatize(token) for token in nltk.word_tokenize(doc)]
    try:
        #Index just past the first 'not'; everything from here on is in negation scope
        scope_start = lemmas.index('not') + 1
    except ValueError:
        #No negation present
        return lemmas
    return lemmas[:scope_start] + ['not_' + lemma for lemma in lemmas[scope_start:]]

In [None]:
#Building Pipelines

#Naive Bayes Classifier: token counts -> TF-IDF transform -> multinomial NB
bayes_clf = Pipeline([
    ('vect', CountVectorizer(tokenizer=NormalizingTokenizer)),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])
#Support Vector Machine Classifier: same features, linear model trained with SGD
#random_state is fixed so that repeated runs give the same model
svm_clf = Pipeline([
    ('vect', CountVectorizer(tokenizer=NormalizingTokenizer)),
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier(max_iter=1000, tol=1e-3, random_state=42)),
])

In [None]:
#We will be doing parameter tuning using grid search
import normalize
#In order to train the model on multiple cores, all functions need to be 'picklable'.
#Because of this the tokenizer is taken from the importable `normalize` module
#NOTE(review): normalize.NormalizingTokenizer is presumably the same function as the one
#defined in this notebook - confirm the two stay in sync

#Naive Bayes Classifier pipeline with imported tokenizer
bayes_clf = Pipeline([
    ('vect', CountVectorizer(tokenizer=normalize.NormalizingTokenizer)),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])
#Support Vector Machine Classifier pipeline with imported tokenizer
#random_state is fixed so that repeated runs (and the grid search) are reproducible
svm_clf = Pipeline([
    ('vect', CountVectorizer(tokenizer=normalize.NormalizingTokenizer)),
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier(max_iter=1000, tol=1e-3, random_state=42)),
])

#Grid for the NB pipeline (keys follow the '<step>__<parameter>' convention):
#  - n-gram ranges (1,1), (1,2) and (1,3), i.e. mono-, bi- and tri-grams
#  - TF-IDF with and without inverse document frequency
#  - classifier alpha of 0.5 or 1.0
bayes_parameters = dict(
    vect__ngram_range=[(1, 1), (1, 2), (1, 3)],
    tfidf__use_idf=(True, False),
    clf__alpha=(0.5, 1.0),
)

#Grid for the SVM pipeline:
#  - same n-gram ranges and use_idf options as above
#  - classifier alpha of 0.01 or 0.001
svm_parameters = dict(
    vect__ngram_range=[(1, 1), (1, 2), (1, 3)],
    tfidf__use_idf=(True, False),
    clf__alpha=(1e-2, 1e-3),
)

#Split the training set: 10% of the rows are held out for validation.
#random_state fixes the shuffle so the same rows are held out on every run
#NOTE(review): features are read from the third column and labels from the last one -
#confirm this matches the layout of train.tsv
X_train, X_validation, Y_train, Y_validation = train_test_split(
    data_train.values[:,2],
    data_train.values[:,-1].astype(int),
    test_size=0.1,
    random_state=42,
)

In [None]:
#Parameter search on NB Classifier for the first 10000 datapoints
#The `iid` argument is not passed: it was removed from GridSearchCV in scikit-learn 0.24
#(passing it raises TypeError) and the old iid=False behaviour is the default since 0.22
gs_bayes = GridSearchCV(bayes_clf, bayes_parameters, cv=5, n_jobs=-1, verbose=10)
gs_bayes.fit(X_train[:10000], Y_train[:10000])
#Fitting the best configuration found to the entire training split
bayes_clf = gs_bayes.best_estimator_
bayes_clf.fit(X_train, Y_train)
#Testing the model on the held-out validation split
predicted = bayes_clf.predict(X_validation)
print('NB Classifier:')
print(metrics.classification_report(Y_validation, predicted))

In [None]:
#Parameter search on SVM Classifier for the first 10000 datapoints
#The `iid` argument is not passed: it was removed from GridSearchCV in scikit-learn 0.24
#(passing it raises TypeError) and the old iid=False behaviour is the default since 0.22
gs_svm = GridSearchCV(svm_clf, svm_parameters, cv=5, n_jobs=-1, verbose=10)
gs_svm.fit(X_train[:10000], Y_train[:10000])
#Fitting the best configuration found to the entire training split
svm_clf = gs_svm.best_estimator_
svm_clf.fit(X_train, Y_train)
#Testing the model on the held-out validation split
predicted = svm_clf.predict(X_validation)
print('SVM Classifier:')
print(metrics.classification_report(Y_validation, predicted))

In [None]:
#Predict the test set with the Naive Bayes pipeline and write the submission file
#NOTE(review): inputs are read from the last column and ids from the first one -
#confirm this matches the layout of test.tsv
X_test = data_test.values[:,-1]
Y_test = bayes_clf.predict(X_test)
phrase_ids = data_test.values[:,0]
#Build the frame column by column so ids and predictions are not stacked into one array
submission = pd.DataFrame({'PhraseId': phrase_ids, 'Sentiment': Y_test})
submission.to_csv('submission.csv', index=False)