In [1]:
# import machine learning libraries
# Data analysis
import pandas as pd
import numpy as np
#data visualization
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
# Machine Learning
from sklearn.svm import SVC
from keras.models import Sequential
from keras.layers.recurrent import LSTM, GRU
from keras.layers.core import Dense, Activation, Dropout
from keras.layers.embeddings import Embedding
from keras.layers.normalization import BatchNormalization
from keras.utils import np_utils
from sklearn import preprocessing, decomposition, model_selection, metrics, pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from keras.layers import GlobalMaxPooling1D, Conv1D, MaxPooling1D, Flatten, Bidirectional, SpatialDropout1D
from keras.preprocessing import sequence, text
from keras.callbacks import EarlyStopping
# natural language toolkit
from nltk import word_tokenize
from nltk.corpus import stopwords
# English stop-word list taken from NLTK's stopwords corpus.
# NOTE(review): not referenced in the visible cells (the vectorizers pass
# stop_words='english' instead) — confirm whether this is still needed.
stop_words = stopwords.words('english')

Using TensorFlow backend.


In [2]:
# Load the labelled training reviews and the unlabelled test reviews.
# Both files are tab-separated with the column names on the first line.
# Paths are relative to the working directory; adjust them to where the data lives.
tsv_options = {"header": 0, "sep": "\t"}
train = pd.read_csv("all/labeledTrainData.tsv", **tsv_options)
test = pd.read_csv("all/testData.tsv", **tsv_options)

In [4]:
# Summarize the training DataFrame: columns, non-null counts, dtypes and memory usage.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25000 entries, 0 to 24999
Data columns (total 3 columns):
id           25000 non-null object
sentiment    25000 non-null int64
review       25000 non-null object
dtypes: int64(1), object(2)
memory usage: 586.0+ KB


In [5]:
# Summarize the test DataFrame: columns, non-null counts, dtypes and memory usage.
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25000 entries, 0 to 24999
Data columns (total 2 columns):
id        25000 non-null object
review    25000 non-null object
dtypes: object(2)
memory usage: 390.7+ KB


In [6]:
# Preview the first five rows of the training data (id, sentiment, review).
train.head()

Unnamed: 0,id,sentiment,review
0,5814_8,1,With all this stuff going down at the moment w...
1,2381_9,1,"\The Classic War of the Worlds\"" by Timothy Hi..."
2,7759_3,0,The film starts with a manager (Nicholas Bell)...
3,3630_4,0,It must be assumed that those who praised this...
4,9495_8,1,Superbly trashy and wondrously unpretentious 8...


In [7]:
# Preview the first five rows of the test data (id, review).
test.head()

Unnamed: 0,id,review
0,12311_10,Naturally in a film who's main themes are of m...
1,8348_2,This movie is a disaster within a disaster fil...
2,5828_4,"All in all, this is a movie for kids. We saw i..."
3,7186_2,Afraid of the Dark left me with the impression...
4,12128_7,A very accurate depiction of small time mob li...


In [8]:
# Discard every row that has a missing value, in both the train and test frames.
train, test = train.dropna(), test.dropna()

In [9]:
# Re-display the first five training rows after dropping rows with missing values.
train.head()

Unnamed: 0,id,sentiment,review
0,5814_8,1,With all this stuff going down at the moment w...
1,2381_9,1,"\The Classic War of the Worlds\"" by Timothy Hi..."
2,7759_3,0,The film starts with a manager (Nicholas Bell)...
3,3630_4,0,It must be assumed that those who praised this...
4,9495_8,1,Superbly trashy and wondrously unpretentious 8...


In [10]:
# Hold out 20% of the labelled reviews for validation (shuffled, fixed seed so the
# split is repeatable).
reviews = train['review'].values
labels = train['sentiment']
xtrain, xvalid, ytrain, yvalid = train_test_split(
    reviews, labels, test_size=0.2, shuffle=True, random_state=42
)
# The test set carries only the review text (it has no sentiment column),
# so there is no ytest to extract.
xtest = test['review']

In [11]:
# Report the number of reviews in each split, in order: train, validation, test.
for split in (xtrain, xvalid, xtest):
    print(split.shape)

(20000,)
(5000,)
(25000,)


In [16]:
# Length (in characters) of the first training review.
len(xtrain[0])

1306

In [12]:
# Start the Kaggle submission frame with the test ids as its only column.
# Double-bracket selection yields a DataFrame directly, and .copy() makes it
# independent of `test`, so adding columns later cannot touch the source frame.
submission = test[['id']].copy()
# Sanity check: the cell displays the type, which should be a DataFrame.
type(submission)

pandas.core.frame.DataFrame

We will use two types of features:
1. word counts as features
2. word counts + TF-IDF weighting as features

In [13]:
# Bag-of-words features: raw counts of word 1- to 3-grams with English stop words removed.
ctv = CountVectorizer(
    analyzer='word',
    token_pattern=r'\w{1,}',
    ngram_range=(1, 3),
    stop_words='english',
)

# Learn the vocabulary from the training and validation reviews together, then encode
# every split as a matrix of per-document term counts.
# Example: vocabulary = [w1, w2, w3, w4], text = [w1, w3, w1]  ->  [2, 0, 1, 0]
# NOTE(review): the vocabulary is built from validation text as well as training text;
# confirm this is intended when interpreting the validation scores.
fit_corpus = list(xtrain) + list(xvalid)
ctv.fit(fit_corpus)
xtrain_ctv, xvalid_ctv, xtest_ctv = (
    ctv.transform(part) for part in (xtrain, xvalid, xtest)
)

In [143]:
# Shapes (documents, vocabulary size) of the count matrices for train, validation and test.
xtrain_ctv.shape,xvalid_ctv.shape,xtest_ctv.shape

((20000, 4354896), (5000, 4354896), (25000, 4354896))

In [144]:
# TF-IDF features over word 1- to 3-grams with English stop words removed.
# min_df=3 restricts the vocabulary to terms that occur in at least three documents.
# The flags are real booleans rather than 1: scikit-learn documents these parameters
# as bool, and newer releases validate parameter types.
tfv = TfidfVectorizer(min_df=3, max_features=None,
            strip_accents='unicode', analyzer='word', token_pattern=r'\w{1,}',
            ngram_range=(1, 3), use_idf=True, smooth_idf=True, sublinear_tf=True,
            stop_words='english')

# Learn the vocabulary and IDF weights from the training and validation reviews together,
# then encode every split as a TF-IDF weighted document-term matrix.
# Only `tfv` is fitted here: the count vectorizer `ctv` was already fitted in its own
# cell, and re-fitting it would only repeat expensive work without affecting these features.
# NOTE(review): the IDF statistics include validation text; confirm this is intended
# when interpreting the validation scores.
tfv.fit(list(xtrain) + list(xvalid))
xtrain_tfv = tfv.transform(xtrain)
xvalid_tfv = tfv.transform(xvalid)
xtest_tfv = tfv.transform(xtest)

In [145]:
# Shapes (documents, vocabulary size) of the TF-IDF matrices for train, validation and test.
# NOTE(review): the saved output lists only two shapes although this expression yields
# three — the output looks stale; re-run the cell to confirm.
xtrain_tfv.shape,xvalid_tfv.shape,xtest_tfv.shape

((20000, 196770), (25000, 196770))

In [146]:
# Multinomial Naive Bayes trained on the raw n-gram counts.
# fit() returns the estimator itself, so construction and training are chained.
clf_nb_counts = MultinomialNB().fit(xtrain_ctv, ytrain)
# Displayed value: score on the held-out validation split.
clf_nb_counts.score(xvalid_ctv, yvalid)

0.8728

In [147]:
# Multinomial Naive Bayes trained on the TF-IDF features.
# fit() returns the estimator itself, so construction and training are chained.
clf_nb_tfidf = MultinomialNB().fit(xtrain_tfv, ytrain)
# Displayed value: score on the held-out validation split.
clf_nb_tfidf.score(xvalid_tfv, yvalid)

0.8814

In [148]:
# Logistic regression (C=1.0) trained on the raw n-gram counts.
# fit() returns the estimator itself, so construction and training are chained.
clf_lr_counts = LogisticRegression(C=1.0).fit(xtrain_ctv, ytrain)
# Displayed value: score on the held-out validation split.
clf_lr_counts.score(xvalid_ctv, yvalid)



0.8922

In [149]:
# Logistic regression (C=1.0) trained on the TF-IDF features.
# fit() returns the estimator itself, so construction and training are chained.
clf_lr_tfidf = LogisticRegression(C=1.0).fit(xtrain_tfv, ytrain)
# Displayed value: score on the held-out validation split.
clf_lr_tfidf.score(xvalid_tfv, yvalid)

0.8926

In [150]:
# Predict sentiment labels for the test reviews using the logistic-regression model
# that was trained on TF-IDF features.
target = clf_lr_tfidf.predict(xtest_tfv)

In [151]:
# Display the predicted labels (the array repr is abbreviated in the output).
target

array([1, 0, 1, ..., 1, 1, 1])

In [152]:
# Attach the predicted labels to the submission frame as its 'sentiment' column.
submission = submission.assign(sentiment=target)

In [153]:
# Preview the first five rows of the submission frame (id, sentiment).
submission.head()

Unnamed: 0,id,sentiment
0,12311_10,1
1,8348_2,0
2,5828_4,1
3,7186_2,1
4,12128_7,1


In [154]:
# Summarize the submission DataFrame: columns, non-null counts, dtypes and memory usage.
submission.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 25000 entries, 0 to 24999
Data columns (total 2 columns):
id           25000 non-null object
sentiment    25000 non-null int64
dtypes: int64(1), object(1)
memory usage: 585.9+ KB


In [155]:
# Make 'id' the index so the CSV written afterwards holds only the id and sentiment
# columns, with no extra positional-index column, as the Kaggle submission expects.
# The guard keeps this cell safe to re-run: once 'id' is the index it is no longer a
# column, and calling set_index('id') a second time would raise KeyError.
if 'id' in submission.columns:
    submission = submission.set_index('id')

In [156]:
# Write the submission frame to 'target.csv'; the index is written out as the leading column.
submission.to_csv('target.csv', index=True)