## Step 1: Cleaning

In [1]:
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords 

# Stop words removed while cleaning a review.
# Built once at module level as a frozenset so each membership test is O(1)
# instead of rebuilding and scanning a list on every call.
# NOTE: entries containing an apostrophe ("you're", "it's", ...) can never match,
# because the r'\w+' tokenizer below never yields a token with an apostrophe in it;
# they are kept only so the set mirrors the original list.
# NOTE(review): the list contains no 'not' / 'no' / 'nor' -- presumably on purpose,
# so that negations survive as features; confirm before "completing" it.
_EN_STOPWORDS = frozenset([
    'i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you',
    "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself',
    'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her',
    'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them',
    'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom',
    'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was',
    'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do',
    'does', 'did', 'doing', 'a', 'an', 'the', 'until', 'while', 'of', 'at',
    'by', 'for', 'with', 'about', 'into', 'through', 'during', 'before',
    'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out',
    'on', 'again', 'further', 'then', 'once', 'here', 'there', 'when',
    'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more',
    'most', 'other', 'some', 'such', 'own', 'same', 'so', 'than', 'too',
    'very', 's', 't', 'can', 'will', 'just', 'should', "should've", 'now',
    'd', 'll', 'm', 'o', 're', 've', 'y', 'ain', 'ma', 'shan', "shan't",
])

# Shared helper objects: created once instead of once per review.
_TOKENIZER = RegexpTokenizer(r'\w+')
_LEMMATIZER = WordNetLemmatizer()


def getCleanText(text):
    """Normalize one raw review into a space-separated string of lemmas.

    Steps, in order:
      1. lowercase the text;
      2. replace every ``<br />`` tag with a space;
      3. split into runs of word characters (punctuation is discarded);
      4. drop tokens found in ``_EN_STOPWORDS``;
      5. lemmatize each remaining token with ``WordNetLemmatizer.lemmatize``
         called with its default arguments;
      6. join the lemmas with single spaces.

    Parameters
    ----------
    text : str
        Raw review text.

    Returns
    -------
    str
        The cleaned review; an empty string if no token survives.
    """
    # Lowercase first so that upper-case variants of the tag are caught as well.
    # Replacing the single tag (rather than only the doubled "<br /><br />")
    # keeps a lone "<br />" from leaking a spurious "br" token into the features.
    text = text.lower().replace("<br />", " ")

    tokens = _TOKENIZER.tokenize(text)
    kept_tokens = [token for token in tokens if token not in _EN_STOPWORDS]
    lemmas = [_LEMMATIZER.lemmatize(token) for token in kept_tokens]
    return ' '.join(lemmas)

In [2]:
# Load the labelled training set from disk and report its type and shape.
import pandas as pd
import numpy as np

df = pd.read_csv("Train.csv")
print(type(df), df.shape)

<class 'pandas.core.frame.DataFrame'> (40000, 2)


In [3]:
# Convert the frame to a 2-D NumPy array, then split it by column position:
# column 0 becomes X_train and column 1 becomes Y_train.
train_data = df.to_numpy()
print(type(train_data), train_data.shape)

X_train, Y_train = train_data[:, 0], train_data[:, 1]
print(X_train.shape, Y_train.shape)

<class 'numpy.ndarray'> (40000, 2)
(40000,) (40000,)


In [4]:
# Run the cleaning function over every element of X_train.
X_clean = list(map(getCleanText, X_train))

In [5]:
# Load the test set.
# NOTE: this rebinds `df`, so the training frame is no longer reachable by that name.
df = pd.read_csv("Test.csv")
print(type(df), df.shape)

<class 'pandas.core.frame.DataFrame'> (10000, 1)


In [6]:
# Flatten the frame's values into a 1-D array, then clean each element
# with the same function that was applied to the training data.
X_test = df.to_numpy().reshape(-1)
X_test_clean = list(map(getCleanText, X_test))

## Step 2: Vectorization

In [7]:
# Fit a TF-IDF vectorizer on the cleaned training reviews, using n-grams of
# length 1 to 3, and keep the resulting document-term matrix.
from sklearn.feature_extraction.text import TfidfVectorizer

cv = TfidfVectorizer(ngram_range=(1, 3))
x_vec = cv.fit_transform(X_clean)
print(x_vec.shape)

(40000, 7050320)


In [8]:
# Project the cleaned test reviews with the already-fitted vectorizer
# (transform only, no refit).
xt_vec = cv.transform(X_test_clean)
print(xt_vec.shape)

(10000, 7050320)


## Step 3: Train Model

In [9]:
# Fit a multinomial Naive Bayes classifier (smoothing alpha=0.01) on the
# training matrix, then predict labels for the test matrix.
from sklearn.naive_bayes import MultinomialNB

mnb = MultinomialNB(alpha=0.01).fit(x_vec, Y_train)
yt_pred = mnb.predict(xt_vec)

In [10]:
# Peek at the first ten test predictions.
first_predictions = yt_pred[:10]
print(first_predictions)

['neg' 'neg' 'neg' 'pos' 'pos' 'neg' 'pos' 'neg' 'pos' 'pos']


In [11]:
# Predict on x_vec and print the fraction of predictions that equal Y_train.
y_pred = mnb.predict(x_vec)
train_accuracy = np.mean(Y_train == y_pred)
print(train_accuracy)

1.0


In [12]:
# Build the submission table: one 'Label' column holding the test predictions,
# with the row index named 'Id', and write it (index included) to submit.csv.
ans = pd.DataFrame({'Label': yt_pred}).rename_axis('Id')
ans.to_csv('submit.csv', index=True)