# MovieReviewClassification

In [1]:
import numpy as np
import pandas as pd 
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier

In [2]:
train= pd.read_table('train.tsv')

In [3]:
train.head(4)

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,1,1,A series of escapades demonstrating the adage ...,1
1,2,1,A series of escapades demonstrating the adage ...,2
2,3,1,A series,2
3,4,1,A,2


In [4]:
test= pd.read_table('test.tsv')

In [5]:
test.head(n=4)

Unnamed: 0,PhraseId,SentenceId,Phrase
0,156061,8545,An intermittently pleasing but mostly routine ...
1,156062,8545,An intermittently pleasing but mostly routine ...
2,156063,8545,An
3,156064,8545,intermittently pleasing but mostly routine effort


In [6]:
train['Sentiment'].unique()

array([1, 2, 3, 4, 0])

In [7]:
train['Sentiment'].value_counts()

2    79582
3    32927
1    27273
4     9206
0     7072
Name: Sentiment, dtype: int64

In [8]:
train['SentenceId'].unique()

array([   1,    2,    3, ..., 8542, 8543, 8544])

In [9]:
train['SentenceId'].value_counts()

1       63
5555    63
509     59
625     58
403     57
22      56
149     56
240     55
7705    54
128     54
817     54
2124    53
82      53
301     53
595     52
654     52
3635    51
635     51
217     51
3023    51
4563    50
1032    50
293     50
113     50
152     50
1020    49
1152    49
3010    49
3084    49
702     49
        ..
903      1
6335     1
7572     1
2887     1
7227     1
5997     1
7904     1
7906     1
6733     1
6774     1
7508     1
5518     1
8477     1
4516     1
1710     1
7330     1
6980     1
6282     1
325      1
4579     1
4982     1
3948     1
4486     1
5679     1
1879     1
8451     1
5718     1
3560     1
1666     1
1178     1
Name: SentenceId, dtype: int64

### lets split the train data into dependent and Independent Columns 

In [10]:
y= train['Sentiment']

In [11]:
X= train['Phrase']

In [12]:
y.shape

(156060,)

In [13]:
X.shape

(156060,)

### Spliting the data into train/test

In [14]:
X_train, X_test, y_train, y_test=train_test_split(X, y, test_size=0.25, random_state=42)

In [15]:
vect= CountVectorizer()

In [16]:
X_train_t= vect.fit_transform(X_train)

In [17]:
X_train_t

<117045x15224 sparse matrix of type '<class 'numpy.int64'>'
	with 730191 stored elements in Compressed Sparse Row format>

In [18]:
X_test_t= vect.transform(X_test)

### Building Models 

### 1.Naive Bayes 

In [19]:
clf = MultinomialNB()

In [20]:
clf.fit(X_train_t, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [21]:
y_pred= clf.predict(X_test_t)

In [22]:
clf.score(X_test_t, y_test)

0.6105856721773677

In [23]:
confusion_matrix(y_test, y_pred)

array([[  517,   860,   314,    45,     4],
       [  476,  3030,  2960,   359,    29],
       [  151,  1859, 15315,  2125,   185],
       [   21,   349,  3232,  4173,   609],
       [    2,    36,   356,  1221,   787]])

In [24]:
print(classification_report(y_test, y_pred))

             precision    recall  f1-score   support

          0       0.44      0.30      0.36      1740
          1       0.49      0.44      0.47      6854
          2       0.69      0.78      0.73     19635
          3       0.53      0.50      0.51      8384
          4       0.49      0.33      0.39      2402

avg / total       0.60      0.61      0.60     39015



### 2. LogisticRegression

In [25]:
log= LogisticRegression()

In [26]:
log.fit(X_train_t, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [27]:
y_pred1= log.predict(X_test_t)

In [28]:
log.score(X_test_t,y_test)

0.63801102140202481

** Lets do some parameter tuning on LogisticRegression**

In [29]:
param_grid= {'penalty':['l1','l2'],
            'C':[0.001,0.01,0.1,1,10]}

In [30]:
log_cv = GridSearchCV(log, param_grid)

In [31]:
log_cv.fit(X_train_t, y_train)

GridSearchCV(cv=None, error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'penalty': ['l1', 'l2'], 'C': [0.001, 0.01, 0.1, 1, 10]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [32]:
y_pred2= log_cv.predict(X_test_t)

In [33]:
log_cv.score(X_test_t, y_test)

0.63744713571703193

In [34]:
log_cv.best_params_

{'C': 1, 'penalty': 'l1'}

### 3. SVM

In [35]:
clf1=SGDClassifier()

In [36]:
clf1.fit(X_train_t, y_train)

SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', n_iter=5, n_jobs=1,
       penalty='l2', power_t=0.5, random_state=None, shuffle=True,
       verbose=0, warm_start=False)

In [37]:
y_pred3= clf1.predict(X_test_t)

In [38]:
clf1.score(X_test_t, y_test)

0.61081635268486478

In [39]:
confusion_matrix(y_test, y_pred3)

array([[  360,   720,   588,    58,    14],
       [  209,  1995,  4312,   291,    47],
       [   40,   848, 17811,   856,    80],
       [    9,   271,  4679,  2965,   460],
       [    3,    38,   553,  1108,   700]])

In [40]:
print(classification_report(y_test, y_pred3))

             precision    recall  f1-score   support

          0       0.58      0.21      0.30      1740
          1       0.52      0.29      0.37      6854
          2       0.64      0.91      0.75     19635
          3       0.56      0.35      0.43      8384
          4       0.54      0.29      0.38      2402

avg / total       0.59      0.61      0.57     39015



### Predicting on test data set 

In [41]:
test1= test['Phrase']

In [42]:
test1.shape

(66292,)

In [43]:
test1_t= vect.transform(test1)

In [44]:
new_prediction= log.predict(test1_t)

In [45]:
new_prediction

array([3, 3, 2, ..., 1, 1, 2])

In [46]:
submission = pd.DataFrame({'PhraseId': test['PhraseId'],
                            'Sentiment':new_prediction})

In [47]:
submission.head(n=10)

Unnamed: 0,PhraseId,Sentiment
0,156061,3
1,156062,3
2,156063,2
3,156064,3
4,156065,3
5,156066,3
6,156067,3
7,156068,2
8,156069,3
9,156070,2


### Conclusion:

** Here 0- bad review, 1- some what bad, 2- Average, 3- good, 4- very good **