# Classification Scikit-learn

- Input (x) --> Review
- Output (y) --> Sentiment

In [34]:
import pandas as pd

In [35]:
# Load the IMDB review dataset. The path is relative, so the CSV must sit in
# the notebook's working directory.
df_review = pd.read_csv('IMDB Dataset.csv')

In [36]:
# show the loaded frame (pandas elides the middle rows in the rendered output)
df_review

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative


In [37]:
# build a deliberately imbalanced subset: the first 9000 positive reviews
# and the first 1000 negative reviews, in their original file order
is_positive = df_review['sentiment'] == 'positive'
is_negative = df_review['sentiment'] == 'negative'
df_positive = df_review.loc[is_positive].iloc[:9000]
df_negative = df_review.loc[is_negative].iloc[:1000]

In [38]:
# stack the negative subset on top of the positive one; original row labels are kept
df_review_des = pd.concat(objs=[df_negative, df_positive], axis=0)

In [39]:
df_review_des.value_counts('sentiment') # 10,000 rows in total: 9,000 positive and 1,000 negative

sentiment
positive    9000
negative    1000
Name: count, dtype: int64

## Balance Dataset

In [40]:
from imblearn.under_sampling import RandomUnderSampler

# Randomly drop rows from the majority class until both sentiments have the
# same number of reviews. A fixed random_state makes the sampled rows (and
# therefore every downstream split, model and score) identical across
# kernel restarts.
rus = RandomUnderSampler(random_state=42)
df_review_bal, sentiment_bal = rus.fit_resample(
    df_review_des[['review']], df_review_des['sentiment']
)
# re-attach the resampled labels to the resampled reviews as a regular column
df_review_bal['sentiment'] = sentiment_bal

In [41]:
# confirm that both classes now have the same number of rows
df_review_bal.value_counts('sentiment')

sentiment
negative    1000
positive    1000
Name: count, dtype: int64

### Separate data for training

In [42]:
from sklearn.model_selection import train_test_split

# hold out 33% of the balanced rows for evaluation; the seed fixes the shuffle
train, test = train_test_split(
    df_review_bal,
    random_state=42,
    test_size=0.33,
)

In [43]:
train # training split: the remaining 67% of the balanced rows

Unnamed: 0,review,sentiment
144,"I just got back from this free screening, and ...",negative
1839,Claire Denis's movies seem to fall into one of...,negative
8445,"I'm glad I read the Sarah Waters novel first, ...",positive
742,Mario Lewis of the Competitive Enterprise Inst...,negative
9671,Director John Schlesinger's tense and frantic ...,positive
...,...,...
2604,Crackerjack is another classic Aussie film. As...,positive
6424,"this was the most costly film, when produced. ...",positive
1739,"Obviously, the comments above that fawn over t...",negative
3546,I've been writing hardboiled crime fiction for...,positive


In [44]:
test # held-out split: 33% of the balanced rows

Unnamed: 0,review,sentiment
12516,In 1967 I visited the Lake Elsinore glider-por...,positive
681,Never saw the original movie in the series...I...,negative
45,"As a disclaimer, I've seen the movie 5-6 times...",positive
1821,"At the beginning of the film, you might double...",negative
3670,too bad they showed palm trees that could not ...,positive
...,...,...
207,"I have seen most, if not all of the Laurel & H...",negative
5413,On paper this looked like a great concept: Ave...,positive
15522,"The plot:Kurt Harris (Jeff Wincott), a bitter,...",positive
1051,I rented this movie with my friend for a good ...,negative


In [45]:
# separate each split into model input (review text) and target (sentiment label)
train_x = train['review']
train_y = train['sentiment']
test_x = test['review']
test_y = test['sentiment']

## Bag of Words

- CountVectorizer: frequency with which a word appears
- TF-IDF: relevance of a word in a document relative to the whole corpus - "better option"

In [46]:
# transform text data to numeric data
from sklearn.feature_extraction.text import TfidfVectorizer

# English stop words are removed before weighting.
tfidf = TfidfVectorizer(stop_words='english')
# The vocabulary and IDF weights are learned from the training reviews only ...
train_x_vector = tfidf.fit_transform(train_x)
# ... and the test reviews are only projected onto that fitted vocabulary,
# so no information from the test split leaks into the features.
test_x_vector = tfidf.transform(test_x)

In [47]:
# inspect the shape and sparsity of the training feature matrix
train_x_vector

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 115420 stored elements and shape (1340, 20065)>

## Machine Learning Algorithms

- Select model:
    1. Supervised Learning: Regression (numeric output), Classification (discrete output)
        - Input: Review
        - Output: Sentiment (discrete)
    2. Unsupervised Learning

### Support Vector Machines (SVM)

In [48]:
from sklearn.svm import SVC

# Support vector classifier with a linear kernel, trained on the TF-IDF
# vectors of the training reviews and their sentiment labels.
svc = SVC(kernel='linear')
svc.fit(train_x_vector, train_y)

In [49]:
# quick sanity check: classify a few hand-written reviews with the fitted SVC
sample_reviews = [
    'A good movie',
    'An excellent movie',
    '"I did not like this movie at all I gave this movie',
]
for sample in sample_reviews:
    # new text has to go through the same fitted vectorizer as the training data
    print(svc.predict(tfidf.transform([sample])))

['positive']
['positive']
['negative']


### Decision Tree

In [50]:
from sklearn.tree import DecisionTreeClassifier

# Decision tree trained on the TF-IDF training vectors. The tree builder
# makes random choices between candidate splits, so random_state is fixed to
# get the same tree (and the same score) on every run.
dec_tree = DecisionTreeClassifier(random_state=42)
dec_tree.fit(train_x_vector, train_y)

### Naive Bayes

In [51]:
from sklearn.naive_bayes import GaussianNB

# Gaussian Naive Bayes is fitted on a dense copy of the TF-IDF matrix
# (.toarray()); the same conversion must be applied to any data scored later.
# NOTE(review): densifying allocates rows x vocabulary floats, which grows
# quickly with more reviews; confirm this is acceptable before scaling up.
gnb = GaussianNB()
gnb.fit(train_x_vector.toarray(), train_y)

### Logistic Regression

In [52]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with scikit-learn's default settings, trained on the
# sparse TF-IDF training vectors.
lr = LogisticRegression()
lr.fit(train_x_vector, train_y)

## MODEL EVALUATION

### Calculate Score

In [53]:
# Score every fitted model on the held-out split, printed in the order
# SVC, decision tree, Gaussian NB, logistic regression. The first entry (SVC)
# is the best model. GaussianNB was trained on dense input, so it is scored
# on a dense copy of the test matrix.
evaluations = [
    (svc, test_x_vector),
    (dec_tree, test_x_vector),
    (gnb, test_x_vector.toarray()),
    (lr, test_x_vector),
]
for model, features in evaluations:
    print(model.score(features, test_y))

0.8242424242424242
0.6484848484848484
0.6287878787878788
0.8181818181818182


### F1 score

F1 score = 2*(Recall * Precision) / (Recall + Precision)

In [54]:
from sklearn.metrics import f1_score

# per-class F1 of the SVC on the held-out split, returned in the order given by `labels`
svc_test_pred = svc.predict(test_x_vector)
f1_score(y_true=test_y, y_pred=svc_test_pred, labels=['positive', 'negative'], average=None)

array([0.82890855, 0.81931464])

### Classification Report

In [55]:
from sklearn.metrics import classification_report

# precision / recall / F1 / support per class for the SVC on the held-out split
svc_test_pred = svc.predict(test_x_vector)
svc_report = classification_report(
    y_true=test_y,
    y_pred=svc_test_pred,
    labels=['positive', 'negative'],
)
print(svc_report)

              precision    recall  f1-score   support

    positive       0.82      0.84      0.83       335
    negative       0.83      0.81      0.82       325

    accuracy                           0.82       660
   macro avg       0.82      0.82      0.82       660
weighted avg       0.82      0.82      0.82       660



### Confusion Matrix

In [56]:
from sklearn.metrics import confusion_matrix

# rows are true labels, columns are predicted labels, both ordered as in `labels`
svc_test_pred = svc.predict(test_x_vector)
svc_confusion = confusion_matrix(
    y_true=test_y,
    y_pred=svc_test_pred,
    labels=['positive', 'negative'],
)
print(svc_confusion)

[[281  54]
 [ 62 263]]


## Model Optimization

### GridSearchCV

In [58]:
from sklearn.model_selection import GridSearchCV

# Candidate hyper-parameters: regularisation strength C and kernel type.
parameters = {'C':[1,4,8,16,32], 'kernel':['linear', 'rbf']}
# The base estimator is passed inline instead of being bound to `svc`, so the
# already-fitted linear SVC used by the prediction and evaluation cells is
# not replaced by an unfitted one (which would break those cells on re-run).
# Every parameter combination is evaluated with 5-fold cross-validation on
# the training vectors only; the test split is not touched here.
svc_grid = GridSearchCV(SVC(), parameters, cv=5)
svc_grid.fit(train_x_vector, train_y)

In [59]:
# show the best estimator found by the search and the parameter combination behind it
print(svc_grid.best_estimator_)
print(svc_grid.best_params_)

SVC(C=4)
{'C': 4, 'kernel': 'rbf'}


In [61]:
# best mean cross-validation score from the search (this is not the test-set score)
svc_grid.best_score_

np.float64(0.8261194029850746)