# Classification Scikit-learn

- Input (x) --> Review
- Output (y) --> Sentiment

In [3]:
import pandas as pd

In [4]:
# Load the IMDB review dataset; the path is relative to the notebook's working directory.
df_review = pd.read_csv('IMDB Dataset.csv')

In [5]:
# Preview the loaded frame (the notebook renders a truncated view of the rows).
df_review

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative


In [6]:
# Build an imbalanced working subset: the first 9,000 positive reviews and
# the first 1,000 negative reviews, in their original file order.
df_positive = df_review.loc[df_review['sentiment'] == 'positive'].head(9000)
df_negative = df_review.loc[df_review['sentiment'] == 'negative'].head(1000)

In [7]:
# Stack the two subsets row-wise (negatives first); the original row labels are kept.
df_review_des = pd.concat(objs=[df_negative, df_positive], axis=0)

In [8]:
df_review_des.value_counts('sentiment') # class counts: 10,000 rows in total (9,000 positive + 1,000 negative)

sentiment
positive    9000
negative    1000
Name: count, dtype: int64

## Balance Dataset

In [9]:
from imblearn.under_sampling import RandomUnderSampler

# Randomly drop majority-class rows until both classes have the same count.
# A fixed seed keeps the sampled subset (and every metric computed from it)
# identical across kernel restarts.
rus = RandomUnderSampler(random_state=42)
df_review_bal, sentiment_bal = rus.fit_resample(
    df_review_des[['review']], df_review_des['sentiment']
)
# fit_resample returns the features and the labels separately; re-attach the
# labels so the balanced data lives in a single DataFrame.
df_review_bal['sentiment'] = sentiment_bal

In [10]:
# Check the class counts after under-sampling.
df_review_bal.value_counts('sentiment')

sentiment
negative    1000
positive    1000
Name: count, dtype: int64

### Split data into training and test sets

In [11]:
from sklearn.model_selection import train_test_split

train, test = train_test_split(
    df_review_bal,
    test_size=0.33,   # hold out 33% of the rows for evaluation
    random_state=42,  # fixed seed so the split is reproducible
)

In [12]:
train # training split: the remaining 67% of the balanced data

Unnamed: 0,review,sentiment
144,"I just got back from this free screening, and ...",negative
1839,Claire Denis's movies seem to fall into one of...,negative
9298,"We saw the silent version of this film, and it...",positive
742,Mario Lewis of the Competitive Enterprise Inst...,negative
6818,"In one of the better movies of the year, Tom H...",positive
...,...,...
1849,If you speak French or can put up with sub-tit...,positive
2534,The year 1983 saw a strange phenomenon; two ri...,positive
1739,"Obviously, the comments above that fawn over t...",negative
11604,The first bottom movie was an absolute laugh f...,positive


In [13]:
test # held-out test split: 33% of the balanced data

Unnamed: 0,review,sentiment
8152,"In a future society, the military component do...",positive
681,Never saw the original movie in the series...I...,negative
17622,Clint Eastwood returns as Dirty Harry Calahan ...,positive
1821,"At the beginning of the film, you might double...",negative
273,Fulci... Does this man brings one of the gorie...,positive
...,...,...
207,"I have seen most, if not all of the Laurel & H...",negative
17979,Insanely well crafted mini-series.<br /><br />...,positive
6044,We loved this movie because it was so entertai...,positive
1051,I rented this movie with my friend for a good ...,negative


In [14]:
# Separate the input text (x) from the target label (y) for each split.
train_x = train['review']
train_y = train['sentiment']
test_x = test['review']
test_y = test['sentiment']

## Bag of Words

- CountVectorizer: frequency with which a word appears
- TF-IDF (`TfidfVectorizer`): relevance of a word in a sentence - "better option"

In [15]:
# transform text data to numeric data
from sklearn.feature_extraction.text import TfidfVectorizer

# Ignore common English stop words when building the vocabulary.
tfidf = TfidfVectorizer(stop_words='english')
# Learn the vocabulary and IDF weights from the training reviews and vectorize them.
train_x_vector = tfidf.fit_transform(train_x)
# Vectorize the test reviews with the already-fitted vectorizer (transform only, no refit).
test_x_vector = tfidf.transform(test_x)

In [16]:
# Inspect the vectorized training data (a SciPy sparse matrix).
train_x_vector

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 116318 stored elements and shape (1340, 20378)>

## Machine Learning Algorithms

- Select model:
    1. Supervised Learning: Regression (numeric output), Classification (discrete output)
        - Input: Review
        - Output: Sentiment (discrete)
    2. Unsupervised Learning

### Support Vector Machines (SVM)

In [17]:
from sklearn.svm import SVC

# Support vector classifier with a linear kernel, trained on the TF-IDF features.
svc = SVC(kernel='linear')
svc.fit(train_x_vector, train_y)

In [20]:
# testing: spot-check the SVM on a few hand-written reviews
for sample_review in (
    'A good movie',
    'An excellent movie',
    '"I did not like this movie at all I gave this movie',
):
    # Each review is vectorized with the fitted TF-IDF model before prediction.
    print(svc.predict(tfidf.transform([sample_review])))

['positive']
['positive']
['negative']


### Decision Tree

In [21]:
from sklearn.tree import DecisionTreeClassifier

# Seed the tree: scikit-learn randomly permutes the candidate features at each
# split, so without a fixed random_state the fitted tree (and its test score)
# can change from run to run.
dec_tree = DecisionTreeClassifier(random_state=42)
dec_tree.fit(train_x_vector, train_y)

### Naive Bayes

In [23]:
from sklearn.naive_bayes import GaussianNB

gnb = GaussianNB()
# The sparse TF-IDF matrix is converted to a dense array before fitting
# (GaussianNB expects dense input); this materializes the full
# n_samples x n_features matrix in memory.
gnb.fit(train_x_vector.toarray(), train_y)

### Logistic Regression

In [24]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with scikit-learn's default settings, trained on the TF-IDF features.
lr = LogisticRegression()
lr.fit(train_x_vector, train_y)

## MODEL EVALUATION

### Calculate Score

In [None]:
# (fitted model, test features) pairs, listed in the order the scores are printed.
evaluations = (
    (svc, test_x_vector),             # best model
    (dec_tree, test_x_vector),
    (gnb, test_x_vector.toarray()),   # GaussianNB was fitted on dense arrays
    (lr, test_x_vector),
)
for model, features in evaluations:
    # .score() returns the mean accuracy on the held-out test split.
    print(model.score(features, test_y))

0.8424242424242424
0.6727272727272727
0.6166666666666667
0.8257575757575758


### F1 score

F1 score = 2*(Recall * Precision) / (Recall + Precision)

In [29]:
from sklearn.metrics import f1_score

svc_test_pred = svc.predict(test_x_vector)
f1_score(
    test_y,
    svc_test_pred,
    labels=['positive', 'negative'],
    average=None,  # one F1 value per label, in the order given by `labels`
)

array([0.84569733, 0.83900929])

### Classification Report

In [31]:
from sklearn.metrics import classification_report

# Per-class precision, recall and F1 for the SVM on the held-out test split.
svc_report = classification_report(
    test_y,
    svc.predict(test_x_vector),
    labels=['positive', 'negative'],
)
print(svc_report)

              precision    recall  f1-score   support

    positive       0.84      0.85      0.85       335
    negative       0.84      0.83      0.84       325

    accuracy                           0.84       660
   macro avg       0.84      0.84      0.84       660
weighted avg       0.84      0.84      0.84       660



### Confusion Matrix

In [33]:
from sklearn.metrics import confusion_matrix

# Rows are the true labels and columns the predicted labels, both ordered
# as given in `labels` (positive, negative).
svc_confusion = confusion_matrix(
    test_y,
    svc.predict(test_x_vector),
    labels=['positive', 'negative'],
)
print(svc_confusion)

[[285  50]
 [ 54 271]]
