# Classification with Scikit-learn

- Input (x) --> Review (the text of the movie review)
- Output (y) --> Sentiment (`positive` or `negative`)

In [1]:
import pandas as pd

In [3]:
# Load the IMDB reviews CSV into a DataFrame; the path is relative to the
# notebook's working directory.
df_review = pd.read_csv('IMDB Dataset.csv')

In [4]:
# Display the loaded frame (columns: review, sentiment) as a sanity check.
df_review

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative


In [None]:
# Build an imbalanced (9:1) working sample: the first 9000 positive reviews
# and the first 1000 negative reviews, in their original file order.
df_positive = df_review.loc[df_review['sentiment'] == 'positive'].head(9000)
df_negative = df_review.loc[df_review['sentiment'] == 'negative'].head(1000)

In [7]:
# Stack the negative rows on top of the positive rows into one frame.
# The original row labels are kept (no re-indexing).
df_review_des = pd.concat(
    [df_negative, df_positive],
    axis=0,
    ignore_index=False,
)

In [9]:
df_review_des.value_counts('sentiment') # rows per class; 10,000 rows in total

sentiment
positive    9000
negative    1000
Name: count, dtype: int64

## Balance Dataset

In [None]:
from imblearn.under_sampling import RandomUnderSampler

# Undersample the majority class so both sentiments end up with the same
# number of rows. The seed is fixed (matching the one used for the train/test
# split) so the sampled subset, and everything derived from it, is the same on
# every Restart & Run All.
rus = RandomUnderSampler(random_state=42)

# fit_resample takes the features as a 2-D frame, hence the double brackets,
# and returns the resampled features and labels as two separate objects.
features_bal, sentiment_bal = rus.fit_resample(
    df_review_des[['review']],
    df_review_des['sentiment'],
)

# Recombine features and labels into one frame. assign() aligns on the row
# labels, exactly like the column assignment it replaces.
df_review_bal = features_bal.assign(sentiment=sentiment_bal)

In [14]:
# Check the resampling result: both classes should now have the same row count.
df_review_bal.value_counts('sentiment')

sentiment
negative    1000
positive    1000
Name: count, dtype: int64

### Split data into training and test sets

In [17]:
from sklearn.model_selection import train_test_split

# Hold out 33% of the balanced rows for testing; the fixed seed keeps the
# split identical across runs.
train, test = train_test_split(df_review_bal, test_size=0.33, random_state=42)

In [None]:
train # training partition: the remaining 67% of the balanced rows

Unnamed: 0,review,sentiment
144,"I just got back from this free screening, and ...",negative
1839,Claire Denis's movies seem to fall into one of...,negative
3721,"Any story comprises a premise, characters and ...",positive
742,Mario Lewis of the Competitive Enterprise Inst...,negative
15980,It would be unwise to judge that that either n...,positive
...,...,...
4055,I finally got hold of Lifeforce on DVD with th...,positive
5004,"This is a very good, under-rated action/drama/...",positive
1739,"Obviously, the comments above that fawn over t...",negative
9097,This is one of the greatest films ever made. B...,positive


In [None]:
test # test partition: the held-out 33% of the balanced rows

Unnamed: 0,review,sentiment
12858,"I was wandering through my local library, brow...",positive
681,Never saw the original movie in the series...I...,negative
17672,Just finished watching the movie and wanted to...,positive
1821,"At the beginning of the film, you might double...",negative
5728,"I loved it. In fact, I watched it over and ove...",positive
...,...,...
207,"I have seen most, if not all of the Laurel & H...",negative
15798,"As a rule, there are few things more dispiriti...",positive
6142,Despite this production having received a numb...,positive
1051,I rented this movie with my friend for a good ...,negative


In [20]:
# Separate each partition into its input column (review text) and its
# target column (sentiment label).
train_x = train['review']
train_y = train['sentiment']

test_x = test['review']
test_y = test['sentiment']

## Bag of Words

- CountVectorizer: counts how often each word appears in a review
- TF-IDF (TfidfVectorizer): weights each word by how relevant it is to a review relative to the whole corpus - the "better option", used below

In [21]:
# Turn the review text into numeric TF-IDF features.
from sklearn.feature_extraction.text import TfidfVectorizer

# English stop words are dropped from the vocabulary.
tfidf = TfidfVectorizer(stop_words='english')
# The vectorizer is fitted on the training reviews only ...
train_x_vector = tfidf.fit_transform(train_x)
# ... and the test reviews are transformed with that already-fitted vectorizer
# (no re-fit on the test data).
test_x_vector = tfidf.transform(test_x)

In [22]:
# Inspect the vectorized training data (a sparse matrix; its repr shows shape and stored-element count).
train_x_vector

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 115294 stored elements and shape (1340, 20085)>

## Machine Learning Algorithms