# TF-IDF + Logistic Regression

In [1]:
import re
import string
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# Read the review dataset from a CSV file in the notebook's working directory.
dataset_path = 'IMDB_Dataset.csv'
df = pd.read_csv(dataset_path)

In [3]:
# Preview the first five rows to confirm the columns loaded as expected.
df.head(n=5)

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [4]:
# Encode the text labels as integers: negative -> 0, positive -> 1.
label_to_id = {'negative': 0, 'positive': 1}
df['sentiment'] = df['sentiment'].map(label_to_id)

### Preprocessing

In [5]:
# Removing HTML tags such as <br />
def remove_html(text):
    """Return ``text`` with every ``<...>`` tag-like span replaced by one space.

    The match is non-greedy, so each tag is replaced individually, and ``.``
    does not cross newlines, so a span broken across lines is left untouched.
    """
    tag_pattern = r"<.*?>"
    return re.sub(tag_pattern, " ", text)

# Strip tag-like spans from every review.
df['review'] = df['review'].map(remove_html)

In [6]:
# Removing punctuation
def remove_punct(text):
    """Return ``text`` with every character in ``string.punctuation`` deleted.

    Characters are dropped outright rather than replaced by a space, so
    "don't" becomes "dont".
    """
    # A translate table whose values are None deletes the keyed code points.
    drop_table = dict.fromkeys(map(ord, string.punctuation))
    return text.translate(drop_table)

# Delete punctuation characters from every review.
df['review'] = df['review'].map(remove_punct)

In [7]:
# Lowercase every review with the vectorized string accessor.
lowercased_reviews = df['review'].str.lower()
df['review'] = lowercased_reviews

### Splitting into Train and Test Sets

In [8]:
from sklearn.model_selection import train_test_split


# Features are the cleaned review texts; targets are the encoded sentiment labels.
X, y = df['review'].values, df['sentiment'].values

# Hold out 10% of the rows for testing; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=42
)

### Feature Extraction

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Build a TF-IDF vectorizer whose vocabulary is capped at 10,000 terms, and fit it
# on the training reviews only so nothing from the test set influences it.
vectorizer = TfidfVectorizer(max_features=10000)

# Keep the sparse matrix that fit_transform returns. LogisticRegression accepts
# sparse input directly, whereas .toarray() would allocate a dense
# (n_train_docs x 10000) float array, which is mostly zeros, for no benefit.
X_train_tfidf = vectorizer.fit_transform(X_train)

In [10]:
# Project the test reviews onto the already-fitted vocabulary (transform only, no refit).
# The sparse result is kept as-is: the model's predict() accepts sparse input, and
# calling .toarray() here would only inflate memory use.
X_test_tfidf = vectorizer.transform(X_test)

### Training Model

In [11]:
# Train a logistic-regression classifier with scikit-learn's default settings
# on the TF-IDF training features.
model = LogisticRegression()
model.fit(X=X_train_tfidf, y=y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [12]:
# Predict a label for each row of the test TF-IDF features.
predictions = model.predict(X=X_test_tfidf)

In [13]:
# accuracy_score's signature is (y_true, y_pred): ground truth first, predictions second.
# Accuracy is symmetric, so the value is unchanged, but the conventional order keeps
# this call consistent with order-sensitive metrics such as confusion_matrix.
accuracy_score(y_test, predictions)

0.9026