## Sentiment analysis of IMDB movie reviews

In [55]:
import pandas as pd
import re
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
nltk.download('stopwords')
nltk.download('punkt')
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/kseniaksenofontova/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/kseniaksenofontova/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [36]:
df_texts = pd.Series(pd.read_fwf('train.texts', sep="\n", header=None)[0], name='review')
df_labels = pd.Series(pd.read_fwf('train.labels', sep="\n", header=None)[0], name='sentiment')
df = pd.concat([df_texts, df_labels], axis=1)
df_x_test = pd.read_csv('texts.csv')['texts'].to_frame()

df.replace("pos", 1, inplace=True)
df.replace("neg", 0, inplace=True)

df.head()

Unnamed: 0,review,sentiment
0,If the myth regarding broken mirrors would be ...,0
1,I gave this movie a 10 because it needed to be...,1
2,After watching the first 20mn of Blanche(sorry...,0
3,"Weak plot, unlikely car malfunction, and helpl...",0
4,Where the Sidewalk Ends (1950)<br /><br />Wher...,1


In [9]:
df['sentiment'].value_counts()

1    7520
0    7480
Name: sentiment, dtype: int64

Text processing

In [25]:
def data_processing(text):
    text= text.lower()
    text = re.sub('<br />', '', text)
    text = re.sub(r"https\S+|www\S+|http\S+", '', text, flags = re.MULTILINE)
    text = re.sub(r'\@w+|\#', '', text)
    text = re.sub(r'[^\w\s]', '', text)
    text_tokens = word_tokenize(text)
    filtered_text = [w for w in text_tokens if w not in stop_words]
    return " ".join(filtered_text)

In [37]:
df.review = df['review'].apply(data_processing)
df_x_test.texts = df_x_test['texts'].apply(data_processing)

In [38]:
df = df.drop_duplicates('review')

In [40]:
df

Unnamed: 0,review,sentiment
0,myth regarding broken mirrors would accurate e...,0
1,gave movie 10 needed rewarded scary elements a...,1
2,watching first 20mn blanchesorry couldnt take ...,0
3,weak plot unlikely car malfunction helpless fu...,0
4,sidewalk ends 1950where one ends another begin...,1
...,...,...
14995,rented movie roughly 45 years ago instantly di...,0
14996,put aside dr house repeat missed desperate hou...,0
14997,fans goremeister herschell gordon lewis look e...,0
14998,create reality core question behind highly ori...,1


Text stemming

In [41]:
stemmer = PorterStemmer()
def stemming(data):
    text = ' '.join([stemmer.stem(word) for word in data.split()])
    return text

In [42]:
df['review'] = df['review'].apply(lambda x: stemming(x))
df_x_test['texts'] = df_x_test['texts'].apply(lambda x: stemming(x))

In [47]:
df.head()

Unnamed: 0,review,sentiment
0,myth regard broken mirror would accur everybod...,0
1,gave movi 10 need reward scari element actor g...,1
2,watch first 20mn blanchesorri couldnt take con...,0
3,weak plot unlik car malfunct helpless fumbl ch...,0
4,sidewalk end 1950where one end anoth beginsthi...,1


Term Frequency-Inverse Document Frequency model (TFIDF) vectorization

In [51]:
x = df['review']
y = df['sentiment']
x_test_final = df_x_test['texts']

vect = TfidfVectorizer(max_df=0.8, ngram_range=(1,3))
x = vect.fit_transform(x)
x_test_final = vect.transform(x_test_final)

In [53]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [54]:
print("Size of x_train: ", (x_train.shape))
print("Size of y_train: ", (y_train.shape))
print("Size of x_test: ", (x_test.shape))
print("Size of y_test: ", (y_test.shape))

Size of x_train:  (11952, 2788359)
Size of y_train:  (11952,)
Size of x_test:  (2988, 2788359)
Size of y_test:  (2988,)


Modeling the dataset

In [56]:
from sklearn.linear_model import LogisticRegression

logreg = LogisticRegression()
logreg.fit(x_train, y_train)
logreg_pred = logreg.predict(x_test)
logreg_acc = accuracy_score(logreg_pred, y_test)
print("Test accuracy: {:.2f}%".format(logreg_acc*100))

Test accuracy: 86.41%


In [57]:
from sklearn.linear_model import PassiveAggressiveClassifier

passive = PassiveAggressiveClassifier()
passive.fit(x_train, y_train)
passive_pred = passive.predict(x_test)
passive_acc = accuracy_score(passive_pred, y_test)
print("Test accuracy: {:.2f}%".format(passive_acc*100))

pred_final = passive.predict(x_test_final)

Test accuracy: 87.99%


In [58]:
y_pred = pd.DataFrame(data=range(0,10000))
y_pred.columns = ['id']
y_pred['labels'] = pred_final
y_pred.labels.replace(1, 'pos', inplace=True)
y_pred.labels.replace(0, 'neg', inplace=True)
y_pred.to_csv('y_pred.csv', index=False)