In [1]:
import csv
import math
import random
import numpy as np
import pandas as pd

lines= csv.reader(open("C:\\Users\\Melike Nur Mermer\\imdb_master.csv"))
dataset = list(lines)
df=pd.DataFrame(dataset)
df=df[1:]
df.columns=['index','type','review','label','file']
df=df.drop(columns=['index','file'])
indexNames = df[(df['label'] == 'unsup')].index
df.drop(indexNames , inplace=True)
df

Unnamed: 0,type,review,label
1,test,Once again Mr. Costner has dragged out a movie...,neg
2,test,This is an example of why the majority of acti...,neg
3,test,"First of all I hate those moronic rappers, who...",neg
4,test,Not even the Beatles could write songs everyon...,neg
5,test,Brass pictures (movies is not a fitting word f...,neg
...,...,...,...
49996,train,"Seeing as the vote average was pretty low, and...",pos
49997,train,"The plot had some wretched, unbelievable twist...",pos
49998,train,I am amazed at how this movie(and most others ...,pos
49999,train,A Christmas Together actually came before my t...,pos


In [2]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.tokenize.treebank import TreebankWordDetokenizer
from nltk.stem import PorterStemmer

ps = PorterStemmer()

detokenizer=TreebankWordDetokenizer()

stop_words = set(stopwords.words('english'))

def text_preprocessing(df, stemming, stop_words_removal):
    
    df_train=pd.DataFrame(columns=['type','review','label'])
    df_test=pd.DataFrame(columns=['type','review','label'])
    
    for index, row in df.iterrows():
        sentence=row.review
        typ=row.type
        word_tokens = nltk.wordpunct_tokenize(sentence)
        word_tokens = [w.lower() for w in word_tokens if w.isalnum()]
        
        if stop_words_removal==True:
            filtered_sentence = [w for w in word_tokens if not w in stop_words]
        else:
            filtered_sentence = word_tokens
        
        if stemming==True:
            stemmed_sentence= []
            for w in filtered_sentence:
                stemmed_sentence.append(ps.stem(w))
        else:
            stemmed_sentence = filtered_sentence

        row.review=detokenizer.detokenize(stemmed_sentence)
        
        if typ=='test':
            df_test=df_test.append(row)
        else:
            df_train=df_train.append(row)

    df_test=df_test.drop(columns='type')
    df_train=df_train.drop(columns='type')
    
    return df_train, df_test


[nltk_data] Downloading package stopwords to C:\Users\Melike Nur
[nltk_data]     Mermer\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
def data_and_labels(df_train, df_test):
    train_data=df_train.review.tolist()
    train_labels=df_train.label.tolist()
    for i in range(len(train_labels)): 
        if train_labels[i]=='pos':
            train_labels[i]=1
        else:
            train_labels[i]=0

    test_data=df_test.review.tolist()
    test_labels=df_test.label.tolist()
    for i in range(len(test_labels)): 
        if test_labels[i]=='pos':
            test_labels[i]=1
        else:
            test_labels[i]=0
    
    return train_data, train_labels, test_data, test_labels

In [4]:
import re

def tokenize(text):
    return re.split("\W+", text)
 
def get_word_counts(words):
    count_word = {}
    for word in words:
        count_word[word] = count_word.get(word, 0.0) + 1.0
    return count_word

In [5]:
num_reviews = {}
log_class_priors = {}
word_counts = {}
vocabulary = set()

def fit(data, labels):
    n = len(labels)
    
    num_reviews['pos'] = sum(1 for label in labels if label == 1)
    num_reviews['neg'] = sum(1 for label in labels if label == 0)
    log_class_priors['pos'] = math.log(num_reviews['pos'] / n)
    log_class_priors['neg'] = math.log(num_reviews['neg'] / n)
    word_counts['pos'] = {}
    word_counts['neg'] = {}
 
    for x, y in zip(data, labels):
        c = 'pos' if y == 1 else 'neg'
        counts = get_word_counts(tokenize(x))
        for word, count in counts.items():
            if word not in vocabulary:
                vocabulary.add(word)
            if word not in word_counts[c]:
                word_counts[c][word] = 0.0
 
            word_counts[c][word] += count

In [6]:
def predict(data):
    result = []
    for x in data:
        counts = get_word_counts(tokenize(x))
        pos_score = 0
        neg_score = 0
        for word, _ in counts.items():
            if word not in vocabulary: continue
            
            # add Laplace smoothing
            log_w_given_pos = math.log( (word_counts['pos'].get(word, 0.0) + 1) / (num_reviews['pos'] + len(vocabulary)) )
            log_w_given_neg = math.log( (word_counts['neg'].get(word, 0.0) + 1) / (num_reviews['neg'] + len(vocabulary)) )
 
            pos_score += log_w_given_pos
            neg_score += log_w_given_neg
 
        pos_score += log_class_priors['pos']
        neg_score += log_class_priors['neg']
 
        if pos_score > neg_score:
            result.append(1)
        else:
            result.append(0)
    return result

In [7]:
def f1_score(pred, labels):
    pred_pos_label_pos = sum(1 for i in range(len(pred)) if pred[i]==1 and pred[i] == labels[i])
    pred_pos_label_neg = sum(1 for i in range(len(pred)) if pred[i]==1 and pred[i] != labels[i])
    pred_neg_label_pos = sum(1 for i in range(len(pred)) if pred[i]==0 and pred[i] != labels[i])
    pred_neg_label_neg = sum(1 for i in range(len(pred)) if pred[i]==0 and pred[i] == labels[i])
    
    # TruePositives / (TruePositives + FalsePositives)
    precision = pred_pos_label_pos /(pred_pos_label_pos + pred_pos_label_neg)
    # TruePositives / (TruePositives + FalseNegatives)
    recall = pred_pos_label_pos / (pred_pos_label_pos + pred_neg_label_pos)
    # (2 * Precision * Recall) / (Precision + Recall)
    return 2*precision*recall / (precision+recall)

In [8]:
df_train, df_test = text_preprocessing(df, stemming=False, stop_words_removal=False)
print(df_train.head())

train_data, train_labels, test_data, test_labels = data_and_labels(df_train, df_test)

fit(train_data, train_labels)
pred = predict(test_data)
print("F1-Score without stop-words removal and stemming..:"+str(f1_score(pred, test_labels)))

                                                  review label
25001  story of a man who has unnatural feelings for ...   neg
25002  airport 77 starts as a brand new luxury 747 pl...   neg
25003  this film lacked something i couldn t put my f...   neg
25004  sorry everyone i know this is supposed to be a...   neg
25005  when i was little my parents took me along to ...   neg
F1-Score without stop-words removal and stemming..:0.8306334035315083


In [8]:
df_train, df_test = text_preprocessing(df, stemming=False, stop_words_removal=True)
print(df_train.head())

train_data, train_labels, test_data, test_labels = data_and_labels(df_train, df_test)

fit(train_data, train_labels)
pred = predict(test_data)
print("F1-Score with stop-words removal without stemming..:"+str(f1_score(pred, test_labels)))

                                                  review label
25001  story man unnatural feelings pig starts openin...   neg
25002  airport 77 starts brand new luxury 747 plane l...   neg
25003  film lacked something put finger first charism...   neg
25004  sorry everyone know supposed art film wow hand...   neg
25005  little parents took along theater see interior...   neg
F1-Score with stop-words removal without stemming..:0.8425412205985778


In [8]:
df_train, df_test = text_preprocessing(df, stemming=True, stop_words_removal=True)
print(df_train.head())

train_data, train_labels, test_data, test_labels = data_and_labels(df_train, df_test)

fit(train_data, train_labels)
pred = predict(test_data)
print("F1-Score with stop-words removal and stemming..:"+str(f1_score(pred, test_labels)))

                                                  review label
25001  stori man unnatur feel pig start open scene te...   neg
25002  airport 77 start brand new luxuri 747 plane lo...   neg
25003  film lack someth put finger first charisma par...   neg
25004  sorri everyon know suppos art film wow hand gu...   neg
25005  littl parent took along theater see interior o...   neg
F1-Score with stop-words removal and stemming..:0.8371175492205932
