In [20]:
import pandas as pd
import numpy as np
import os
import glob
import codecs
import nltk
nltk.download('stopwords')
nltk.download('punkt')
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics  import accuracy_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/eynatgrof/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /Users/eynatgrof/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [29]:
path = os.getcwd() + '/aclImdb'
print(path)

/Users/eynatgrof/Documents/projects/imdb/aclImdb


# Load the data

In [22]:
path_train_neg = path + '/train/neg/*txt'
path_train_pos = path + '/train/pos/*txt'
path_test_neg = path + '/test/neg/*txt'
path_test_pos = path +'/test/pos/*txt'
train_neg = []
train_pos = []
test_neg = []
test_pos = []

paths = [path_train_neg, path_train_pos, path_test_neg, path_test_pos]
lists = [train_neg, train_pos, test_neg, test_pos]

for i in range(len(paths)):
        path = paths[i]
        lst = lists[i]
        
        files = glob.glob(path)
        
        for name in files:
            try:
                with open(name) as f:
                    file = [line.split() for line in f]
                    flat_file = [item for item in file for item in item]
                    lst.append(flat_file)
            except:
                print('error with', name)
                

# Stemming

In [23]:
def stem_func(reviews_lst):
    stemmer = PorterStemmer()
    revised_lst = []
    count = 0
    for review in reviews_lst:
        temp_lst = []
        for word in range(len(review)):
            temp_lst.append(stemmer.stem(review[word]))
        revised_lst.append(temp_lst)
        count += 1
        print(count, end = '\r')
    print('done')
    return revised_lst
        

In [24]:
train_neg = stem_func(train_neg)
train_pos = stem_func(train_pos)
test_neg = stem_func(test_neg)
test_pos = stem_func(test_pos)

done0
done0
done0
done0


# Removing stop words and punctuation

In [25]:
def stop_func(reviews_lst):
    stop_words = set(stopwords.words('english'))
    stop_words.add('/><br')
    
    revised_lst = []
    count = 0
    for review in reviews_lst:
        temp_lst = []
        for word in range(len(review)):
            if review[word] not in stop_words:
                temp_lst.append(review[word])
            for word in range(len(temp_lst)):
                temp_lst[word] = temp_lst[word].translate(str.maketrans('','', string.punctuation))
        count += 1
        print(count, end = '\r')
        revised_lst.append(temp_lst)
    print('done')
    return revised_lst
    

In [26]:
train_neg = stop_func(train_neg)
train_pos = stop_func(train_pos)
test_neg = stop_func(test_neg)
test_pos = stop_func(test_pos)

done0
done0
done0
done0


# Replicate dictionary method

## Load dictionary negative and positive words

In [27]:
def isNotNull(value):
    return value is not None and len(value)>0

In [30]:
dict_pos = []
dict_neg = []
f = open('negative-words.txt','r', encoding = 'ISO-8859-1')
for line in f:
    t = line.strip().lower();
    if (isNotNull(t)):
        dict_neg.append(t)
f.close()

f = open('positive-words.txt','r', encoding = 'ISO-8859-1')
for line in f:
    t = line.strip().lower();
    if (isNotNull(t)):
        dict_pos.append(t)
f.close()

In [31]:
print(dict_pos[:5])
print(dict_neg[:5])

['a+', 'abound', 'abounds', 'abundance', 'abundant']
['2-faced', '2-faces', 'abnormal', 'abolish', 'abominable']


## Dictionary analysis

In [32]:
df1 = pd.DataFrame()
df1['review'] = test_neg
df2 = pd.DataFrame()
df2['review'] = test_pos

frames = [df1, df2]
data = pd.concat(frames)
data['label'] = [0]*len(test_neg) + [1]*len(test_pos)
print(len(data))
data.head()

25000


Unnamed: 0,review,label
0,"[alan, rickman, , emma, thompson, give, good, ...",0
1,"[I, seen, thi, movi, I, care, thi, movi, anyho...",0
2,"[In, lo, angeles, alcohol, lazi, hank, chinask...",0
3,"[thi, film, bundl, along, gli, fumavano, le, c...",0
4,"[I, onli, comment, realli, veri, good, film, u...",0


In [33]:
analysis_for_all = []
count = 0
for i in range(len(data)):
    tokens = data.iloc[i,0]
    neg_cnt = 0
    pos_cnt = 0
    for neg in dict_neg:
        if (neg in tokens):
            neg_cnt = neg_cnt +1
    for pos in dict_pos:
        if (pos in tokens):
            pos_cnt = pos_cnt +1
    analysis_for_all.append(pos_cnt - neg_cnt)   
    count += 1
    print(count, end = '\r')
data['Bing_analysis_for_all'] = analysis_for_all

25000

In [34]:
data.head()

Unnamed: 0,review,label,Bing_analysis_for_all
0,"[alan, rickman, , emma, thompson, give, good, ...",0,4
1,"[I, seen, thi, movi, I, care, thi, movi, anyho...",0,1
2,"[In, lo, angeles, alcohol, lazi, hank, chinask...",0,-2
3,"[thi, film, bundl, along, gli, fumavano, le, c...",0,-9
4,"[I, onli, comment, realli, veri, good, film, u...",0,-3


In [35]:
analysis_label = []
for i in analysis_for_all:
    if i >0:
        analysis_label.append(1)
    else:
        analysis_label.append(0)

data['analysis_label'] = analysis_label
data.head()

Unnamed: 0,review,label,Bing_analysis_for_all,analysis_label
0,"[alan, rickman, , emma, thompson, give, good, ...",0,4,1
1,"[I, seen, thi, movi, I, care, thi, movi, anyho...",0,1,1
2,"[In, lo, angeles, alcohol, lazi, hank, chinask...",0,-2,0
3,"[thi, film, bundl, along, gli, fumavano, le, c...",0,-9,0
4,"[I, onli, comment, realli, veri, good, film, u...",0,-3,0


In [36]:
Bing_analysis = data.analysis_label.tolist()
True_label = data.label.tolist()
confusion_matrix(True_label,Bing_analysis)

array([[9280, 3220],
       [4107, 8393]])

In [37]:
print(classification_report(True_label,Bing_analysis))

              precision    recall  f1-score   support

           0       0.69      0.74      0.72     12500
           1       0.72      0.67      0.70     12500

    accuracy                           0.71     25000
   macro avg       0.71      0.71      0.71     25000
weighted avg       0.71      0.71      0.71     25000



# Choose a different ML method
## Naive Bayes

In [38]:
train_pos_nb = []
for i in range(len(train_pos)):
    temp = " ".join(train_pos[i])
    train_pos_nb.append(temp)
    
train_neg_nb = []
for i in range(len(train_neg)):
    temp = " ".join(train_neg[i])
    train_neg_nb.append(temp)
    
test_pos_nb = []
for i in range(len(test_pos)):
    temp = " ".join(test_pos[i])
    test_pos_nb.append(temp)
    
test_neg_nb = []
for i in range(len(test_neg)):
    temp = " ".join(test_neg[i])
    test_neg_nb.append(temp)

In [39]:
x_train = train_pos_nb + train_neg_nb
y_train = [1]*12500 + [0]*12500
x_test = test_pos_nb + test_neg_nb
y_test = [1]*12500 + [0]*12500

In [40]:
xt = np.asarray(x_train)
type(xt[1])

numpy.str_

In [41]:
vectorizer = TfidfVectorizer(max_features=10000)
train_vectors = vectorizer.fit_transform(x_train)
test_vectors = vectorizer.transform(x_test)
print(train_vectors.shape, test_vectors.shape)

(25000, 10000) (25000, 10000)


In [42]:
clf = MultinomialNB().fit(train_vectors, y_train)
predicted = clf.predict(test_vectors)
print(accuracy_score(y_test,predicted))

0.83472


# Compare results for dictionary vs. NB

In [43]:
print('Dictionary Method:')
print(classification_report(True_label,Bing_analysis))
print('Naive Bayes:')
print(classification_report(y_test,predicted))

Dictionary Method:
              precision    recall  f1-score   support

           0       0.69      0.74      0.72     12500
           1       0.72      0.67      0.70     12500

    accuracy                           0.71     25000
   macro avg       0.71      0.71      0.71     25000
weighted avg       0.71      0.71      0.71     25000

Naive Bayes:
              precision    recall  f1-score   support

           0       0.82      0.87      0.84     12500
           1       0.86      0.80      0.83     12500

    accuracy                           0.83     25000
   macro avg       0.84      0.83      0.83     25000
weighted avg       0.84      0.83      0.83     25000

