### Mike Ogrysko
### CS 766 Information Retrieval and Natural Language Processing

Processing the IMDB movie reviews for sentiment analysis
- IMDB movie review data
- Extract keywords that help to differentiate between reviews labeled as sentiment 0 and reviews labeled as sentiment 1, using Tf-Idf features to classify reviews with an SVM classifier
- Rank the first 10 keywords that indicate the difference between the classes 0 and 1
- Cluster the reviews into two groups, classify them, and report the 10-fold CV classification performance
- Compare keywords generated by Tf-Idf features and keywords generated by clusters

In [1]:
import csv
from string import punctuation
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk
from nltk import word_tokenize
from nltk import sent_tokenize
import numpy as np
import pandas as pd
import re
from sklearn.cluster import AgglomerativeClustering
from sklearn.svm import LinearSVC
from sklearn.metrics.cluster import contingency_matrix
from sklearn.pipeline import Pipeline
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import accuracy_score, classification_report, adjusted_rand_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.svm import LinearSVC

In [3]:
# Read the review text (column 0) and its integer sentiment label (column 1)
# from the CSV into two parallel lists.
Reviews, Sentiments = [], []

# newline='' hands line-ending handling to the csv module, which is what its
# documentation requires so that quoted fields containing line breaks are
# parsed correctly on every platform.
with open('movie_data.csv', 'r', encoding='utf8', newline='') as fin:
    reader = csv.reader(fin, delimiter=',', quotechar='"')
    header = next(reader)  # first row is consumed here and not stored as data
    for line in reader:
        Reviews.append(line[0])
        Sentiments.append(int(line[1]))

N = len(Reviews)
M = len(Sentiments)
print('Total reviews loaded', N)
print('Total sentiments loaded', M)

Total reviews loaded 50000
Total sentiments loaded 50000


**Extract keywords that help to differentiate between reviews labeled as sentiment 0 and reviews labeled as sentiment 1, using Tf-Idf features to classify reviews with an SVM classifier**

In [4]:
# Tokens to discard: NLTK English stop words, punctuation characters, and a
# handful of extra terms ('br' is presumably left over from HTML line breaks
# in the reviews - confirm against the raw data).
stop_words = stopwords.words('english') + list(punctuation)
stop_words_set = set(stop_words) | {'br', 'p', 'the', 'this', 'etc'}

# Built once and reused: tokenize() is called for every review, so creating a
# new lemmatizer on each call is wasted work.
_LEMMATIZER = WordNetLemmatizer()


def tokenize(text):
    """Turn one review into a list of cleaned, lemmatized lowercase terms.

    The text is split with NLTK's word_tokenize and lowercased; a term is kept
    only if it passes every filter below, and surviving terms are lemmatized
    as nouns.

    Parameters
    ----------
    text : str
        Raw review text.

    Returns
    -------
    list of str
        Cleaned terms in their original order.
    """
    kept = [
        w for w in (t.lower() for t in word_tokenize(text))
        # stop words, punctuation and pure-digit tokens
        if w not in stop_words_set and not w.isdigit()
        # tokens starting with a non-word character, contractions, 'suffixes
        and not re.search(r'^\W+|\w\'\w+|\'\w+$', w)
        # tokens containing no lowercase ASCII letter at all
        and not re.search(r'^[^a-z]+$', w)
        # tokens whose leading word is one or two characters long, and numbers
        and not re.search(r'^\b\w{1,2}\b|(?<!\S)\d+(?!\S)$', w)
    ]
    return [_LEMMATIZER.lemmatize(w, 'n') for w in kept]


In [5]:
# Rebuild every review as a space-separated string of its cleaned tokens.
# Joining the tokens (instead of taking str() of the list) keeps list/repr
# syntax - brackets, quotes and backslash escapes such as "\x96" - out of the
# text that the vectorizer later splits into terms.
all_revs = [' '.join(tokenize(review)) for review in Reviews]

In [6]:
# Map each sentiment label to its position in yCategories and store the
# result as a NumPy array, the numeric form sklearn expects.
yCategories = [0, 1]
ydocs = np.array(list(map(yCategories.index, Sentiments)))

In [7]:
# Wrap documents and labels in pandas Series so the cross-validation loop can
# index them with arrays of row numbers.
Revs, Sents = pd.Series(all_revs), pd.Series(Sentiments)

In [8]:
def _take_rows(data, positions):
    """Return the items of `data` at the given integer positions."""
    # Plain [] on a pandas Series treats the integers as index labels, which
    # only matches row positions when the index is the default 0..n-1 range.
    # .iloc is always positional; NumPy arrays are positional already.
    if hasattr(data, 'iloc'):
        return data.iloc[positions]
    return data[positions]


def kfold_eval_docs(_clf, _Xdocs, _ydocs):
    """Run 10-fold stratified cross-validation and return per-fold accuracy.

    Parameters
    ----------
    _clf : estimator
        Object with fit(X, y) and predict(X). It is refit in place on every
        fold, so after this call it holds the model trained on the last
        fold's training split (not on the full data).
    _Xdocs : indexable collection
        Documents; a pandas Series or NumPy array.
    _ydocs : indexable collection
        Class labels aligned with _Xdocs.

    Returns
    -------
    numpy.ndarray
        One accuracy score per fold.
    """
    acc = []
    # Folds are taken in data order (no shuffling), stratified by label.
    kf = StratifiedKFold(n_splits=10, shuffle=False, random_state=None)
    for train_index, test_index in kf.split(_Xdocs, _ydocs):
        _clf.fit(_take_rows(_Xdocs, train_index), _take_rows(_ydocs, train_index))
        y_pred = _clf.predict(_take_rows(_Xdocs, test_index))
        acc.append(accuracy_score(_take_rows(_ydocs, test_index), y_pred))

    return np.array(acc)

In [9]:
# Tf-Idf matrix of the raw (untokenized) reviews, built only to see how many
# documents and distinct terms there are.
X_tfidf = TfidfVectorizer().fit_transform(Reviews)
n_points, n_features = X_tfidf.shape
print(f'N data points= {n_points}, M features= {n_features}')

N data points= 50000, M features= 101895


In [10]:
# Vocabulary cap for the classifier: 75000 terms rather than all 101895
N_FEATURES = 75_000

In [11]:
%%time
# Linear SVM on Tf-Idf weighted term counts, vocabulary limited to N_FEATURES.
# Accuracy tops out at 89.7% when using 75k features.
svm_lin = Pipeline([
    ('vect', CountVectorizer(max_features=N_FEATURES)),
    ('tfidf', TfidfTransformer()),
    ('clf', LinearSVC(class_weight='balanced')),
])
acc = kfold_eval_docs(svm_lin, Revs, Sents)
mean_acc, std_acc = np.mean(acc), np.std(acc)
print(f'Support Vector Machine (linear SVC) CV accuracy={mean_acc:.3f} {std_acc:.3f}')

Support Vector Machine (linear SVC) CV accuracy=0.897 0.003
CPU times: user 38.6 s, sys: 695 ms, total: 39.3 s
Wall time: 39.3 s


**Rank the first 10 keywords that indicate the difference between the classes 0 and 1**

In [12]:
# Pull the fitted linear SVM out of the pipeline and flatten its weight vector
classifier = svm_lin.named_steps['clf']
coef = classifier.coef_[0].ravel()
# Feature indices ordered from most negative to most positive weight:
# the last ten are the strongest for sentiment 1, the first ten for sentiment 0
weight_order = np.argsort(coef)
pos_coef, neg_coef = weight_order[-10:], weight_order[:10]

In [13]:
# The count vectorizer step holds the vocabulary; its feature names line up
# with the columns (and therefore the coefficients) of the classifier
cv = dict(svm_lin.steps)['vect']
feature_names = cv.get_feature_names_out()


In [14]:
# Terms and weights of the ten most positive coefficients (sentiment 1)
feat1, coef1 = feature_names[pos_coef], coef[pos_coef]

# Terms and weights of the ten most negative coefficients (sentiment 0)
feat0, coef0 = feature_names[neg_coef], coef[neg_coef]


In [15]:
# Pair every term with its weight
feat1_dict = dict(zip(feat1, coef1))
feat0_dict = dict(zip(feat0, coef0))


def _by_weight(item):
    """Sort key: the weight half of a (term, weight) pair."""
    return item[1]


# Strongest first: descending weights for sentiment 1, ascending for sentiment 0
sort_feat1_dict = dict(sorted(feat1_dict.items(), key=_by_weight, reverse=True))
sort_feat0_dict = dict(sorted(feat0_dict.items(), key=_by_weight))
# Lists of (term, weight) tuples so rows can be addressed by rank
sort_feat1_list = list(sort_feat1_dict.items())
sort_feat0_list = list(sort_feat0_dict.items())


In [16]:
# Side-by-side table of the ten strongest terms for each sentiment.
# Note: the ':3f' spec is a minimum width of 3, so weights print with the
# default six decimals.
print("\tSentiment 1\t\t\tSentiment 0")
print("Importance\tTerm\t\tImportance\tTerm")
for rank in range(10):
    pos_term, pos_weight = sort_feat1_list[rank]
    neg_term, neg_weight = sort_feat0_list[rank]
    print(f"{pos_weight:3f}\t{pos_term:16s}{neg_weight:3f}\t{neg_term}")
    

	Sentiment 1			Sentiment 0
Importance	Term		Importance	Term
3.877950	excellent       -5.810407	worst
3.152302	great           -4.741094	waste
3.073947	perfect         -4.292880	awful
2.894267	refreshing      -3.662103	boring
2.725053	amazing         -3.512358	disappointment
2.710273	hilarious       -3.411473	fails
2.642047	enjoyable       -3.200393	poor
2.577578	favorite        -3.147187	horrible
2.518989	wonderful       -3.099757	disappointing
2.504800	perfectly       -3.042528	bad


**Cluster the reviews into two groups, classify them, and report the 10-fold CV classification performance**

In [17]:
# Tf-Idf matrix of the raw text for the first 15000 reviews only -
# clustering would not run on the full data set
X_tfidf_4 = TfidfVectorizer().fit_transform(Reviews[:15000])
n_points_4, n_features_4 = X_tfidf_4.shape
print(f'N data points= {n_points_4}, M features= {n_features_4}')

N data points= 15000, M features= 62625


In [18]:
%%time
# Build two clusters with Ward-linkage agglomerative clustering (chosen because
# performance is better and it suits document analysis). This cell runs long.
# The estimator needs a dense array; toarray() produces the ndarray directly,
# whereas np.array(X.todense()) would hold two dense copies of the matrix in
# memory at once.
Clusters = AgglomerativeClustering(n_clusters=2, linkage='ward').fit_predict(X_tfidf_4.toarray())


CPU times: user 33min 31s, sys: 10.8 s, total: 33min 42s
Wall time: 35min 15s


In [19]:
# Tally cluster membership: label 1 versus everything else
count1 = sum(1 for label in Clusters if label == 1)
count0 = len(Clusters) - count1
print(f"Size of Clusters: \n\nCount 0: {count0}\nCount 1: {count1}")


Size of Clusters: 

Count 0: 14070
Count 1: 930


In [20]:
# Same linear-SVM pipeline as before, with the vocabulary capped at 62625 -
# the feature count reported for the 15000-review Tf-Idf matrix
svm_lin4 = Pipeline([
    ('vect', CountVectorizer(max_features=62625)),
    ('tfidf', TfidfTransformer()),
    ('clf', LinearSVC(class_weight='balanced')),
])

In [21]:
%%time
# 10-fold CV where the targets are the cluster assignments of the first
# 15000 reviews rather than the sentiment labels
acc = kfold_eval_docs(svm_lin4, Revs[:15000], Clusters)
mean_acc, std_acc = np.mean(acc), np.std(acc)
print(f'Support Vector Machine (linear SVC) CV accuracy={mean_acc:.3f} {std_acc:.3f}')

Support Vector Machine (linear SVC) CV accuracy=0.905 0.008
CPU times: user 14 s, sys: 177 ms, total: 14.2 s
Wall time: 14.2 s


**Compare keywords generated by Tf-Idf features and keywords generated by clusters**

In [22]:
# --- Keywords from the SVM that was trained on cluster labels (svm_lin4) ---
# Fitted classifier and its flattened weight vector
classifier5 = svm_lin4.named_steps['clf']
coef5 = classifier5.coef_[0].ravel()
# Feature indices ordered by weight: last ten = most positive (label 1),
# first ten = most negative (label 0)
weight_order5 = np.argsort(coef5)
pos_coef5, neg_coef5 = weight_order5[-10:], weight_order5[:10]
# Vocabulary of the vectorizer step, aligned with the coefficients
cv5 = svm_lin4.named_steps['vect']
feature_names5 = cv5.get_feature_names_out()
# Terms and weights for label 1 and for label 0
feat15, coef15 = feature_names5[pos_coef5], coef5[pos_coef5]
feat05, coef05 = feature_names5[neg_coef5], coef5[neg_coef5]
# term -> weight mappings
feat1_dict5 = dict(zip(feat15, coef15))
feat0_dict5 = dict(zip(feat05, coef05))
# Strongest first: descending for label 1, ascending for label 0
sort_feat1_dict5 = dict(sorted(feat1_dict5.items(), key=lambda pair: pair[1], reverse=True))
sort_feat0_dict5 = dict(sorted(feat0_dict5.items(), key=lambda pair: pair[1]))
# (term, weight) tuples addressable by rank
sort_feat1_list5 = list(sort_feat1_dict5.items())
sort_feat0_list5 = list(sort_feat0_dict5.items())

# Table 1: top ten terms per label for the cluster-trained model.
# The columns here correspond to cluster labels 1 and 0.
print("\t\t\tCluster - Clustering")
print("\tSentiment 1\t\t\tSentiment 0")
print("Importance\tTerm\t\tImportance\tTerm")
for rank in range(10):
    term1, weight1 = sort_feat1_list5[rank]
    term0, weight0 = sort_feat0_list5[rank]
    print(f"{weight1:3f}\t{term1:16s}{weight0:3f}\t{term0} ")

# Table 2: top ten terms per sentiment from the sentiment-trained model,
# reusing the ranked lists computed earlier in the notebook
print("\n\t\t\tClassification - TFIDF")
print("\tSentiment 1\t\t\tSentiment 0")
print("Importance\tTerm\t\tImportance\tTerm")
for rank in range(10):
    term1, weight1 = sort_feat1_list[rank]
    term0, weight0 = sort_feat0_list[rank]
    print(f"{weight1:3f}\t{term1:16s}{weight0:3f}\t{term0}")
    

			Cluster - Clustering
	Sentiment 1			Sentiment 0
Importance	Term		Importance	Term
2.652363	spoiler         -1.410793	michael 
2.425980	movies          -1.378715	working 
2.297623	freddy          -1.352884	asian 
2.244643	bridge          -1.341013	ability 
2.019728	aeon            -1.338794	decision 
2.009459	times           -1.326530	negative 
1.980769	them            -1.293704	clichés 
1.895242	this            -1.287105	early 
1.892824	became          -1.275902	upset 
1.881998	graduate        -1.254799	empty 

			Classification - TFIDF
	Sentiment 1			Sentiment 0
Importance	Term		Importance	Term
3.877950	excellent       -5.810407	worst
3.152302	great           -4.741094	waste
3.073947	perfect         -4.292880	awful
2.894267	refreshing      -3.662103	boring
2.725053	amazing         -3.512358	disappointment
2.710273	hilarious       -3.411473	fails
2.642047	enjoyable       -3.200393	poor
2.577578	favorite        -3.147187	horrible
2.518989	wonderful       -3.099757	disappointing
2.5048