In [3]:
import re, nltk
nltk.download('punkt', quiet=True)
nltk.download('stopwords', quiet=True)
nltk.download('omw-1.4', quiet=True)
nltk.download('wordnet', quiet=True)
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import StratifiedKFold
from sklearn import metrics
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
import joblib
import warnings
warnings.filterwarnings('ignore')

In [4]:
# Reading dataset as dataframe
df = pd.read_csv("Labelled.csv",encoding='ISO-8859-1')
pd.set_option('display.max_colwidth', None) # Setting this so we can see the full content of cells
pd.set_option('display.max_columns', None) # to make sure we can see all the columns in output window


In [5]:
df['Sentiment'] = df['Sentiment'].map({'positive':1, 'negative':0})

In [6]:
print(df.head())

Unnamed: 0,Review,Sentiment
0,The acting in this movie was absolutely incred...,1
1,I couldn't stand the plot of this movie. It fe...,0
2,The cinematography in this film was stunning. ...,1
3,I found the dialogue in this movie to be compl...,0
4,I laughed so hard during this movie. It was th...,1


In [7]:
#df["Sentiment"] = df["Sentiment"].astype(int)
# df = pd.read_csv("Labelled.csv",encoding='ISO-8859-1')
# df['Sentiment'] = df['Sentiment'].map({'positive':1, 'negative':0})
# df['cleaned_reviews'] = df.Review.apply(cleaner)
# df = df[df['cleaned_reviews'].map(len) > 0] # removing rows with cleaned tweets of length 0
# df['cleaned_reviews'] = [" ".join(row) for row in df['cleaned_reviews'].values] 
# data = df['cleaned_reviews']

In [8]:
print(df.isna().sum())

Review       0
Sentiment    0
dtype: int64

In [9]:
df['Sentiment'].value_counts()

1    598
0    542
Name: Sentiment, dtype: int64

In [10]:
print(df['Sentiment'].value_counts()) #checking to see counts of the target label; the dataset is somewhata balanced

1    598
0    542
Name: Sentiment, dtype: int64


In [14]:
# Cleaning reviews
def cleaner(reviews):
    soup = BeautifulSoup(reviews, 'lxml') # removing HTML entities such as ‘&amp’,’&quot’,'&gt'; lxml is the html parser and shoulp be installed using 'pip install lxml'
    souped = soup.get_text()
    re1 = re.sub(r"(#|@|http://|https://|www)\S*", " ", souped) # substituting hashtags, @mentions, urls, etc with whitespace
    re2 = re.sub("[^A-Za-z]+"," ", re1) # substituting any non-alphabetic character that repeats one or more times with whitespace

    """
    For more info on regular expressions visit -
    https://docs.python.org/3/howto/regex.html
    """

    tokens = nltk.word_tokenize(re2)
    lower_case = [t.lower() for t in tokens]

    stop_words = set(stopwords.words('english'))
    filtered_result = list(filter(lambda l: l not in stop_words, lower_case))

    wordnet_lemmatizer = WordNetLemmatizer()
    lemmas = [wordnet_lemmatizer.lemmatize(t) for t in filtered_result]
    return lemmas

In [15]:
df['cleaned_reviews'] = df.Review.apply(cleaner)

In [16]:
print(df.head())

Unnamed: 0,Review,Sentiment,cleaned_reviews
0,The acting in this movie was absolutely incred...,1,"[acting, movie, absolutely, incredible, blown,..."
1,I couldn't stand the plot of this movie. It fe...,0,"[stand, plot, movie, felt, like, waste, time, ..."
2,The cinematography in this film was stunning. ...,1,"[cinematography, film, stunning, could, watch]"
3,I found the dialogue in this movie to be compl...,0,"[found, dialogue, movie, completely, unrealist..."
4,I laughed so hard during this movie. It was th...,1,"[laughed, hard, movie, perfect, comedy]"


In [17]:
df = df[df['cleaned_reviews'].map(len) > 0] # removing rows with cleaned movie reviews of length 0

In [18]:
print("Printing top 5 rows of dataframe showing original and cleaned moviev reviews...")
print(df[['Review','cleaned_reviews']].head())

Printing top 5 rows of dataframe showing original and cleaned moviev reviews...
                                              Review  \
0  The acting in this movie was absolutely incred...   
1  I couldn't stand the plot of this movie. It fe...   
2  The cinematography in this film was stunning. ...   
3  I found the dialogue in this movie to be compl...   
4  I laughed so hard during this movie. It was th...   

                                     cleaned_reviews  
0  [acting, movie, absolutely, incredible, blown,...  
1  [stand, plot, movie, felt, like, waste, time, ...  
2     [cinematography, film, stunning, could, watch]  
3  [found, dialogue, movie, completely, unrealist...  
4            [laughed, hard, movie, perfect, comedy]  


In [19]:
print(df.columns)

Index(['Review', 'Sentiment', 'cleaned_reviews'], dtype='object')

In [23]:
# Saving cleaned moviev reviews to csv
df.to_csv('cleaned_data.csv', index=False)

In [20]:
# joining tokens to create strings. TfidfVectorizer does not accept tokens as input
df['cleaned_reviews'] = [" ".join(row) for row in df['cleaned_reviews'].values] 

In [21]:
print(df.head())

Unnamed: 0,Review,Sentiment,cleaned_reviews
0,The acting in this movie was absolutely incred...,1,acting movie absolutely incredible blown away ...
1,I couldn't stand the plot of this movie. It fe...,0,stand plot movie felt like waste time money
2,The cinematography in this film was stunning. ...,1,cinematography film stunning could watch
3,I found the dialogue in this movie to be compl...,0,found dialogue movie completely unrealistic cr...
4,I laughed so hard during this movie. It was th...,1,laughed hard movie perfect comedy


In [22]:
data = df['cleaned_reviews']

In [23]:
Y = df['Sentiment'] # target column

## Case 1. (min_df=0.0264, ngram_range=(1,3)) --> 30 documents

In [24]:
# min_df=0.0264 means that each ngram (unigram, bigram, & trigram) must be present in at least 30 documents 
# for it to be considered as a token (1140*0.0264=30). 
tfidf = TfidfVectorizer(min_df=0.0264, ngram_range=(1,3)) 
tfidf.fit(data) # learn vocabulary of entire data
data_tfidf = tfidf.transform(data) # creating tfidf values

In [25]:
#https://stackoverflow.com/questions/70215049/attributeerror-tfidfvectorizer-object-has-no-attribute-get-feature-names-out

In [26]:
print("The created tokens: \n", tfidf.get_feature_names()) #When sklearn.__version__ <= 0.24.x use get_feature_names()
print("Shape of tfidf matrix: ", data_tfidf.shape) #When sklearn.__version__ >= 1.0.x use get_feature_names_out()

The created tokens: 
 ['acting', 'acting movie', 'action', 'actor', 'added', 'away', 'boring', 'breathtaking', 'brought', 'character', 'cinematography', 'cinematography movie', 'completely', 'confusing', 'convoluted', 'dialogue', 'disappointed', 'disappointed movie', 'edge', 'edge seat', 'effect', 'end', 'entire', 'felt', 'felt like', 'film', 'follow', 'forgettable', 'found', 'found movie', 'hard', 'highly', 'humor', 'humor movie', 'incredible', 'incredibly', 'kept', 'laughing', 'left', 'like', 'loved', 'made', 'masterpiece', 'movie', 'outstanding', 'pacing', 'pacing slow', 'performance', 'performance movie', 'plot', 'predictable', 'really', 'recommend', 'scene', 'seat', 'slow', 'special', 'special effect', 'story', 'stunning', 'terrible', 'thought', 'throughout', 'time', 'uninteresting', 'unoriginal', 'wooden']
Shape of tfidf matrix:  (1140, 67)


## Implementing SVC

In [27]:

print("Implementing SVC.....")
# Implementing Support Vector Classifier
svc_clf = LinearSVC() # kernel = 'linear' and C = 1

Implementing SVC.....


In [28]:
# Running cross-validation
kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=1) # 10-fold cross-validation
scores=[]
iteration = 0
for train_index, test_index in kf.split(data_tfidf, Y):
    iteration += 1
    print("Iteration ", iteration)
    X_train, Y_train = data_tfidf[train_index], Y.iloc[train_index]
    X_test, Y_test = data_tfidf[test_index], Y.iloc[test_index]
    svc_clf.fit(X_train, Y_train) # Fitting SVC
    Y_pred = svc_clf.predict(X_test)
    score = metrics.accuracy_score(Y_test, Y_pred) # Calculating accuracy
    print("Cross-validation accuracy: ", score)
    scores.append(score) # appending cross-validation accuracy for each iteration
svc_mean_accuracy = np.mean(scores)
print("Mean cross-validation accuracy: ", svc_mean_accuracy)

Iteration  1
Cross-validation accuracy:  0.8947368421052632
Iteration  2
Cross-validation accuracy:  0.9210526315789473
Iteration  3
Cross-validation accuracy:  0.8596491228070176
Iteration  4
Cross-validation accuracy:  0.9649122807017544
Iteration  5
Cross-validation accuracy:  0.9649122807017544
Iteration  6
Cross-validation accuracy:  0.9473684210526315
Iteration  7
Cross-validation accuracy:  0.9736842105263158
Iteration  8
Cross-validation accuracy:  0.9210526315789473
Iteration  9
Cross-validation accuracy:  0.9035087719298246
Iteration  10
Cross-validation accuracy:  0.8596491228070176
Mean cross-validation accuracy:  0.9210526315789475


## Implementing NBC

In [29]:
print("Implementing NBC.....")
# Implementing Naive Bayes Classifier
nbc_clf = MultinomialNB()

Implementing NBC.....


In [30]:
# Running cross-validation
kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=1) # 10-fold cross-validation
scores=[]
iteration = 0
for train_index, test_index in kf.split(data_tfidf, Y):
    iteration += 1
    print("Iteration ", iteration)
    X_train, Y_train = data_tfidf[train_index], Y.iloc[train_index]
    X_test, Y_test = data_tfidf[test_index], Y.iloc[test_index]
    nbc_clf.fit(X_train, Y_train) # Fitting NBC
    Y_pred = nbc_clf.predict(X_test)
    score = metrics.accuracy_score(Y_test, Y_pred) # Calculating accuracy
    print("Cross-validation accuracy: ", score)
    scores.append(score) # appending cross-validation accuracy for each iteration
nbc_mean_accuracy = np.mean(scores)
print("Mean cross-validation accuracy: ", nbc_mean_accuracy)

Iteration  1
Cross-validation accuracy:  0.868421052631579
Iteration  2
Cross-validation accuracy:  0.8947368421052632
Iteration  3
Cross-validation accuracy:  0.8508771929824561
Iteration  4
Cross-validation accuracy:  0.9122807017543859
Iteration  5
Cross-validation accuracy:  0.9122807017543859
Iteration  6
Cross-validation accuracy:  0.9122807017543859
Iteration  7
Cross-validation accuracy:  0.9210526315789473
Iteration  8
Cross-validation accuracy:  0.8859649122807017
Iteration  9
Cross-validation accuracy:  0.9035087719298246
Iteration  10
Cross-validation accuracy:  0.8508771929824561
Mean cross-validation accuracy:  0.8912280701754385


## SVC mean cross-validation accuracy:  0.9210526315789475
## NBC mean cross-validation accuracy:  0.8912280701754385

In [None]:
#### Saving predicted sentiment of tweets to csv
# df['predicted_sentiment'] = y_pred.reshape(-1,1)
# df.drop(['id', 'created_at'], axis=1, inplace=True)
# df.to_csv('predicted_sentiment.csv', index=False)

# 2. We are considering only unigrams and bigrams; then only unigrams.
## (min_df=0.0264, ngram_range=(1,2)) --> 30 documents
## (min_df=0.0264, ngram_range=(1,1)) --> 30 documents


In [31]:
tfidf = TfidfVectorizer(min_df=0.0264, ngram_range=(1,2)) 
tfidf.fit(data) # learn vocabulary of entire data
data_tfidf = tfidf.transform(data) # creating tfidf values

In [32]:
print("The created tokens: \n", tfidf.get_feature_names()) #When sklearn.__version__ <= 0.24.x use get_feature_names()
print("Shape of tfidf matrix: ", data_tfidf.shape) #When sklearn.__version__ >= 1.0.x use get_feature_names_out()

The created tokens: 
 ['acting', 'acting movie', 'action', 'actor', 'added', 'away', 'boring', 'breathtaking', 'brought', 'character', 'cinematography', 'cinematography movie', 'completely', 'confusing', 'convoluted', 'dialogue', 'disappointed', 'disappointed movie', 'edge', 'edge seat', 'effect', 'end', 'entire', 'felt', 'felt like', 'film', 'follow', 'forgettable', 'found', 'found movie', 'hard', 'highly', 'humor', 'humor movie', 'incredible', 'incredibly', 'kept', 'laughing', 'left', 'like', 'loved', 'made', 'masterpiece', 'movie', 'outstanding', 'pacing', 'pacing slow', 'performance', 'performance movie', 'plot', 'predictable', 'really', 'recommend', 'scene', 'seat', 'slow', 'special', 'special effect', 'story', 'stunning', 'terrible', 'thought', 'throughout', 'time', 'uninteresting', 'unoriginal', 'wooden']
Shape of tfidf matrix:  (1140, 67)


In [33]:
print("Implementing SVC.....")
# Implementing Support Vector Classifier
svc_clf = LinearSVC() # kernel = 'linear' and C = 1

# Running cross-validation
kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=1) # 10-fold cross-validation
scores=[]
iteration = 0
for train_index, test_index in kf.split(data_tfidf, Y):
    iteration += 1
    print("Iteration ", iteration)
    X_train, Y_train = data_tfidf[train_index], Y.iloc[train_index]
    X_test, Y_test = data_tfidf[test_index], Y.iloc[test_index]
    svc_clf.fit(X_train, Y_train) # Fitting SVC
    Y_pred = svc_clf.predict(X_test)
    score = metrics.accuracy_score(Y_test, Y_pred) # Calculating accuracy
    print("Cross-validation accuracy: ", score)
    scores.append(score) # appending cross-validation accuracy for each iteration
svc_mean_accuracy = np.mean(scores)
print("Mean cross-validation accuracy: ", svc_mean_accuracy)

Implementing SVC.....
Iteration  1
Cross-validation accuracy:  0.8947368421052632
Iteration  2
Cross-validation accuracy:  0.9210526315789473
Iteration  3
Cross-validation accuracy:  0.8596491228070176
Iteration  4
Cross-validation accuracy:  0.9649122807017544
Iteration  5
Cross-validation accuracy:  0.9649122807017544
Iteration  6
Cross-validation accuracy:  0.9473684210526315
Iteration  7
Cross-validation accuracy:  0.9736842105263158
Iteration  8
Cross-validation accuracy:  0.9210526315789473
Iteration  9
Cross-validation accuracy:  0.9035087719298246
Iteration  10
Cross-validation accuracy:  0.8596491228070176
Mean cross-validation accuracy:  0.9210526315789475


In [34]:
print("Implementing NBC.....")
# Implementing Naive Bayes Classifier
nbc_clf = MultinomialNB()

# Running cross-validation
kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=1) # 10-fold cross-validation
scores=[]
iteration = 0
for train_index, test_index in kf.split(data_tfidf, Y):
    iteration += 1
    print("Iteration ", iteration)
    X_train, Y_train = data_tfidf[train_index], Y.iloc[train_index]
    X_test, Y_test = data_tfidf[test_index], Y.iloc[test_index]
    nbc_clf.fit(X_train, Y_train) # Fitting NBC
    Y_pred = nbc_clf.predict(X_test)
    score = metrics.accuracy_score(Y_test, Y_pred) # Calculating accuracy
    print("Cross-validation accuracy: ", score)
    scores.append(score) # appending cross-validation accuracy for each iteration
nbc_mean_accuracy = np.mean(scores)
print("Mean cross-validation accuracy: ", nbc_mean_accuracy)

Implementing NBC.....
Iteration  1
Cross-validation accuracy:  0.868421052631579
Iteration  2
Cross-validation accuracy:  0.8947368421052632
Iteration  3
Cross-validation accuracy:  0.8508771929824561
Iteration  4
Cross-validation accuracy:  0.9122807017543859
Iteration  5
Cross-validation accuracy:  0.9122807017543859
Iteration  6
Cross-validation accuracy:  0.9122807017543859
Iteration  7
Cross-validation accuracy:  0.9210526315789473
Iteration  8
Cross-validation accuracy:  0.8859649122807017
Iteration  9
Cross-validation accuracy:  0.9035087719298246
Iteration  10
Cross-validation accuracy:  0.8508771929824561
Mean cross-validation accuracy:  0.8912280701754385


## Only unigrams now

In [35]:
tfidf = TfidfVectorizer(min_df=0.0264, ngram_range=(1,1)) 
tfidf.fit(data) # learn vocabulary of entire data
data_tfidf = tfidf.transform(data) # creating tfidf values

In [36]:
print("The created tokens: \n", tfidf.get_feature_names()) #When sklearn.__version__ <= 0.24.x use get_feature_names()
print("Shape of tfidf matrix: ", data_tfidf.shape) #When sklearn.__version__ >= 1.0.x use get_feature_names_out()

The created tokens: 
 ['acting', 'action', 'actor', 'added', 'away', 'boring', 'breathtaking', 'brought', 'character', 'cinematography', 'completely', 'confusing', 'convoluted', 'dialogue', 'disappointed', 'edge', 'effect', 'end', 'entire', 'felt', 'film', 'follow', 'forgettable', 'found', 'hard', 'highly', 'humor', 'incredible', 'incredibly', 'kept', 'laughing', 'left', 'like', 'loved', 'made', 'masterpiece', 'movie', 'outstanding', 'pacing', 'performance', 'plot', 'predictable', 'really', 'recommend', 'scene', 'seat', 'slow', 'special', 'story', 'stunning', 'terrible', 'thought', 'throughout', 'time', 'uninteresting', 'unoriginal', 'wooden']
Shape of tfidf matrix:  (1140, 57)


In [37]:
print("Implementing SVC.....")
# Implementing Support Vector Classifier
svc_clf = LinearSVC() # kernel = 'linear' and C = 1

# Running cross-validation
kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=1) # 10-fold cross-validation
scores=[]
iteration = 0
for train_index, test_index in kf.split(data_tfidf, Y):
    iteration += 1
    print("Iteration ", iteration)
    X_train, Y_train = data_tfidf[train_index], Y.iloc[train_index]
    X_test, Y_test = data_tfidf[test_index], Y.iloc[test_index]
    svc_clf.fit(X_train, Y_train) # Fitting SVC
    Y_pred = svc_clf.predict(X_test)
    score = metrics.accuracy_score(Y_test, Y_pred) # Calculating accuracy
    print("Cross-validation accuracy: ", score)
    scores.append(score) # appending cross-validation accuracy for each iteration
svc_mean_accuracy = np.mean(scores)
print("Mean cross-validation accuracy: ", svc_mean_accuracy)

Implementing SVC.....
Iteration  1
Cross-validation accuracy:  0.8947368421052632
Iteration  2
Cross-validation accuracy:  0.9210526315789473
Iteration  3
Cross-validation accuracy:  0.8596491228070176
Iteration  4
Cross-validation accuracy:  0.956140350877193
Iteration  5
Cross-validation accuracy:  0.9649122807017544
Iteration  6
Cross-validation accuracy:  0.9473684210526315
Iteration  7
Cross-validation accuracy:  0.9736842105263158
Iteration  8
Cross-validation accuracy:  0.9210526315789473
Iteration  9
Cross-validation accuracy:  0.9122807017543859
Iteration  10
Cross-validation accuracy:  0.8859649122807017
Mean cross-validation accuracy:  0.9236842105263158


In [38]:
print("Implementing NBC.....")
# Implementing Naive Bayes Classifier
nbc_clf = MultinomialNB()

# Running cross-validation
kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=1) # 10-fold cross-validation
scores=[]
iteration = 0
for train_index, test_index in kf.split(data_tfidf, Y):
    iteration += 1
    print("Iteration ", iteration)
    X_train, Y_train = data_tfidf[train_index], Y.iloc[train_index]
    X_test, Y_test = data_tfidf[test_index], Y.iloc[test_index]
    nbc_clf.fit(X_train, Y_train) # Fitting NBC
    Y_pred = nbc_clf.predict(X_test)
    score = metrics.accuracy_score(Y_test, Y_pred) # Calculating accuracy
    print("Cross-validation accuracy: ", score)
    scores.append(score) # appending cross-validation accuracy for each iteration
nbc_mean_accuracy = np.mean(scores)
print("Mean cross-validation accuracy: ", nbc_mean_accuracy)

Implementing NBC.....
Iteration  1
Cross-validation accuracy:  0.868421052631579
Iteration  2
Cross-validation accuracy:  0.8947368421052632
Iteration  3
Cross-validation accuracy:  0.868421052631579
Iteration  4
Cross-validation accuracy:  0.9035087719298246
Iteration  5
Cross-validation accuracy:  0.9210526315789473
Iteration  6
Cross-validation accuracy:  0.9122807017543859
Iteration  7
Cross-validation accuracy:  0.9210526315789473
Iteration  8
Cross-validation accuracy:  0.8771929824561403
Iteration  9
Cross-validation accuracy:  0.9035087719298246
Iteration  10
Cross-validation accuracy:  0.8596491228070176
Mean cross-validation accuracy:  0.8929824561403509


## 3. Let's try min_df=0.0527 and ngram_range=(1,3) 
# These values means that each ngram (unigram, bigram, & trigram) must be present in at least 60 documents for it to be considered as a token.

In [39]:
tfidf = TfidfVectorizer(min_df=0.0527, ngram_range=(1,3)) 
tfidf.fit(data) # learn vocabulary of entire data
data_tfidf = tfidf.transform(data) # creating tfidf values

In [40]:
print("The created tokens: \n", tfidf.get_feature_names()) #When sklearn.__version__ <= 0.24.x use get_feature_names()
print("Shape of tfidf matrix: ", data_tfidf.shape) #When sklearn.__version__ >= 1.0.x use get_feature_names_out()

The created tokens: 
 ['acting', 'actor', 'character', 'cinematography', 'felt', 'film', 'found', 'found movie', 'humor', 'like', 'movie', 'pacing', 'performance', 'plot', 'really', 'story', 'time']
Shape of tfidf matrix:  (1140, 17)


In [41]:
print("Implementing SVC.....")
# Implementing Support Vector Classifier
svc_clf = LinearSVC() # kernel = 'linear' and C = 1

# Running cross-validation
kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=1) # 10-fold cross-validation
scores=[]
iteration = 0
for train_index, test_index in kf.split(data_tfidf, Y):
    iteration += 1
    print("Iteration ", iteration)
    X_train, Y_train = data_tfidf[train_index], Y.iloc[train_index]
    X_test, Y_test = data_tfidf[test_index], Y.iloc[test_index]
    svc_clf.fit(X_train, Y_train) # Fitting SVC
    Y_pred = svc_clf.predict(X_test)
    score = metrics.accuracy_score(Y_test, Y_pred) # Calculating accuracy
    print("Cross-validation accuracy: ", score)
    scores.append(score) # appending cross-validation accuracy for each iteration
svc_mean_accuracy = np.mean(scores)
print("Mean cross-validation accuracy: ", svc_mean_accuracy)

Implementing SVC.....
Iteration  1
Cross-validation accuracy:  0.7894736842105263
Iteration  2
Cross-validation accuracy:  0.7105263157894737
Iteration  3
Cross-validation accuracy:  0.7894736842105263
Iteration  4
Cross-validation accuracy:  0.8333333333333334
Iteration  5
Cross-validation accuracy:  0.8508771929824561
Iteration  6
Cross-validation accuracy:  0.7807017543859649
Iteration  7
Cross-validation accuracy:  0.7982456140350878
Iteration  8
Cross-validation accuracy:  0.7543859649122807
Iteration  9
Cross-validation accuracy:  0.8157894736842105
Iteration  10
Cross-validation accuracy:  0.7368421052631579
Mean cross-validation accuracy:  0.7859649122807018


In [42]:
print("Implementing NBC.....")
# Implementing Naive Bayes Classifier
nbc_clf = MultinomialNB()

# Running cross-validation
kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=1) # 10-fold cross-validation
scores=[]
iteration = 0
for train_index, test_index in kf.split(data_tfidf, Y):
    iteration += 1
    print("Iteration ", iteration)
    X_train, Y_train = data_tfidf[train_index], Y.iloc[train_index]
    X_test, Y_test = data_tfidf[test_index], Y.iloc[test_index]
    nbc_clf.fit(X_train, Y_train) # Fitting NBC
    Y_pred = nbc_clf.predict(X_test)
    score = metrics.accuracy_score(Y_test, Y_pred) # Calculating accuracy
    print("Cross-validation accuracy: ", score)
    scores.append(score) # appending cross-validation accuracy for each iteration
nbc_mean_accuracy = np.mean(scores)
print("Mean cross-validation accuracy: ", nbc_mean_accuracy)

Implementing NBC.....
Iteration  1
Cross-validation accuracy:  0.7982456140350878
Iteration  2
Cross-validation accuracy:  0.7105263157894737
Iteration  3
Cross-validation accuracy:  0.7456140350877193
Iteration  4
Cross-validation accuracy:  0.8070175438596491
Iteration  5
Cross-validation accuracy:  0.8157894736842105
Iteration  6
Cross-validation accuracy:  0.7719298245614035
Iteration  7
Cross-validation accuracy:  0.7631578947368421
Iteration  8
Cross-validation accuracy:  0.7631578947368421
Iteration  9
Cross-validation accuracy:  0.8070175438596491
Iteration  10
Cross-validation accuracy:  0.7543859649122807
Mean cross-validation accuracy:  0.7736842105263158


# 4. We are considering only unigrams and bigrams; then only unigrams.
## (min_df=0.0527, ngram_range=(1,2)) --> 60 documents
## (min_df=0.0527, ngram_range=(1,1)) --> 60 documents


In [43]:
tfidf = TfidfVectorizer(min_df=0.0527, ngram_range=(1,2)) 
tfidf.fit(data) # learn vocabulary of entire data
data_tfidf = tfidf.transform(data) # creating tfidf values

In [44]:
print("The created tokens: \n", tfidf.get_feature_names()) #When sklearn.__version__ <= 0.24.x use get_feature_names()
print("Shape of tfidf matrix: ", data_tfidf.shape) #When sklearn.__version__ >= 1.0.x use get_feature_names_out()

The created tokens: 
 ['acting', 'actor', 'character', 'cinematography', 'felt', 'film', 'found', 'found movie', 'humor', 'like', 'movie', 'pacing', 'performance', 'plot', 'really', 'story', 'time']
Shape of tfidf matrix:  (1140, 17)


In [45]:
print("Implementing SVC.....")
# Implementing Support Vector Classifier
svc_clf = LinearSVC() # kernel = 'linear' and C = 1

# Running cross-validation
kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=1) # 10-fold cross-validation
scores=[]
iteration = 0
for train_index, test_index in kf.split(data_tfidf, Y):
    iteration += 1
    print("Iteration ", iteration)
    X_train, Y_train = data_tfidf[train_index], Y.iloc[train_index]
    X_test, Y_test = data_tfidf[test_index], Y.iloc[test_index]
    svc_clf.fit(X_train, Y_train) # Fitting SVC
    Y_pred = svc_clf.predict(X_test)
    score = metrics.accuracy_score(Y_test, Y_pred) # Calculating accuracy
    print("Cross-validation accuracy: ", score)
    scores.append(score) # appending cross-validation accuracy for each iteration
svc_mean_accuracy = np.mean(scores)
print("Mean cross-validation accuracy: ", svc_mean_accuracy)

Implementing SVC.....
Iteration  1
Cross-validation accuracy:  0.7894736842105263
Iteration  2
Cross-validation accuracy:  0.7105263157894737
Iteration  3
Cross-validation accuracy:  0.7894736842105263
Iteration  4
Cross-validation accuracy:  0.8333333333333334
Iteration  5
Cross-validation accuracy:  0.8508771929824561
Iteration  6
Cross-validation accuracy:  0.7807017543859649
Iteration  7
Cross-validation accuracy:  0.7982456140350878
Iteration  8
Cross-validation accuracy:  0.7543859649122807
Iteration  9
Cross-validation accuracy:  0.8157894736842105
Iteration  10
Cross-validation accuracy:  0.7368421052631579
Mean cross-validation accuracy:  0.7859649122807018


In [46]:
print("Implementing NBC.....")
# Implementing Naive Bayes Classifier
nbc_clf = MultinomialNB()

# Running cross-validation
kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=1) # 10-fold cross-validation
scores=[]
iteration = 0
for train_index, test_index in kf.split(data_tfidf, Y):
    iteration += 1
    print("Iteration ", iteration)
    X_train, Y_train = data_tfidf[train_index], Y.iloc[train_index]
    X_test, Y_test = data_tfidf[test_index], Y.iloc[test_index]
    nbc_clf.fit(X_train, Y_train) # Fitting NBC
    Y_pred = nbc_clf.predict(X_test)
    score = metrics.accuracy_score(Y_test, Y_pred) # Calculating accuracy
    print("Cross-validation accuracy: ", score)
    scores.append(score) # appending cross-validation accuracy for each iteration
nbc_mean_accuracy = np.mean(scores)
print("Mean cross-validation accuracy: ", nbc_mean_accuracy)

Implementing NBC.....
Iteration  1
Cross-validation accuracy:  0.7982456140350878
Iteration  2
Cross-validation accuracy:  0.7105263157894737
Iteration  3
Cross-validation accuracy:  0.7456140350877193
Iteration  4
Cross-validation accuracy:  0.8070175438596491
Iteration  5
Cross-validation accuracy:  0.8157894736842105
Iteration  6
Cross-validation accuracy:  0.7719298245614035
Iteration  7
Cross-validation accuracy:  0.7631578947368421
Iteration  8
Cross-validation accuracy:  0.7631578947368421
Iteration  9
Cross-validation accuracy:  0.8070175438596491
Iteration  10
Cross-validation accuracy:  0.7543859649122807
Mean cross-validation accuracy:  0.7736842105263158


## Unigrams only

In [47]:
tfidf = TfidfVectorizer(min_df=0.0527, ngram_range=(1,1)) 
tfidf.fit(data) # learn vocabulary of entire data
data_tfidf = tfidf.transform(data) # creating tfidf values

In [48]:
print("The created tokens: \n", tfidf.get_feature_names()) #When sklearn.__version__ <= 0.24.x use get_feature_names()
print("Shape of tfidf matrix: ", data_tfidf.shape) #When sklearn.__version__ >= 1.0.x use get_feature_names_out()

The created tokens: 
 ['acting', 'actor', 'character', 'cinematography', 'felt', 'film', 'found', 'humor', 'like', 'movie', 'pacing', 'performance', 'plot', 'really', 'story', 'time']
Shape of tfidf matrix:  (1140, 16)


In [49]:
print("Implementing SVC.....")
# Implementing Support Vector Classifier
svc_clf = LinearSVC() # kernel = 'linear' and C = 1

# Running cross-validation
kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=1) # 10-fold cross-validation
scores=[]
iteration = 0
for train_index, test_index in kf.split(data_tfidf, Y):
    iteration += 1
    print("Iteration ", iteration)
    X_train, Y_train = data_tfidf[train_index], Y.iloc[train_index]
    X_test, Y_test = data_tfidf[test_index], Y.iloc[test_index]
    svc_clf.fit(X_train, Y_train) # Fitting SVC
    Y_pred = svc_clf.predict(X_test)
    score = metrics.accuracy_score(Y_test, Y_pred) # Calculating accuracy
    print("Cross-validation accuracy: ", score)
    scores.append(score) # appending cross-validation accuracy for each iteration
svc_mean_accuracy = np.mean(scores)
print("Mean cross-validation accuracy: ", svc_mean_accuracy)

Implementing SVC.....
Iteration  1
Cross-validation accuracy:  0.7894736842105263
Iteration  2
Cross-validation accuracy:  0.7192982456140351
Iteration  3
Cross-validation accuracy:  0.7982456140350878
Iteration  4
Cross-validation accuracy:  0.8333333333333334
Iteration  5
Cross-validation accuracy:  0.8508771929824561
Iteration  6
Cross-validation accuracy:  0.7894736842105263
Iteration  7
Cross-validation accuracy:  0.8070175438596491
Iteration  8
Cross-validation accuracy:  0.7456140350877193
Iteration  9
Cross-validation accuracy:  0.8157894736842105
Iteration  10
Cross-validation accuracy:  0.7368421052631579
Mean cross-validation accuracy:  0.7885964912280702


In [51]:
print("Implementing NBC.....")
# Implementing Naive Bayes Classifier
nbc_clf = MultinomialNB()

# Running cross-validation
kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=1) # 10-fold cross-validation
scores=[]
iteration = 0
for train_index, test_index in kf.split(data_tfidf, Y):
    iteration += 1
    print("Iteration ", iteration)
    X_train, Y_train = data_tfidf[train_index], Y.iloc[train_index]
    X_test, Y_test = data_tfidf[test_index], Y.iloc[test_index]
    nbc_clf.fit(X_train, Y_train) # Fitting NBC
    Y_pred = nbc_clf.predict(X_test)
    score = metrics.accuracy_score(Y_test, Y_pred) # Calculating accuracy
    print("Cross-validation accuracy: ", score)
    scores.append(score) # appending cross-validation accuracy for each iteration
nbc_mean_accuracy = np.mean(scores)
print("Mean cross-validation accuracy: ", nbc_mean_accuracy)

Implementing NBC.....
Iteration  1
Cross-validation accuracy:  0.7982456140350878
Iteration  2
Cross-validation accuracy:  0.7192982456140351
Iteration  3
Cross-validation accuracy:  0.7807017543859649
Iteration  4
Cross-validation accuracy:  0.8070175438596491
Iteration  5
Cross-validation accuracy:  0.8333333333333334
Iteration  6
Cross-validation accuracy:  0.7894736842105263
Iteration  7
Cross-validation accuracy:  0.7719298245614035
Iteration  8
Cross-validation accuracy:  0.7719298245614035
Iteration  9
Cross-validation accuracy:  0.8157894736842105
Iteration  10
Cross-validation accuracy:  0.7543859649122807
Mean cross-validation accuracy:  0.7842105263157895


# The best accuracy is with SVC (min_df=0.0264, ngram_range=(1,1). 
We will save this model.

In [52]:
tfidf = TfidfVectorizer(min_df=0.0264, ngram_range=(1,1)) 
tfidf.fit(data) # learn vocabulary of entire data
data_tfidf = tfidf.transform(data) # creating tfidf values

In [53]:
print("Implementing SVC.....")
# Implementing Support Vector Classifier
svc_clf = LinearSVC() # kernel = 'linear' and C = 1

# Running cross-validation
kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=1) # 10-fold cross-validation
scores=[]
iteration = 0
for train_index, test_index in kf.split(data_tfidf, Y):
    iteration += 1
    print("Iteration ", iteration)
    X_train, Y_train = data_tfidf[train_index], Y.iloc[train_index]
    X_test, Y_test = data_tfidf[test_index], Y.iloc[test_index]
    svc_clf.fit(X_train, Y_train) # Fitting SVC
    Y_pred = svc_clf.predict(X_test)
    score = metrics.accuracy_score(Y_test, Y_pred) # Calculating accuracy
    print("Cross-validation accuracy: ", score)
    scores.append(score) # appending cross-validation accuracy for each iteration
svc_mean_accuracy = np.mean(scores)
print("Mean cross-validation accuracy: ", svc_mean_accuracy)

Implementing SVC.....
Iteration  1
Cross-validation accuracy:  0.8947368421052632
Iteration  2
Cross-validation accuracy:  0.9210526315789473
Iteration  3
Cross-validation accuracy:  0.8596491228070176
Iteration  4
Cross-validation accuracy:  0.956140350877193
Iteration  5
Cross-validation accuracy:  0.9649122807017544
Iteration  6
Cross-validation accuracy:  0.9473684210526315
Iteration  7
Cross-validation accuracy:  0.9736842105263158
Iteration  8
Cross-validation accuracy:  0.9210526315789473
Iteration  9
Cross-validation accuracy:  0.9122807017543859
Iteration  10
Cross-validation accuracy:  0.8859649122807017
Mean cross-validation accuracy:  0.9236842105263158


# Creating SVC on entire data and saving it for deployment

In [54]:
# Creating SVC on entire data and saving it
clf = LinearSVC().fit(data_tfidf, Y)
joblib.dump(clf, 'svc.sav')

['svc.sav']

In [55]:
# Let's export the vocabulary of the TF-IDF matrix
pd.DataFrame(pd.Series(tfidf.get_feature_names())).to_csv('review_vocabulary.csv', header=False, index=False)

In [56]:
model = joblib.load('svc.sav')

In [57]:
review_vocalbury =  pd.read_csv('review_vocabulary.csv', header=None)

In [58]:
vocabulary_dict = {}
for i, word in enumerate(review_vocalbury[0]):
      vocabulary_dict[word] = i
print(vocabulary_dict)
tfidf = TfidfVectorizer(vocabulary = vocabulary_dict,lowercase=False)

{'acting': 0, 'action': 1, 'actor': 2, 'added': 3, 'away': 4, 'boring': 5, 'breathtaking': 6, 'brought': 7, 'character': 8, 'cinematography': 9, 'completely': 10, 'confusing': 11, 'convoluted': 12, 'dialogue': 13, 'disappointed': 14, 'edge': 15, 'effect': 16, 'end': 17, 'entire': 18, 'felt': 19, 'film': 20, 'follow': 21, 'forgettable': 22, 'found': 23, 'hard': 24, 'highly': 25, 'humor': 26, 'incredible': 27, 'incredibly': 28, 'kept': 29, 'laughing': 30, 'left': 31, 'like': 32, 'loved': 33, 'made': 34, 'masterpiece': 35, 'movie': 36, 'outstanding': 37, 'pacing': 38, 'performance': 39, 'plot': 40, 'predictable': 41, 'really': 42, 'recommend': 43, 'scene': 44, 'seat': 45, 'slow': 46, 'special': 47, 'story': 48, 'stunning': 49, 'terrible': 50, 'thought': 51, 'throughout': 52, 'time': 53, 'uninteresting': 54, 'unoriginal': 55, 'wooden': 56}


## Predict reviews using new unlabelled dataset

In [59]:
# Reading new data as dataframe
df = pd.read_csv("Unlabelled.csv")
#pd.set_option('display.max_colwidth', None) # Setting this so we can see the full content of cells
#pd.set_option('display.max_columns', None) # to make sure we can see all the columns in output window


In [60]:
print(df.head())

Unnamed: 0,review
0,The acting was superb and the storyline was ca...
1,"I thought the movie was terrible, with a convo..."
2,This film was a masterpiece of cinematography ...
3,"The special effects were impressive, but the p..."
4,"The character development was excellent, and t..."


In [61]:
df['cleaned_reviews'] = df.review.apply(cleaner)

In [62]:
df = df[df['cleaned_reviews'].map(len) > 0] # removing rows with cleaned reviews of length 0
print("Printing top 5 rows of dataframe showing original and cleaned reviews....")
print(df[['review','cleaned_reviews']].head())

Printing top 5 rows of dataframe showing original and cleaned reviews....
                                              review  \
0  The acting was superb and the storyline was ca...   
1  I thought the movie was terrible, with a convo...   
2  This film was a masterpiece of cinematography ...   
3  The special effects were impressive, but the p...   
4  The character development was excellent, and t...   

                                     cleaned_reviews  
0           [acting, superb, storyline, captivating]  
1  [thought, movie, terrible, convoluted, plot, t...  
2  [film, masterpiece, cinematography, storytelling]  
3       [special, effect, impressive, plot, lacking]  
4  [character, development, excellent, performanc...  


In [63]:
df['cleaned_reviews'] = [" ".join(row) for row in df['cleaned_reviews'].values] # joining tokens to create strings. TfidfVectorizer does not accept tokens as input
data = df['cleaned_reviews']
print(df['cleaned_reviews'].head())

0                  acting superb storyline captivating
1    thought movie terrible convoluted plot terribl...
2         film masterpiece cinematography storytelling
3               special effect impressive plot lacking
4    character development excellent performance st...
Name: cleaned_reviews, dtype: object


In [64]:
tfidf.fit(data)
data_tfidf = tfidf.transform(data)

In [65]:
y_pred = model.predict(data_tfidf)

#### Saving predicted sentiment of reviews to csv

In [67]:
#### Saving predicted sentiment of movie reviews to csv
df['predicted_sentiment'] = y_pred.reshape(-1,1)
df.to_csv('predicted_sentiment.csv', index=False)

## Let's check the model performance using the actual labels for the unlabelled dataset

In [80]:
Unlabelled_with_labels = pd.read_csv("Unlabelled_with_labels.csv")

In [81]:
Unlabelled_with_labels['sentiment'] = Unlabelled_with_labels['sentiment'].map({'positive':1, 'negative':0})
print(Unlabelled_with_labels.head())

Unnamed: 0,review,sentiment
0,The acting was superb and the storyline was ca...,1
1,"I thought the movie was terrible, with a convo...",0
2,This film was a masterpiece of cinematography ...,1
3,"The special effects were impressive, but the p...",0
4,"The character development was excellent, and t...",1


In [82]:
print(Unlabelled_with_labels.isna().sum())

review       0
sentiment    0
dtype: int64

In [83]:
# Extract the labels
y_actual = Unlabelled_with_labels['sentiment'].values

In [84]:
y_actual

array([1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 0, 1, 0], dtype=int64)

In [86]:
#### Saving actual sentiment of movie reviews for comparism with the predictions to csv
df['actual_sentiment'] = y_actual.reshape(-1,1)
df.to_csv('sentiment_pred_actual.csv', index=False)

# Compute the accuracy using the unlabelled dataset and the actual sentiments

In [87]:
score = metrics.accuracy_score(Y_test, Y_pred) # Calculating accuracy
print("The Accuracy using the deployed model with the unlabelled dataset is: ", score)

The Accuracy using the deployed model with the unlabelled dataset is:  0.8859649122807017


## # Implementing UMAP to visualize the labelled dataset

In [88]:
df = pd.read_csv("Labelled.csv",encoding='ISO-8859-1')
df['Sentiment'] = df['Sentiment'].map({'positive':1, 'negative':0})
df['cleaned_reviews'] = df.Review.apply(cleaner)
df = df[df['cleaned_reviews'].map(len) > 0] # removing rows with cleaned tweets of length 0
df['cleaned_reviews'] = [" ".join(row) for row in df['cleaned_reviews'].values] 
data = df['cleaned_reviews']

In [89]:
tfidf = TfidfVectorizer(min_df=0.0264, ngram_range=(1,3)) 
tfidf.fit(data) # learn vocabulary of entire data
data_tfidf = tfidf.transform(data) # creating tfidf values

In [90]:
import umap
import plotly.graph_objs as go
import plotly.figure_factory as ff
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import seaborn as sns
import umap
%matplotlib inline

In [91]:
print("Shape of tfidf matrix: ", data_tfidf.shape)

Shape of tfidf matrix:  (1140, 67)


In [166]:
print(df.columns)

Index(['Review', 'Sentiment', 'cleaned_reviews'], dtype='object')

In [123]:
import plotly.io as pio

In [124]:
u = umap.UMAP(n_neighbors=30, min_dist=0.3)
x_umap = u.fit_transform(data_tfidf)

sentiment = list(df['Sentiment'])
review = list(df['Review'])

data_ = [go.Scatter(x=x_umap[:,0], y=x_umap[:,1], mode='markers',
                    marker = dict(color=df['Sentiment'], colorscale='Rainbow', opacity=0.5),
                                text=[f'Sentiment: {a}<br>News: {b}' for a,b in list(zip(sentiment, review))],
                                hoverinfo='text')]

layout = go.Layout(title = 'UMAP Dimensionality Reduction with n_neighbors=30, min_dist=0.3', width = 800, height = 800,
                    xaxis = dict(title='First Dimension'),
                    yaxis = dict(title='Second Dimension'))
fig = go.Figure(data=data_, layout=layout)

# Save the graph object as a PNG file locally
pio.write_image(fig, file='myplot.png')

fig.show()

## Let us explore two hyperparameters: n_neighbors and min_dist

#### To make exploration simpler we will first write a short utility function that can fit the data with UMAP given a set of parameter choices, and plot the result.

In [181]:
#https://umap-learn.readthedocs.io/en/latest/parameters.html

## Let's explore a range of n_neighbors values and see how they affect the visualization of the data

In [148]:
import os

def explore_umap_n_neighbors(n_neighbors=30, min_dist=0.3, title=''):
    u = umap.UMAP(n_neighbors=n_neighbors, min_dist=min_dist)
    x_umap = u.fit_transform(data_tfidf)
    sentiment = list(df['Sentiment'])
    review = list(df['Review'])
    data_ = [go.Scatter(x=x_umap[:,0], y=x_umap[:,1], mode='markers',
                    marker = dict(color=df['Sentiment'], colorscale='Rainbow', opacity=0.5),
                                text=[f'Sentiment: {a}<br>Review: {b}' for a,b in list(zip(sentiment, review))],
                                hoverinfo='text')]
    layout = go.Layout(title=title, width = 800, height = 800,
                    xaxis = dict(title='First Dimension'),
                    yaxis = dict(title='Second Dimension'))
    fig = go.Figure(data=data_, layout=layout)
    
    # create the images folder if it does not exist
    if not os.path.exists('images_nei'):
        os.makedirs('images_nei')
    
    # Save the graph object as a PNG file locally
    pio.write_image(fig, file=f"images_nei/umap_plot_n_neighbors_{n_neighbors}.png")
    
    fig.show()

In [150]:
for n in (2, 5, 10, 20, 30, 50, 100, 200, 285):
    explore_umap_n_neighbors(n_neighbors=n, title='n_neighbors = {}'.format(n))

## Explore a range of min_dist values

In [151]:
def explore_umap_min_dist(n_neighbors=30, min_dist=0.3, title=''):
    u = umap.UMAP(n_neighbors=n_neighbors, min_dist=min_dist)
    x_umap = u.fit_transform(data_tfidf)
    sentiment = list(df['Sentiment'])
    review = list(df['Review'])
    data_ = [go.Scatter(x=x_umap[:,0], y=x_umap[:,1], mode='markers',
                    marker = dict(color=df['Sentiment'], colorscale='Rainbow', opacity=0.5),
                                text=[f'Sentiment: {a}<br>Review: {b}' for a,b in list(zip(sentiment, review))],
                                hoverinfo='text')]
    layout = go.Layout(title=title, width = 800, height = 800,
                    xaxis = dict(title='First Dimension'),
                    yaxis = dict(title='Second Dimension'))
    fig = go.Figure(data=data_, layout=layout)
    
    # create the images folder if it does not exist
    if not os.path.exists('images_min'):
        os.makedirs('images_min')
    
    # Save the graph object as a PNG file locally
    pio.write_image(fig, file=f"images_min/umap_plot_min_dist_{min_dist}.png")
    
    fig.show()

In [152]:
for d in (0.0, 0.1, 0.2, 0.3, 0.5, 0.6, 0.7, 0.8, 0.9):
    explore_umap_min_dist(min_dist=d, title='min_dist = {}'.format(d))

## THE END