# Coursework 1: Train a Sentiment Analysis Classifier
In this project, I have trained a sentiment analysis classifier for movie reviews. The sample code below builds a simple classifier that uses tf-idf to vectorize text and a logistic regression model to make predictions.

# Load the appropriate libraries

In [1]:
# load data and take a quick look
import pandas as pd
import numpy as np
import re, string, nltk
from nltk.corpus import wordnet as wn
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn import model_selection, naive_bayes, svm
from sklearn.preprocessing import MaxAbsScaler
from nltk.stem import PorterStemmer
from sklearn.metrics import precision_recall_fscore_support,accuracy_score
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
from collections import defaultdict
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report 
import warnings
warnings.filterwarnings("ignore")

# Creating the functions that are responsible for the cleaning

in the first step, I have created two functions. The first one will take the data that we have and will apply some preprocessing techniques to clean it. basically, in the first line, I am creating a copy of the data where always when we have data we need to create copy and work on the copy one to preserve the original data from being corrupted. after that, I have defined the patterns I am looking for for preprocessing the data. some words have upper case so by using .strip and lower() function i am converting the line to a lower case one. After that, some of them have opostrophes that needs to be cleaned as well that's when i use the handle function that is being created later that handles all the words that have opostrophes into regular words ex: shouldn't to should not. After that, I have to do two things which is removing the punctuation that can be done using string.punctuation and the stopwrods.

the second function which is straightforwd i am creating the lemmatization function and returns the processed data

In [2]:
# cleaning text and preprocess data
def preprocessor(data):
    
    #creating a copy of the data to preserve the original data
    data_copy = data
    
    #the pattern we are looking for
    pattern1 = '<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});'
    pattern2 = r'[^a-zA-Z\s]'
    
    #processing every line using the pattern set
    data_copy = [re.sub(re.compile(pattern1), '', new_line) for new_line in data_copy]
    data_copy = [re.sub(pattern2, ' ', new_line) for new_line in data_copy]
    
    #stripping white space and also after that converting the words into lowe case
    lower_case_conversion = [new_line.strip().lower() for new_line in data_copy]
    
    # in here replaceing apostrophes that we have with words
    processing = []
    for new_line in lower_case_conversion:
        new_line = new_line.replace("-", " ")
        handled_word = [handle[new_word] if new_word in handle else new_word for new_word in new_line.split()]
        processing.append(" ".join(handled_word))
        
    #also punctuations needs to be removed so this line will do so
    processing = [new_line.translate(str.maketrans('', '', string.punctuation)) for new_line in processing] 
    
    # finally, stopwrods are being removed using the join function
    stops = set(stopwords.words("english"))
    processing = [" ".join([new_word for new_word in new_line.split() if new_word not in stops]) for new_line in processing]
    
    #returning the fully processed text
    return processing

def lemmatize(data):
    
    #creating copy of the data
    data_copy = data
    
    # lemmatization process
    lemmatizer=WordNetLemmatizer()
    processed_data = [" ".join([lemmatizer.lemmatize(new_word) for new_word in new_line.split()]) for new_line in data_copy]
    
    #return the processed lemmitization data
    return processed_data

# Creating the handle function

as stated before, there are some words with opostrophes ( hasen't,aren't) these words needs to be handled so I have created a variable for it which contains almost all, if I did not forget any, words that has opostrophes that needs to be handled ex: he'd to he would

In [3]:
#some words needs to be handled like: hasn't to has not so this is a full list of all the word of the same form that needs to be
#handled
handle = {
"hasn\'t" : "has not",
"he\'s" : "he is",
"mightn\'t" : "might not",
"aren\'t" : "are not",
"can\'t" : "cannot",
"couldn\'t" : "could not",
"didn\'t" : "did not",
"haven\'t" : "have not",
"he\'d" : "he would",
"he\'ll" : "he will",
"doesn\'t" : "does not",
"don\'t" : "do not",
"hadn\'t" : "had not",
"i\'d" : "I would",
"i\'d" : "I had",
"i\'ll" : "I will",
"mustn\'t" : "must not",
"shan\'t" : "shall not",
"she\'d" : "she would",
"she\'ll" : "she will",
"i\'m" : "I am",
"isn\'t" : "is not",
"it\'s" : "it is",
"let\'s" : "let us",
"she\'s" : "she is",
"shouldn\'t" : "should not",
"that\'s" : "that is",
"we\'d" : "we would",
"we\'re" : "we are",
"weren\'t" : "were not",
"we\'ve" : "we have",
"there\'s" : "there is",
"they\'d" : "they would",
"it\'ll":"it will",
"i\'ve" : "I have",
"\'re": " are",
"wasn\'t": "was not",
"we\'ll":" will",
"they\'ll" : "they will",
"they\'re" : "they are",
"they\'ve" : "they have",
"what\'ll" : "what will",
"what\'re" : "what are",
"who\'d" : "who would",
"who\'ll" : "who will",
"who\'re" : "who are",
"who\'s" : "who is",
"who\'ve" : "who have",
"what\'s" : "what is",
"what\'ve" : "what have",
"where\'s" : "where is",
"won\'t" : "will not",
"wouldn\'t" : "would not",
"you\'d" : "you would",
"you\'ll" : "you will",
"you\'re" : "you are",
"you\'ve" : "you have",
"didn\'t": "did not"
}

# Reading data

In [4]:
#getting the data from the datafile and displaying the first 5 rows
raw = pd.read_csv('coursework1_train.csv')
raw.head()

Unnamed: 0.1,Unnamed: 0,text,sentiment
0,0,Enjoy the opening credits. They're the best th...,neg
1,1,"Well, the Sci-Fi channel keeps churning these ...",neg
2,2,It takes guts to make a movie on Gandhi in Ind...,pos
3,3,The Nest is really just another 'nature run am...,neg
4,4,Waco: Rules of Engagement does a very good job...,pos


# cleaning table

as we can see when we called the table, there is a column that has unnamed: 0 and another that is sentiment. sentiment is very important because this is the column that will run our learning model on but the other one needs to be deleted so in the first line i am trying to get rid of null values then I have deleted the unnamed: 0 column with the del functionality then the third one the sentiment because its not in a binary form ( number form ) its pos/neg i have transformed it into numbers where positive is 1 and negative is 0

In [5]:
#in here i am trying to check if there are any null data
raw.isnull().sum()

#deleting unnamed 0 if any 
if('Unnamed: 0' in raw.columns):
    del raw['Unnamed: 0']

raw.sentiment.value_counts()

#renaming the values of the sentiment from pos/neg to 1/0
raw['sentiment'] = raw.sentiment.map(lambda x: 1 if x =='pos' else 0)

In [6]:
# check the size of the data and its class distribution
all_text = raw['text'].tolist()
all_lables = raw['sentiment'].tolist()

print('entry num', len(all_text))
print('num of pos entries', len([l for l in all_lables if l==1]))
print('num of neg entries', len([l for l in all_lables if l==0]))

entry num 40000
num of pos entries 20000
num of neg entries 20000


# Processing data

in this step, its just giving the data ( all_text ) to the two functions that I have created earlier for preprocessing

In [7]:
#now using the preprocessing function created with the lemmatization
data_processed = preprocessor(all_text)
data_processed = lemmatize(data_processed)

# Returning values

after applying the functions, I have returned the values to the original data 

In [8]:
#adding the processed data to the raw data as text
raw['text'] = data_processed

# Splitting data

pretty straightforwd plitting the data into testing and training based on the text and the sentiment where the test size will be 25% of the data and the training data is 75%, and the random state is dd/mm

In [9]:
#splitting the data into training and testing while setting the random state as dd/mm
X_train, X_test, y_train, y_test = train_test_split(raw['text'], raw['sentiment'], test_size=0.25, random_state=408)

In [10]:
# tf-idf initialization
train_vectorizer = TfidfVectorizer(max_features=5000,use_idf=True)
train_vecs = train_vectorizer.fit_transform(X_train)
test_vecs = TfidfVectorizer(max_features=5000,vocabulary=train_vectorizer.vocabulary_).fit_transform(X_test)

tfidf_feature_names = train_vectorizer.get_feature_names()

In [11]:
count_vect = CountVectorizer(max_features=5000)
X_train_vectorized = count_vect.fit_transform(X_train)
X_test_vectorized = CountVectorizer(max_features=5000,vocabulary=count_vect.vocabulary_).fit_transform(X_test)

cv_feature_names = count_vect.get_feature_names()

# Hyperparameter tuning

after creating and initializing the model, we need to tune it using couple of tuning techniques and params so this is where this step comes in

In [12]:
#in here, creating the log hyperparameters dictionary with the penalty which is l1 l2 then creating the log classifier with
#gridsearchvs
log_hyperparameters = dict(C=[0.001, 0.01, 0.1, 1, 10, 100, 1000], penalty=['l1', 'l2'])
log_clf = GridSearchCV(LogisticRegression(), log_hyperparameters, cv=10, verbose=0)

# Training the model

First, I have chose LogicsticRegression as my model and I fit the vectorized data with the y_train in the logistics regression model. After that, I am choosing the best C and best penalty for the model. when this steps are done, I have created another classifier with the corresponding C and penalty to fit the data again and after that I have predicted the model accuracy according to it. The final thing I did was creating the confusion matrix and getting a classification report

In [13]:
# training: tf-idf + logistic regression
# train model

model = log_clf.fit(train_vecs, y_train)
penalty = model.best_estimator_.get_params()['penalty']
c = model.best_estimator_.get_params()['C']
print('the Penalty to choose :', penalty)
print('the C to choose for battle:', c)

clf = LogisticRegression(C=c,penalty=penalty).fit(train_vecs, y_train)

# test model
prediction_test_value = clf.predict(test_vecs)

acc = accuracy_score(y_test, prediction_test_value)
print('the accuracy', acc)
# confusion matrix
print(confusion_matrix(y_test, prediction_test_value))

# classification_report
print(classification_report(y_test, prediction_test_value))

the Penalty to choose : l2
the C to choose for battle: 1
the accuracy 0.8833
[[4368  624]
 [ 543 4465]]
              precision    recall  f1-score   support

           0       0.89      0.88      0.88      4992
           1       0.88      0.89      0.88      5008

    accuracy                           0.88     10000
   macro avg       0.88      0.88      0.88     10000
weighted avg       0.88      0.88      0.88     10000



# Getting the weight classification

in this step, I have classified some weights based on how well they do like worst,waste,perfect... so I have used the enumerate to get the weights then sort them using the sort functionality and return them in a dataframe format

In [14]:
weights = {}
for (index,weight) in enumerate(tfidf_feature_names):
    weights[weight] = abs(clf.coef_[0][index])

sorted_weights = {key: value for key, value in sorted(weights.items(), key=lambda item: item[1], reverse = True)}

df = pd.DataFrame([sorted_weights])
df.iloc[:,0:10]

Unnamed: 0,worst,waste,bad,awful,excellent,great,boring,terrible,poor,perfect
0,9.479449,7.38444,7.332046,6.988097,6.753663,6.585347,6.350889,5.297323,5.229483,5.104207


# Creating another model

in here, I am creating another one using countvectorizer with logistic regression where before it was tf-id with logistic regression. same steps applies with the steps and everything

In [15]:
# training: CountVectorizer + logistic regression
# train model

model = log_clf.fit(train_vecs, y_train)
penalty = model.best_estimator_.get_params()['penalty']
c = model.best_estimator_.get_params()['C']
print('the best penalty:', penalty)
print('the best C to choose for battle:', c)

clf = LogisticRegression(C=c,penalty=penalty).fit(X_train_vectorized, y_train)

# test model
prediction_test_value = clf.predict(X_test_vectorized)

acc = accuracy_score(y_test, prediction_test_value)
print('the accuracy is:', acc)
# confusion matrix
print(confusion_matrix(y_test, prediction_test_value))

# classification_report
print(classification_report(y_test, prediction_test_value))

the best penalty: l2
the best C to choose for battle: 1
the accuracy is: 0.8655
[[4309  683]
 [ 662 4346]]
              precision    recall  f1-score   support

           0       0.87      0.86      0.87      4992
           1       0.86      0.87      0.87      5008

    accuracy                           0.87     10000
   macro avg       0.87      0.87      0.87     10000
weighted avg       0.87      0.87      0.87     10000



In [16]:
weights = {}
for (index,weight) in enumerate(cv_feature_names):
    weights[weight] = abs(clf.coef_[0][index])

sorted_weights = {key: value for key, value in sorted(weights.items(), key=lambda item: item[1], reverse = True)}

df = pd.DataFrame([sorted_weights])
df.iloc[:,0:10]

Unnamed: 0,waste,forgettable,fails,refreshing,sensitive,worst,disappointment,notch,mildly,unwatchable
0,2.354888,2.232165,2.147537,2.03489,2.008938,1.997186,1.981454,1.977147,1.965383,1.961699


# Naive bayes with tf-idf

as the title suggest, I have create another classifier with Naive bayes and tf-id

In [17]:
# training: tf-idf + Naive Bayes
# train model
clf = naive_bayes.MultinomialNB()
clf.fit(train_vecs, y_train)

# test model
prediction_test_value = clf.predict(test_vecs)
acc = accuracy_score(y_test, prediction_test_value)
print('the accuracy is:', acc)

# confusion matrix
print(confusion_matrix(y_test, prediction_test_value))

# classification_report
print(classification_report(y_test, prediction_test_value))

the accuracy is: 0.8519
[[4232  760]
 [ 721 4287]]
              precision    recall  f1-score   support

           0       0.85      0.85      0.85      4992
           1       0.85      0.86      0.85      5008

    accuracy                           0.85     10000
   macro avg       0.85      0.85      0.85     10000
weighted avg       0.85      0.85      0.85     10000



# CountVectorizer with Naive Bayes

in here, I have used CountVectorizer with Naive Bayes

In [18]:
# training: CountVectorizer + Naive Bayes
# train model

clf = naive_bayes.MultinomialNB()
clf.fit(X_train_vectorized, y_train)

# test model
prediction_test_value = clf.predict(X_test_vectorized)
acc = accuracy_score(y_test, prediction_test_value)
print('the accuracy is:', acc)

# confusion matrix
print(confusion_matrix(y_test, prediction_test_value))

# classification_report
print(classification_report(y_test, prediction_test_value))

the accuracy is: 0.8441
[[4237  755]
 [ 804 4204]]
              precision    recall  f1-score   support

           0       0.84      0.85      0.84      4992
           1       0.85      0.84      0.84      5008

    accuracy                           0.84     10000
   macro avg       0.84      0.84      0.84     10000
weighted avg       0.84      0.84      0.84     10000



## SAVE YOUR TRAINED MODEL


In [19]:
import pickle

# save model and other necessary modules
all_info_want_to_save = {
    'model': clf,
    'vectorizer': TfidfVectorizer(max_features=5000,vocabulary=train_vectorizer.vocabulary_),
    'negate_handle' : handle,
    'feature_names' : tfidf_feature_names
}
with open("sample_trained_model.pickle", "wb") as f:
    pickle.dump(all_info_want_to_save, f)