<a href="https://colab.research.google.com/github/myingineer/sentiment_analysis_with_NLP/blob/main/sentiment_analysis_with_NLP_A_S_Emeka_Akam_4242769.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np

Reading the CSV

In [2]:
train_df = pd.read_csv('/content/Train.csv')
test_df = pd.read_csv('/content/Test.csv')

In [3]:
train_df.shape

(40000, 2)

PRE-PROCESSING STEPS
- Text Cleaning (Removing stop words, html e.t.c.)
- Vectorization

In [4]:
# TEXT CLEANING

import re # to match regular expression
import nltk # natural language toolkit
nltk.download('punkt')
nltk.download('punkt_tab')

from nltk.corpus import stopwords # words that are to be excluded
nltk.download('stopwords')

from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [5]:
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

In [6]:

def clean_text(text):
  '''
    This function uses Regular Expressions to clean the sentences and apply stemming to the words
    that are not part of the stop words in the library in order to reduce the size of the vocabulary.

    Input: text (str)
    Output: tokens (str)
  '''

  text = re.sub(r'<.*?>', '', text) # remove the HTML tags
  text = re.sub(r'http\S+|http\s+', '', text)  # remove URLs (normal or incorrect urls with spaces)
  text = re.sub(r'[^a-zA-Z0-9\s]', '', text) # remove non-alphanumeric characters except spaces
  text = re.sub(r'\s+', ' ', text) # remove multiple whitespaces
  text = text.strip() # trim leading white spaces

  tokens = text.lower().split() # converts the texts to lowercase then splits each individual word
  tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words] # applies stemming to words that are not part of the stop words
  return ' '.join(tokens)

In [7]:
def preprocess_corpus(texts):
  '''
    This function takes a sentence and calls the clean_text function to clean it.
    It then returns the cleaned sentence as a list

    Input: sentence (str)
    Output: cleaned_sentence (str)
  '''
  return [clean_text(text) for text in texts]

In [8]:
# VECTORIZATION

from sklearn.feature_extraction.text import TfidfVectorizer


def vectorize_texts(texts):
  '''
    This function vectorizes the texts and returns the shape of the
    vectorized texts and the vectorizer object.

    Input: texts (str)
    Output: X (array), vectorizer (object)
  '''
  vectorizer = TfidfVectorizer()
  X = vectorizer.fit_transform(texts)
  return X, vectorizer

MODELLING.
In this section, we perform three things,
  - train the model
  - test the model
  - evaluate the model

In [9]:
"""
  Training dataset
  The training dataset has a shape of (40000, 2)
"""

# PREPARING THE TRAIN DATASET
X_train = train_df['text'] # text column as the input
X_train = preprocess_corpus(X_train) # preprocessing the input
X_train, vectorizer = vectorize_texts(X_train) # vectorizing the input

y_train = train_df['label'] # labek column as the output

In [10]:
"""
  Testing data set
  The testing dataset has a shape of (5000, 2)
"""

# PREPARING THE TESTING DATASET
X_test = test_df['text'] # text column as the input
X_test = preprocess_corpus(X_test) # preprocessing the input
X_test = vectorizer.transform(X_test) # vectorizing the input

y_test = test_df['label'] # label column as the output

I would be usin different classification algorithms, to find out which one performs the best

In [11]:
'''
  The Algorithm used here is Linear Support Vector Classification
'''

# imports for linearSVC
from sklearn.svm import LinearSVC

# Training the model
svm = LinearSVC()
svm.fit(X_train, y_train)

# Testing the model
y_pred_svm = svm.predict(X_test)

# Evaluating the model
from sklearn.metrics import confusion_matrix, classification_report
print("The confusion matrix for the Linear SVC algorithm is: " + "\n" + str(confusion_matrix(y_test, y_pred_svm)))
print("\n" + "The classification report for the Linear SVC algorithm is: " + "\n" + str(classification_report(y_test, y_pred_svm)))

The confusion matrix for the Linear SVC algorithm is: 
[[2222  273]
 [ 233 2272]]

The classification report for the Linear SVC algorithm is: 
              precision    recall  f1-score   support

           0       0.91      0.89      0.90      2495
           1       0.89      0.91      0.90      2505

    accuracy                           0.90      5000
   macro avg       0.90      0.90      0.90      5000
weighted avg       0.90      0.90      0.90      5000



In [12]:
"""
  The algorithm used here is Multinomial Naive Bayes
"""

# Imports for Multinomial NB
from sklearn.naive_bayes import MultinomialNB

# Training the model
mnb = MultinomialNB()
mnb.fit(X_train, y_train)

# Testing the model
y_pred_mnb = mnb.predict(X_test)

# Evaluating the model
print("The confusion matrix for the Multinomial NB algorithm is: " + "\n" + str(confusion_matrix(y_test, y_pred_mnb)))
print("\n" + "The classification report for the Multinomial NB algorithm is: " + "\n" + str(classification_report(y_test, y_pred_mnb)))

The confusion matrix for the Multinomial NB algorithm is: 
[[2196  299]
 [ 359 2146]]

The classification report for the Multinomial NB algorithm is: 
              precision    recall  f1-score   support

           0       0.86      0.88      0.87      2495
           1       0.88      0.86      0.87      2505

    accuracy                           0.87      5000
   macro avg       0.87      0.87      0.87      5000
weighted avg       0.87      0.87      0.87      5000



In [13]:
"""
  The algorithm used here is Logistic Regression
"""

# Imports
from sklearn.linear_model import LogisticRegression

# Training the model
lr = LogisticRegression()
lr.fit(X_train, y_train)

# Testing the model
y_pred_lr = lr.predict(X_test)

# Evaluating the model
print("The confusion matrix for the Logistic Regression algorithm is: " + "\n" + str(confusion_matrix(y_test, y_pred_lr)))
print("\n" + "The classification report for the Logistic Regression algorithm is: " + "\n" + str(classification_report(y_test, y_pred_lr)))

The confusion matrix for the Logistic Regression algorithm is: 
[[2212  283]
 [ 241 2264]]

The classification report for the Logistic Regression algorithm is: 
              precision    recall  f1-score   support

           0       0.90      0.89      0.89      2495
           1       0.89      0.90      0.90      2505

    accuracy                           0.90      5000
   macro avg       0.90      0.90      0.90      5000
weighted avg       0.90      0.90      0.90      5000



In [27]:
import joblib

# Save the pre-trained models
joblib.dump(svm, 'svm_model.pkl')
joblib.dump(mnb, 'mnb_model.pkl')
joblib.dump(lr, 'lr_model.pkl')

# Save the trained vectorizer
joblib.dump(vectorizer, 'vectorizer.pkl')

['vectorizer.pkl']

Here we use the model to make future predictions

In [19]:
# Load the model
model = joblib.load('/content/svm_model.pkl')

# Load the vectorizer
vectorizer = joblib.load('/content/vectorizer.pkl')

def predict_sentiment(text):
    """
    Predict the sentiment of a given text using a pre-trained SVM model.

    Args:
        text (str): The input text to analyze.

    Returns:
        str: The predicted sentiment label ('positive' or 'negative').
    """
    # Preprocess the input text
    cleaned_text = preprocess_corpus([text])

    # Transform the cleaned text using the same vectorizer used during training
    vectorized_text = vectorizer.transform(cleaned_text)

    # Make a prediction using the pre-trained SVM model
    prediction = model.predict(vectorized_text)

    # Map the prediction to sentiment labels
    sentiment_label = 'positive' if prediction[0] == 1 else 'negative'

    return sentiment_label

In [26]:
test = 'i am satisfied with the movie'
print(predict_sentiment(test))

positive
