Importing the necessary dependencies

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import time
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import nltk
from nltk.corpus import stopwords


Loading the dataset and reviewing it

In [None]:
# Read the review dataset from a CSV file located in the current working directory.
# Later cells read its 'text' and 'stars' columns.
df = pd.read_csv('Shuffled_CSV.csv')

In [None]:
# Show the freshly loaded DataFrame as the cell output
df

In [None]:
# Print the (rows, columns) dimensions of the dataset
print(df.shape)

In [None]:
# Preview the first five rows of the dataset
Header = df.head(5)
print(Header)

Exploratory data analysis (EDA) and text cleaning

In [None]:
# Count the missing values in every column
missing_mask = df.isna()
null_Values = missing_mask.sum()
print(null_Values)

In [None]:
# Keep only the English reviews.
from langdetect import DetectorFactory, LangDetectException, detect

# langdetect is randomised by default; fixing the seed makes the detected
# language (and therefore the set of dropped rows) repeatable across runs.
DetectorFactory.seed = 0


def detect_language(text):
    """Return the language code langdetect assigns to `text`, or None when detection fails.

    Detection fails for text without usable features (empty strings, digits
    or punctuation only) and for non-string cells such as NaN.
    """
    try:
        return detect(text)
    except (LangDetectException, TypeError):
        return None


# Iterate over the actual index labels (not range(len(df))) so the cell still
# works when the index has gaps, e.g. if it is re-run before the index is reset.
non_english_rows = []
for i, text in df['text'].items():
    language = detect_language(text)
    if language != 'en':
        # Report every row that is about to be removed
        print(language)
        print(text)
        print(i)
        non_english_rows.append(i)

# Remove all non-English (or undetectable) rows in a single step
df = df.drop(index=non_english_rows)
        

In [None]:
# Renumber the rows 0..n-1 after the drops, discarding the old index
df = df.reset_index(drop=True)


In [None]:
# Inspect the text of the row labelled 0
df.loc[0, 'text']

In [None]:
# Show the DataFrame again after the language filtering and index reset
df

In [None]:
# Fetch the NLTK stop-word corpus and keep the English list as a set,
# which gives constant-time membership tests in remove_stop_words.
# (The former ",".join(...) statement was removed: its result was discarded.)
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

In [None]:
def remove_stop_words(x):
    """Return `x` (converted to str) without the words listed in `stop_words`.

    The text is split on whitespace, each word is lower-cased before the
    lookup, and the remaining words are joined back with single spaces.
    """
    kept_words = []
    for token in str(x).split():
        if token.lower() in stop_words:
            continue
        kept_words.append(token)
    return " ".join(kept_words)


# Store the stop-word-free version of every review in a new 'filtered_text' column
df['filtered_text'] = df['text'].apply(remove_stop_words)

In [None]:
# Inspect the cleaned text of the row labelled 0
df.loc[0, 'filtered_text']

In [None]:
# Count the rows that are exact duplicates (across all columns) of an earlier row
duplicates = df.duplicated().sum()
print(duplicates)


Visualizing the dataset

In [None]:
# Bar chart of how many reviews received each star rating
star_counts = df["stars"].value_counts().sort_index()
Bar_plot = star_counts.plot(
    kind='bar',
    title="Count of reviews by stars",
    figsize=(10, 5),
)

Bar_plot.set_xlabel('Review Stars')
Bar_plot.set_ylabel('Number of reviews')
plt.show()

In [None]:
# Sentiment label from the star rating:
# 2 = positive (stars >= 4), 1 = neutral (stars == 3), 0 = negative (anything else)
df["Reviews"] = df["stars"].apply(lambda score: 2 if score >= 4 else 1 if score == 3 else 0)

In [None]:
# Number of reviews per sentiment label
df["Reviews"].value_counts()

In [None]:
# Keep only the model input (cleaned text) and the target (sentiment label)
model_columns = ['filtered_text', 'Reviews']
New_DataFrame = df[model_columns]

In [None]:
# Bar chart of the class balance after mapping stars to sentiment labels.
# The title now names all three plotted classes (it previously mentioned only
# two and was misspelled).
label_counts = df["Reviews"].value_counts().sort_index()
Bar_plot = label_counts.plot(
    kind='bar',
    title="Comparison of negative (0), neutral (1) and positive (2) reviews",
    figsize=(10, 5),
)

Bar_plot.set_xlabel('Type of the review')
Bar_plot.set_ylabel('Number of reviews')
plt.show()

In [None]:
# Preview the first five rows of the two-column modelling frame
New_DataFrame.head(5)

Splitting the dataset into training and test sets

In [None]:
# Stratified 80/20 train/test split with a fixed random seed
features = New_DataFrame['filtered_text']
targets = New_DataFrame['Reviews']
train_Text_Data, test_Text_Data, train_label_Data, test_labels_Data = train_test_split(
    features,
    targets,
    test_size=0.2,
    stratify=targets,  # keep the label proportions equal in both splits
    random_state=1,
)

In [None]:
# Sanity check: number of training texts and number of test labels
print(train_Text_Data.size)
print(test_labels_Data.size)

In [None]:
# Load the pretrained sequence-classification checkpoint and its matching tokenizer by name
model_Name = "cardiffnlp/twitter-roberta-base-sentiment-latest"
tokenizer = AutoTokenizer.from_pretrained(model_Name)
model = AutoModelForSequenceClassification.from_pretrained(model_Name)

In [None]:
# Tokenizing the input text data.
# max_length is passed explicitly so that truncation is applied even when the
# tokenizer configuration does not declare a model maximum length; 512 tokens
# is the usual limit for RoBERTa-base checkpoints (NOTE(review): confirm for
# the checkpoint named in model_Name).
MAX_SEQUENCE_LENGTH = 512
train_text_encodings = tokenizer(
    train_Text_Data.tolist(),
    truncation=True,
    padding=True,
    max_length=MAX_SEQUENCE_LENGTH,
)
test_text_encodings = tokenizer(
    test_Text_Data.tolist(),
    truncation=True,
    padding=True,
    max_length=MAX_SEQUENCE_LENGTH,
)

In [None]:
def _to_tensor_dataset(encodings, labels):
    """Bundle input ids, attention masks and labels (in that order) into a TensorDataset."""
    return TensorDataset(
        torch.tensor(encodings['input_ids']),
        torch.tensor(encodings['attention_mask']),
        torch.tensor(labels.tolist()),
    )


# Wrap the tokenized texts and their labels as tensor datasets
train_Dataset_Object = _to_tensor_dataset(train_text_encodings, train_label_Data)
test_Dataset_Object = _to_tensor_dataset(test_text_encodings, test_labels_Data)

In [None]:
# Mini-batch loaders: training batches are shuffled, test batches keep dataset order
BATCH_SIZE = 4
train_Data_Loader = DataLoader(dataset=train_Dataset_Object, batch_size=BATCH_SIZE, shuffle=True)
test_Data_Loader = DataLoader(dataset=test_Dataset_Object, batch_size=BATCH_SIZE, shuffle=False)

In [None]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
model.to(device)
print(device)

In [None]:
# Training hyperparameters
epochs = 1
learning_rate = 2e-5
weight_decay = 0.01
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=weight_decay)

In [None]:
# Training loop: fine-tunes `model` on train_Data_Loader for `epochs` epochs.
train_losses = []  # running mean loss of the current epoch, recorded after every batch
start_time = time.time()

for epoch in range(epochs):
    model.train()
    epoch_loss = 0

    # Creating a progress bar for batches (batch_idx starts at 1 so it can be used as a divisor)
    progress_bar = tqdm(enumerate(train_Data_Loader, 1), total=len(train_Data_Loader), desc=f'Epoch {epoch + 1}/{epochs}')

    for batch_idx, batch in progress_bar:
        # Move the three tensors of the batch to the training device
        input_ids, attention_mask, Reviews = map(lambda x: x.to(device), batch)

        optimizer.zero_grad()
        # Passing labels makes the model return the loss alongside the logits
        outputs = model(input_ids, attention_mask=attention_mask, labels=Reviews)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()

        # Running average of the training loss over the batches seen so far in this epoch
        avg_epoch_loss = epoch_loss / batch_idx
        train_losses.append(avg_epoch_loss)

        progress_bar.set_postfix({'Training Loss': avg_epoch_loss})

    # Display the time elapsed since training started, measured at the end of each epoch.
    # The value is converted to minutes before printing (the seconds value used
    # to be printed with a "minutes" label).
    elapsed_Total_Time = time.time() - start_time
    total_minutes = elapsed_Total_Time / 60
    print(f'Total Training Time for Epoch {epoch + 1}: {total_minutes:.2f} minutes')


In [None]:
# Save the fine-tuned model to the 'Models' directory and the tokenizer to the
# 'Tokens' directory (both relative to the current working directory)
model.save_pretrained('Models')
tokenizer.save_pretrained('Tokens')

print("Model saved......")

Loading the model to do predictions

In [None]:
# Reload the fine-tuned model and its tokenizer from the directories written when saving
model = AutoModelForSequenceClassification.from_pretrained('Models')
tokenizer = AutoTokenizer.from_pretrained('Tokens')

# Pick the GPU when CUDA is available, else the CPU, and move the model there
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)
print(f"Using device: {device}")

def get_predictions(review, max_length=512):
    """Return the predicted class index for a single review text.

    Args:
        review: The review text to classify.
        max_length: Maximum number of tokens kept; longer inputs are truncated.
            Passed explicitly so truncation does not depend on the tokenizer
            configuration declaring a model maximum length.

    Returns:
        int: index of the highest-scoring class. The notebook's label mapping
        is 2 = positive, 1 = neutral, 0 = negative.

    Uses the module-level `tokenizer`, `model` and `device`.
    """
    # NOTE(review): the training texts had stop words removed ('filtered_text'),
    # while the raw review is used here - confirm whether that mismatch is intended.

    # Make sure dropout is disabled even if `model` was last used for training
    model.eval()

    # Tokenizing the user review
    input_Data = tokenizer(
        review,
        truncation=True,
        padding=True,
        max_length=max_length,
        return_tensors='pt',
    )
    input_ids = input_Data['input_ids'].to(device)
    attention_mask = input_Data['attention_mask'].to(device)

    # Getting the model prediction without tracking gradients
    with torch.no_grad():
        Prediction = model(input_ids, attention_mask=attention_mask)

    # The predicted label is the class with the largest logit
    logits = Prediction.logits
    predicted_class = torch.argmax(logits, dim=1).item()

    return predicted_class


Testing the model

In [None]:
# Sample review 1 to classify with the fine-tuned model
user_Review1 = "Wow that is delicious"

In [None]:
# Classify sample review 1 and print the message for the predicted label
# (2 -> positive, 0 -> negative, anything else -> neutral)
review_Prediction = get_predictions(user_Review1)
feedback_messages = {2: "Positive Feedback.", 0: "Negative Feedback."}
print(feedback_messages.get(review_Prediction, "Neutral Feedback"))

In [None]:
# Sample review 2 to classify with the fine-tuned model
user_Review2 = "Too spicy for my taste"

In [None]:
# Classify sample review 2 and print the message for the predicted label
# (2 -> positive, 0 -> negative, anything else -> neutral)
review_Prediction = get_predictions(user_Review2)
feedback_messages = {2: "Positive Feedback.", 0: "Negative Feedback."}
print(feedback_messages.get(review_Prediction, "Neutral Feedback"))

In [None]:
# Sample review 3 to classify with the fine-tuned model
user_Review3 = "Exceptional service"

In [None]:
# Classify sample review 3 and print the message for the predicted label
# (2 -> positive, 0 -> negative, anything else -> neutral)
review_Prediction = get_predictions(user_Review3)
feedback_messages = {2: "Positive Feedback.", 0: "Negative Feedback."}
print(feedback_messages.get(review_Prediction, "Neutral Feedback"))

In [None]:
# Sample review 4 to classify with the fine-tuned model
user_Review4 = "Causal atmosphere, average food"

In [None]:
# Classify sample review 4 and print the message for the predicted label
# (2 -> positive, 0 -> negative, anything else -> neutral)
review_Prediction = get_predictions(user_Review4)
feedback_messages = {2: "Positive Feedback.", 0: "Negative Feedback."}
print(feedback_messages.get(review_Prediction, "Neutral Feedback"))

In [None]:
# Sample review 5 to classify with the fine-tuned model
user_Review5 = "Music was too loud"

In [None]:
# Classify sample review 5 and print the message for the predicted label
# (2 -> positive, 0 -> negative, anything else -> neutral)
review_Prediction = get_predictions(user_Review5)
feedback_messages = {2: "Positive Feedback.", 0: "Negative Feedback."}
print(feedback_messages.get(review_Prediction, "Neutral Feedback"))

Evaluating the model

In [None]:
# Collect the predicted and the true labels over the whole test set
model.eval()
review_Predictions = []
ground_Truth = []

with torch.no_grad():
    for batch in test_Data_Loader:
        # Each batch holds (input ids, attention mask, labels); move all three to the device
        input_ids, attention_mask, Reviews = [tensor.to(device) for tensor in batch]
        prediction_Results = model(input_ids, attention_mask=attention_mask)
        logits = prediction_Results.logits
        # The predicted label is the class with the largest logit
        predicted_labels = torch.argmax(logits, dim=1)
        review_Predictions.extend(predicted_labels.cpu().numpy())
        ground_Truth.extend(Reviews.cpu().numpy())


In [None]:
# Overall accuracy: fraction of test reviews whose predicted label equals the true label
test_Data_Accuracy = accuracy_score(ground_Truth, review_Predictions)
print(f"The test data accuracy is : {test_Data_Accuracy}")

In [None]:
# Calculating the confusion matrix.
# The label order is fixed to [0, 1, 2] (negative, neutral, positive) so the
# matrix is always 3x3, even if one class never occurs in the test labels or
# in the predictions.
confusion = confusion_matrix(ground_Truth, review_Predictions, labels=[0, 1, 2])

In [None]:
# Plotting the confusion matrix as an annotated heat map
class_names = ['Negative Reviews', "Neutral Reviews", 'Positive Reviews']

plt.figure(figsize=(8, 6))
sns.heatmap(
    confusion,
    annot=True,
    fmt='d',
    cmap='viridis',
    xticklabels=class_names,
    yticklabels=class_names,
)
plt.title('Model Results')
plt.xlabel('Predicted Reviews')
plt.ylabel('Ground Truth')

plt.show()

In [None]:
# Per-class precision, recall and F1.
# labels=[0, 1, 2] pins the class order so the three target names always line
# up with the label ids, even if a class is absent from the test set or the
# predictions. The misspelled 'Postive' label is corrected.
print(classification_report(
    ground_Truth,
    review_Predictions,
    labels=[0, 1, 2],
    target_names=['Negative reviews', 'Neutral Reviews', 'Positive reviews'],
))