# Training the Model

In [3]:
!pip install scikit-learn

Collecting scikit-learn
  Downloading scikit_learn-1.4.2-cp311-cp311-win_amd64.whl.metadata (11 kB)
Collecting scipy>=1.6.0 (from scikit-learn)
  Downloading scipy-1.13.0-cp311-cp311-win_amd64.whl.metadata (60 kB)
     ---------------------------------------- 0.0/60.6 kB ? eta -:--:--
     ------ --------------------------------- 10.2/60.6 kB ? eta -:--:--
     ------ --------------------------------- 10.2/60.6 kB ? eta -:--:--
     -------------------------------------- 60.6/60.6 kB 811.9 kB/s eta 0:00:00
Collecting threadpoolctl>=2.0.0 (from scikit-learn)
  Downloading threadpoolctl-3.4.0-py3-none-any.whl.metadata (13 kB)
Downloading scikit_learn-1.4.2-cp311-cp311-win_amd64.whl (10.6 MB)
   ---------------------------------------- 0.0/10.6 MB ? eta -:--:--
   ---------------------------------------- 0.1/10.6 MB 3.4 MB/s eta 0:00:04
    --------------------------------------- 0.2/10.6 MB 3.0 MB/s eta 0:00:04
   - -------------------------------------- 0.3/10.6 MB 3.2 MB/s eta 0:00:04


In [1]:
!pip install scipy joblib



In [6]:
import pandas as pd
import scipy.sparse as sp
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Load the labelled training data.
df = pd.read_csv('training_dataset.csv')

# TF-IDF over the review text: vocabulary capped at 2000 terms, English stop words removed.
tfidf_vectorizer = TfidfVectorizer(max_features=2000, stop_words='english')

# Split the frame into the raw text, every remaining feature column, and the label.
X_text = df['review_text']
X_other = df.drop(columns=['review_text', 'Target'])
y = df['Target']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
# Text, other features and labels are split together so their rows stay aligned.
X_text_train, X_text_test, X_other_train, X_other_test, y_train, y_test = train_test_split(
    X_text, X_other, y, test_size=0.2, random_state=42
)

# Learn the vocabulary on the training split only, then apply it unchanged to the test split.
X_text_train_tfidf = tfidf_vectorizer.fit_transform(X_text_train)
X_text_test_tfidf = tfidf_vectorizer.transform(X_text_test)

# Final feature matrix: the non-text columns first, followed by the TF-IDF columns.
# Any later scoring code must build its matrix in the same column order.
X_train = sp.hstack([X_other_train.values, X_text_train_tfidf])
X_test = sp.hstack([X_other_test.values, X_text_test_tfidf])

# Train a logistic regression model.
# `multi_class` is intentionally not passed: 'auto' is already the default, and the
# parameter is deprecated in newer scikit-learn releases, where passing it emits a warning.
# NOTE(review): the non-text columns are stacked unscaled next to TF-IDF weights, which is
# presumably why such a large max_iter is needed - consider scaling them; confirm.
model = LogisticRegression(max_iter=10000)
model.fit(X_train, y_train)

# Predict on the held-out split.
y_pred = model.predict(X_test)

# Report overall accuracy and the per-class precision/recall/F1.
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
print("Classification Report:")
print(classification_report(y_test, y_pred))

Accuracy: 0.6621173748553919
Classification Report:
              precision    recall  f1-score   support

           1       0.60      0.35      0.44     24427
           2       0.68      0.85      0.76     39539

    accuracy                           0.66     63966
   macro avg       0.64      0.60      0.60     63966
weighted avg       0.65      0.66      0.64     63966



In [7]:
from joblib import dump, load

# Persist the fitted logistic regression (`model`) to disk so it can be reloaded later.
dump(model, 'final_model.joblib')

['final_model.joblib']

In [8]:
# Persist the fitted TF-IDF vectorizer to disk; the scoring cell reloads it to transform new text.
dump(tfidf_vectorizer, 'tfidf_vectorizer_model.joblib')

['tfidf_vectorizer_model.joblib']

In [9]:
# Restore the persisted artifacts: the fitted classifier and the fitted TF-IDF vectorizer.
model = load('final_model.joblib')
tfidf_vectorizer = load('tfidf_vectorizer_model.joblib')

# Read the dataset that is going to be scored.
new_df = pd.read_csv('test_dataset.csv')

# Vectorize the review text with the already-fitted vocabulary (transform only, no refit).
X_text_new = new_df['review_text']
X_text_new_tfidf = tfidf_vectorizer.transform(X_text_new)

# Build the feature matrix: every column except the text and the label, then the TF-IDF columns.
# NOTE(review): assumes these non-text columns match the ones used at training time,
# in both number and order - confirm against the training data.
X_other_new = new_df.drop(columns=['review_text', 'Target'])
X_new = sp.hstack([X_other_new.values, X_text_new_tfidf])

# Predict a class for every row and store it next to the original columns.
y_pred_new = model.predict(X_new)
new_df['Predicted_Target'] = y_pred_new

# Write the scored dataset (original columns plus Predicted_Target) to CSV, without the index.
new_df.to_csv('new_dataset_predicted.csv', index=False)

In [10]:
# Preview the first rows of the scored dataset.
new_df.head()

Unnamed: 0,review_text,Word_Count,Slang_Count,Character_Count,Punctuation_Count,Emoticon_Count,Emotion_Punctuation_Count,Capital_Letter_Count,Positive_Words_Count,Negative_Words_Count,Neutral_Words_Count,Target,Predicted_Target
0,I enjoy vintage books and movies so I enjoyed ...,57,0,240,6,0,0,5,3,1,49,1,2
1,This book is a reissue of an old one; the auth...,118,0,375,36,0,0,11,5,3,72,2,2
2,This was a fairly interesting read. It had ol...,72,0,311,15,0,0,7,6,1,55,2,2
3,I'd never read any of the Amy Brewster mysteri...,23,0,81,4,0,0,5,0,0,20,2,2
4,"If you like period pieces - clothing, lingo, y...",28,0,106,6,0,0,2,2,0,22,0,2


In [11]:
# Load the file holding the human rating and the BERT / VADER sentiment score for each review.
temp_df = pd.read_csv('final_dataset_with_bert_vader.csv')

# Preview the first rows.
temp_df.head()

Unnamed: 0,review_text,human_rating,BERT_Sentiment,VADER_Sentiment
0,I enjoy vintage books and movies so I enjoyed ...,5,4,5
1,This book is a reissue of an old one; the auth...,4,4,5
2,This was a fairly interesting read. It had ol...,4,4,5
3,I'd never read any of the Amy Brewster mysteri...,5,5,3
4,"If you like period pieces - clothing, lingo, y...",4,4,4


In [12]:
# Columns to bring over from `temp_df`: the human rating and the two sentiment scores.
rating_columns = ['human_rating', 'VADER_Sentiment', 'BERT_Sentiment']
needed_columns = temp_df[rating_columns]

# pd.concat(axis=1) aligns rows on the index and silently fills NaN where the labels differ,
# so fail loudly if the two frames do not share the same index.
# NOTE(review): this only checks the index; it does not prove that row i of both files is the
# same review - confirm the two CSVs were produced in the same row order.
if not new_df.index.equals(needed_columns.index):
    raise ValueError(
        "new_df and temp_df have different indexes "
        f"({len(new_df)} vs {len(needed_columns)} rows); cannot merge column-wise"
    )

# Drop any copies of these columns left by an earlier run of this cell, so that re-running
# the cell never produces duplicate column names.
new_df = pd.concat([new_df.drop(columns=rating_columns, errors='ignore'), needed_columns], axis=1)

# Check the number of rows and columns in the merged DataFrame
num_rows, num_columns = new_df.shape
print("Number of rows:", num_rows)
print("Number of columns:", num_columns)

# Preview the last rows, which now include the rating and sentiment columns.
new_df.tail()

Number of rows: 641171
Number of columns: 16


Unnamed: 0,review_text,Word_Count,Slang_Count,Character_Count,Punctuation_Count,Emoticon_Count,Emotion_Punctuation_Count,Capital_Letter_Count,Positive_Words_Count,Negative_Words_Count,Neutral_Words_Count,Target,Predicted_Target,human_rating,VADER_Sentiment,BERT_Sentiment
641166,Yasss hunny! This is a great read. That Dre is...,105,0,367,11,0,1,17,4,2,89,0,1,5,5,5
641167,I ENJOYED THIS BOOK FROM BEGINNING TO END NOW ...,58,0,216,2,0,0,213,3,5,48,2,2,5,4,5
641168,Great book! Cherika was a fool. She let that m...,73,0,236,9,0,4,9,3,3,58,2,2,5,2,5
641169,When I say this was an excellent book please b...,51,0,198,5,0,1,3,3,1,42,0,2,5,5,5
641170,This book was everything. I just hope Alexus w...,85,0,321,9,0,0,20,3,5,68,2,1,5,4,5


In [13]:
import numpy as np

# Pick a score per review: rows predicted as class 1 take the VADER score,
# every other row takes the BERT score.
use_vader = new_df['Predicted_Target'].eq(1)
new_df['final_sentiment'] = np.where(use_vader, new_df['VADER_Sentiment'], new_df['BERT_Sentiment'])

# Preview the first rows with the new 'final_sentiment' column.
new_df.head()

Unnamed: 0,review_text,Word_Count,Slang_Count,Character_Count,Punctuation_Count,Emoticon_Count,Emotion_Punctuation_Count,Capital_Letter_Count,Positive_Words_Count,Negative_Words_Count,Neutral_Words_Count,Target,Predicted_Target,human_rating,VADER_Sentiment,BERT_Sentiment,final_sentiment
0,I enjoy vintage books and movies so I enjoyed ...,57,0,240,6,0,0,5,3,1,49,1,2,5,5,4,4
1,This book is a reissue of an old one; the auth...,118,0,375,36,0,0,11,5,3,72,2,2,4,5,4,4
2,This was a fairly interesting read. It had ol...,72,0,311,15,0,0,7,6,1,55,2,2,4,5,4,4
3,I'd never read any of the Amy Brewster mysteri...,23,0,81,4,0,0,5,0,0,20,2,2,5,3,5,5
4,"If you like period pieces - clothing, lingo, y...",28,0,106,6,0,0,2,2,0,22,0,2,4,4,4,4


In [14]:
from sklearn.metrics import mean_absolute_error, mean_squared_error


def _mae_mse(column):
    """Return (MAE, MSE) of `new_df[column]` measured against `new_df['human_rating']`."""
    reference = new_df['human_rating']
    scores = new_df[column]
    return mean_absolute_error(reference, scores), mean_squared_error(reference, scores)


# Error of each sentiment column relative to the human rating.
mae_final_sentiment, mse_final_sentiment = _mae_mse('final_sentiment')
mae_bert_sentiment, mse_bert_sentiment = _mae_mse('BERT_Sentiment')
mae_vader_sentiment, mse_vader_sentiment = _mae_mse('VADER_Sentiment')

# Print one section per column, separated by a blank line (none after the last section).
sections = [
    ("human_rating vs. final_sentiment:", mae_final_sentiment, mse_final_sentiment),
    ("human_rating vs. BERT_Sentiment:", mae_bert_sentiment, mse_bert_sentiment),
    ("human_rating vs. VADER_Sentiment:", mae_vader_sentiment, mse_vader_sentiment),
]
for position, (heading, mae, mse) in enumerate(sections):
    if position:
        print()
    print(heading)
    print("MAE:", mae)
    print("MSE:", mse)

human_rating vs. final_sentiment:
MAE: 0.44972558022742765
MSE: 0.6789748756571959

human_rating vs. BERT_Sentiment:
MAE: 0.48123979406429795
MSE: 0.7295011783128058

human_rating vs. VADER_Sentiment:
MAE: 0.6605102227019001
MSE: 1.1540634245778427


In [15]:
from sklearn.metrics import classification_report

# Ground truth (human rating) and the three sets of predictions being compared.
y_true = new_df['human_rating']
y_pred_hybrid, y_pred_bert, y_pred_vader = (
    new_df[column] for column in ('final_sentiment', 'BERT_Sentiment', 'VADER_Sentiment')
)

# One per-class precision/recall/F1 report per approach.
report_hybrid, report_bert, report_vader = (
    classification_report(y_true, predictions)
    for predictions in (y_pred_hybrid, y_pred_bert, y_pred_vader)
)

# Print each report under its heading; the leading "\n" separates consecutive reports.
for heading, report in (
    ("Hybrid Approach:", report_hybrid),
    ("\nBERT Approach:", report_bert),
    ("\nVADER Approach:", report_vader),
):
    print(heading)
    print(report)

Hybrid Approach:
              precision    recall  f1-score   support

           1       0.45      0.47      0.46     16471
           2       0.32      0.54      0.41     22074
           3       0.43      0.49      0.46     60227
           4       0.42      0.46      0.44    152161
           5       0.82      0.73      0.77    390238

    accuracy                           0.63    641171
   macro avg       0.49      0.54      0.51    641171
weighted avg       0.66      0.63      0.64    641171


BERT Approach:
              precision    recall  f1-score   support

           1       0.42      0.48      0.45     16471
           2       0.31      0.55      0.40     22074
           3       0.41      0.51      0.45     60227
           4       0.40      0.57      0.47    152161
           5       0.86      0.64      0.74    390238

    accuracy                           0.61    641171
   macro avg       0.48      0.55      0.50    641171
weighted avg       0.68      0.61      0.63 

In [16]:
def _exact_and_off_by_1(column):
    """Return (exact-match rate, within-one rate) of `new_df[column]` against `human_rating`."""
    predicted = new_df[column]
    actual = new_df['human_rating']
    exact = (predicted == actual).mean()
    # A prediction counts as "off-by-1 accurate" when it differs from the rating by at most 1.
    within_one = ((predicted - actual).abs() <= 1).mean()
    return exact, within_one


# Exact-match and off-by-1 accuracy for the hybrid, BERT and VADER scores.
exact_match_accuracy_final, off_by_1_accuracy_final = _exact_and_off_by_1('final_sentiment')
exact_match_accuracy_bert, off_by_1_accuracy_bert = _exact_and_off_by_1('BERT_Sentiment')
exact_match_accuracy_vader, off_by_1_accuracy_vader = _exact_and_off_by_1('VADER_Sentiment')

# Print one section per score; the leading "\n" separates consecutive sections.
for heading, exact, within_one in (
    ("Final Sentiment:", exact_match_accuracy_final, off_by_1_accuracy_final),
    ("\nBERT Sentiment:", exact_match_accuracy_bert, off_by_1_accuracy_bert),
    ("\nVADER Sentiment:", exact_match_accuracy_vader, off_by_1_accuracy_vader),
):
    print(heading)
    print("Exact Match Accuracy:", exact)
    print("Accuracy (Off-by-1):", within_one)

Final Sentiment:
Exact Match Accuracy: 0.6322931012163682
Accuracy (Off-by-1): 0.9433084153837276

BERT Sentiment:
Exact Match Accuracy: 0.6059288395763377
Accuracy (Off-by-1): 0.94132454524612

VADER Sentiment:
Exact Match Accuracy: 0.5254729237598083
Accuracy (Off-by-1): 0.8657269277618607


In [17]:
# Tally how many rows were assigned to each predicted class.
predicted_target_counts = new_df['Predicted_Target'].value_counts()

# Heading on the first line, the counts on the lines that follow.
print("Count of Predicted_Target values:", predicted_target_counts, sep="\n")


Count of Predicted_Target values:
Predicted_Target
2    441575
1    199596
Name: count, dtype: int64
