In [1]:
# Render matplotlib figures inline in the notebook output.
# NOTE(review): no plotting call is visible in this notebook — confirm this magic is still needed.
%matplotlib inline

In [2]:
# my imports
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score

In [3]:
# Load the review set that the classifier is trained and evaluated on
TRAINING_REVIEWS_CSV = 'data/cleaned_reviews_for_training_scifi.csv'
reviews_df = pd.read_csv(TRAINING_REVIEWS_CSV)

In [4]:
# Derive a three-way sentiment label from the star rating
def _sentiment_from_stars(stars):
    """Return 'positive' for ratings above 3, 'neutral' for exactly 3, otherwise 'negative'."""
    if stars > 3:
        return 'positive'
    if stars == 3:
        return 'neutral'
    return 'negative'


reviews_df['sentiment'] = reviews_df['stars_given'].apply(_sentiment_from_stars)

In [5]:
# Hold out 20% of the reviews for evaluation; the fixed random_state keeps the
# split reproducible across runs.
# NOTE(review): the split is not stratified and the label counts look skewed
# towards 'positive' — consider stratify=sentiment_labels (TODO evaluate).
review_texts = reviews_df['review']
sentiment_labels = reviews_df['sentiment']
X_train, X_test, y_train, y_test = train_test_split(
    review_texts,
    sentiment_labels,
    test_size=0.2,
    random_state=42,
)


In [6]:
# Convert text data to numerical data using TF-IDF.
# The vectorizer is fitted on the training split only; the test split is passed
# through transform() so it is encoded with the already-fitted vectorizer.
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

In [7]:
# Initialize and train the SVM classifier on the TF-IDF features of the training split.
# NOTE(review): no class weighting is configured; given the skewed label counts,
# class_weight='balanced' may be worth trying — TODO evaluate.
model = SVC(kernel='linear')  # Linear kernel is often used for text classification
model.fit(X_train_tfidf, y_train)

In [8]:
# Make predictions for the held-out test split; y_pred is reused by the evaluation cells
y_pred = model.predict(X_test_tfidf)

In [9]:
# Per-class precision / recall / F1 for the held-out test split
report_text = classification_report(y_test, y_pred)
print(report_text)

              precision    recall  f1-score   support

    negative       0.74      0.42      0.53       235
     neutral       0.75      0.13      0.22       343
    positive       0.79      0.99      0.88      1547

    accuracy                           0.78      2125
   macro avg       0.76      0.51      0.54      2125
weighted avg       0.78      0.78      0.73      2125



In [10]:
# Overall accuracy on the held-out test split.
# Reuses the predictions already stored in y_pred instead of running the SVM
# over the test set a second time, and passes the arguments in scikit-learn's
# documented (y_true, y_pred) order.
accuracy_score(y_test, y_pred)

0.7844705882352941

In [11]:
# Load the second review set, which the trained model is applied to
PREDICTION_REVIEWS_CSV = 'data/cleaned_reviews_for_training_various.csv'
reviews_to_predict_df = pd.read_csv(PREDICTION_REVIEWS_CSV)

In [13]:
# Preview the first rows to check which columns the second review set provides
reviews_to_predict_df.head()

Unnamed: 0,#,book_name,reviewer_anonymous,reviewer_rating,review_description,date
0,0,The Woman in Me,6ce3606d5be9785bcd9b10b844b98cf3b337b7a97a7f9c...,4,I'm only a third way in. Shipped lightening fa...,26-10-2023
1,1,The Woman in Me,243aa726ab7df2a7630a3a36c7d3a12f14e9d80cd3ab83...,5,"""There have been so many times when I was scar...",06-11-2023
2,2,The Woman in Me,eaea2ab37288945d63173beddf5680a39c37672c4386d6...,5,The media could not be loaded. I personally ha...,01-11-2023
3,3,The Woman in Me,9e554d1ebb53e03ec42b99ae5842c8a7309af90010bc51...,5,I have been a fan of Britney's music since the...,25-10-2023
4,4,The Woman in Me,4795e19c3660f232dd519252ac99d99fd53e23c7cf9a5e...,5,"Whether or not you’re a fan, it’s a great read...",01-11-2023


In [14]:
# Derive the "given" sentiment label from the reviewer's star rating:
# exactly 3 is neutral, above 3 is positive, everything else is negative
reviewer_ratings = reviews_to_predict_df['reviewer_rating']
reviews_to_predict_df['sentiment_given'] = reviewer_ratings.apply(
    lambda rating: 'neutral' if rating == 3 else ('positive' if rating > 3 else 'negative')
)

In [22]:
# Convert the new review texts to TF-IDF features.
# Uses transform() on the vectorizer that was fitted on the training split (no
# refit), so the features line up with what the model was trained on.
X_predict_tfidf = vectorizer.transform(reviews_to_predict_df['review_description'])

In [25]:
# Predict a sentiment label for every review in the second set
sentiment_pred = model.predict(X_predict_tfidf)

In [27]:
# Store the predictions as a column so they can be compared with sentiment_given
reviews_to_predict_df['sentiment_predicted'] = sentiment_pred

In [28]:
# Preview the frame again, now including the sentiment_given and sentiment_predicted columns
reviews_to_predict_df.head()

Unnamed: 0,#,book_name,reviewer_anonymous,reviewer_rating,review_description,date,sentiment_given,sentiment_predicted
0,0,The Woman in Me,6ce3606d5be9785bcd9b10b844b98cf3b337b7a97a7f9c...,4,I'm only a third way in. Shipped lightening fa...,26-10-2023,positive,positive
1,1,The Woman in Me,243aa726ab7df2a7630a3a36c7d3a12f14e9d80cd3ab83...,5,"""There have been so many times when I was scar...",06-11-2023,positive,positive
2,2,The Woman in Me,eaea2ab37288945d63173beddf5680a39c37672c4386d6...,5,The media could not be loaded. I personally ha...,01-11-2023,positive,positive
3,3,The Woman in Me,9e554d1ebb53e03ec42b99ae5842c8a7309af90010bc51...,5,I have been a fan of Britney's music since the...,25-10-2023,positive,positive
4,4,The Woman in Me,4795e19c3660f232dd519252ac99d99fd53e23c7cf9a5e...,5,"Whether or not you’re a fan, it’s a great read...",01-11-2023,positive,positive


In [21]:
# Count of non-null reviewer_rating entries per star-derived sentiment label
given_groups = reviews_to_predict_df.groupby('sentiment_given')
given_groups['reviewer_rating'].count()

sentiment_given
negative      4
neutral       8
positive    905
Name: reviewer_rating, dtype: int64

In [29]:
# Count of non-null reviewer_rating entries per model-predicted sentiment label
predicted_groups = reviews_to_predict_df.groupby('sentiment_predicted')
predicted_groups['reviewer_rating'].count()

sentiment_predicted
negative     11
neutral       3
positive    903
Name: reviewer_rating, dtype: int64

In [30]:
# (rows, columns) of the full prediction frame
reviews_to_predict_df.shape

(917, 8)

In [31]:
# (rows, columns) of the subset where the predicted label equals the star-derived label
labels_agree = reviews_to_predict_df['sentiment_given'] == reviews_to_predict_df['sentiment_predicted']
reviews_to_predict_df[labels_agree].shape

(894, 8)

In [33]:
# Persist the reviews together with both sentiment columns.
# index=False is the boolean that DataFrame.to_csv documents for leaving the
# row index out of the file (the previous index=None only worked because None is falsy).
reviews_to_predict_df.to_csv('data/reviews_with_predicted_v_given_sentiment.csv', index=False)

In [35]:
# Inspect the reviews where the predicted label differs from the star-derived label
comparison_columns = ['reviewer_rating', 'review_description', 'sentiment_given', 'sentiment_predicted']
labels_differ = reviews_to_predict_df['sentiment_given'] != reviews_to_predict_df['sentiment_predicted']
reviews_to_predict_df.loc[labels_differ, comparison_columns]

Unnamed: 0,reviewer_rating,review_description,sentiment_given,sentiment_predicted
14,3,"So, I will review this as two people: A writer...",neutral,positive
61,2,Anthony Doerr’s descriptions are like no other...,negative,positive
119,5,This book is PACKED with interesting informati...,positive,negative
129,5,After the loss of my wife I found myself looki...,positive,negative
197,3,ஐBeauty and the Beast meets Faerie ஐ – I shoul...,neutral,positive
199,5,Okay so... I might lost some of my favorite cr...,positive,negative
269,5,I loved this book so I purchased 2 other books...,positive,negative
272,4,Okay so I liked it but didn't love it. It was ...,positive,negative
308,3,How to catch a mermaid was adorable - this boo...,neutral,positive
367,3,This short book is rather ambitious in the sen...,neutral,positive
