In [2]:
# IPython magic: render matplotlib figures inline in the notebook output.
# NOTE(review): none of the cells shown here draw a plot -- confirm this is still needed.
%matplotlib inline

### scikit-learn Support Vector Machine sentiment analysis

<a id='svm'></a>

In [3]:
# my imports
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
import pickle

In [4]:
# Load the review set used to train the classifier (sci-fi reviews, per the file name)
training_csv = 'data/cleaned_reviews_for_training_scifi.csv'
reviews_df = pd.read_csv(training_csv)

In [5]:
# Discard every row that has a missing value in any column
reviews_df = reviews_df.dropna(axis=0, how='any')

In [6]:
# Map each star rating onto a three-way sentiment label:
#   more than 3 stars -> positive, exactly 3 -> neutral, anything else -> negative
def _sentiment_from_stars(stars):
    """Return the sentiment label for a single star rating."""
    if stars > 3:
        return 'positive'
    if stars == 3:
        return 'neutral'
    return 'negative'


reviews_df['sentiment'] = reviews_df['stars_given'].apply(_sentiment_from_stars)

In [7]:
# Hold out 20% of the reviews for evaluation; the fixed seed keeps the split reproducible
review_texts = reviews_df['cleaned_review']
review_labels = reviews_df['sentiment']
X_train, X_test, y_train, y_test = train_test_split(
    review_texts,
    review_labels,
    test_size=0.2,
    random_state=42,
)


In [8]:
# Convert text data to numerical data using TF-IDF.
# The vectorizer is fitted on the training split only; the test split is then
# transformed with that same fitted vectorizer (transform, not fit_transform).
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

In [9]:
# Initialize and train the SVM classifier on the TF-IDF features and the
# sentiment labels of the training split
model = SVC(kernel='linear')  # Linear kernel is often used for text classification
model.fit(X_train_tfidf, y_train)

In [10]:
# Make predictions for the held-out split; y_pred is what the evaluation
# cells compare against y_test
y_pred = model.predict(X_test_tfidf)

In [11]:
# Per-class precision, recall and F1 of the predictions on the held-out split
report_text = classification_report(y_test, y_pred)
print(report_text)

              precision    recall  f1-score   support

    negative       0.74      0.41      0.53       244
     neutral       0.62      0.17      0.27       343
    positive       0.79      0.98      0.87      1538

    accuracy                           0.78      2125
   macro avg       0.72      0.52      0.56      2125
weighted avg       0.76      0.78      0.74      2125



In [12]:
# Overall accuracy on the held-out split. Reuses the predictions already stored
# in y_pred instead of running the SVM over the test set a second time, and
# passes the arguments in scikit-learn's documented (y_true, y_pred) order.
accuracy_score(y_test, y_pred)

0.7811764705882352

In [13]:
# Load the second review set ('various' per the file name) that the trained model is applied to
prediction_csv = 'data/cleaned_reviews_for_training_various.csv'
reviews_to_predict_df = pd.read_csv(prediction_csv)

In [14]:
# Preview the first rows to check which columns were loaded
reviews_to_predict_df.head()

Unnamed: 0,#,book_name,reviewer_anonymous,reviewer_rating,review_description,date,cleaned_review
0,0,The Woman in Me,6ce3606d5be9785bcd9b10b844b98cf3b337b7a97a7f9c...,4,I'm only a third way in. Shipped lightening fa...,26-10-2023,im third way shipped lightening fastthe print ...
1,1,The Woman in Me,243aa726ab7df2a7630a3a36c7d3a12f14e9d80cd3ab83...,5,"""There have been so many times when I was scar...",06-11-2023,many time scared speak afraid somebody would t...
2,2,The Woman in Me,eaea2ab37288945d63173beddf5680a39c37672c4386d6...,5,The media could not be loaded. I personally ha...,01-11-2023,medium could loaded personally exact situation...
3,3,The Woman in Me,9e554d1ebb53e03ec42b99ae5842c8a7309af90010bc51...,5,I have been a fan of Britney's music since the...,25-10-2023,fan britneys music since 00s couple year older...
4,4,The Woman in Me,4795e19c3660f232dd519252ac99d99fd53e23c7cf9a5e...,5,"Whether or not you’re a fan, it’s a great read...",01-11-2023,whether ’ fan ’ great read britney take world ...


In [15]:
# Derive the "given" sentiment from the reviewer's star rating:
# above 3 -> positive, exactly 3 -> neutral, everything else -> negative
ratings = reviews_to_predict_df['reviewer_rating']
given_labels = pd.Series('negative', index=ratings.index)
given_labels = given_labels.mask(ratings == 3, 'neutral')
given_labels = given_labels.mask(ratings > 3, 'positive')
reviews_to_predict_df['sentiment_given'] = given_labels

In [16]:
# Vectorise the new reviews with the TF-IDF vectorizer fitted on the training data
# (transform only -- the vectorizer is not re-fitted here)
texts_to_score = reviews_to_predict_df['cleaned_review']
X_predict_tfidf = vectorizer.transform(texts_to_score)

In [17]:
# Predict a sentiment label for every review in the second data set
sentiment_pred = model.predict(X_predict_tfidf)

In [18]:
# Store the predictions next to the star-derived labels so the two can be compared row by row
reviews_to_predict_df['sentiment_predicted'] = sentiment_pred

In [19]:
# Preview the frame again, now including sentiment_given and sentiment_predicted
reviews_to_predict_df.head()

Unnamed: 0,#,book_name,reviewer_anonymous,reviewer_rating,review_description,date,cleaned_review,sentiment_given,sentiment_predicted
0,0,The Woman in Me,6ce3606d5be9785bcd9b10b844b98cf3b337b7a97a7f9c...,4,I'm only a third way in. Shipped lightening fa...,26-10-2023,im third way shipped lightening fastthe print ...,positive,positive
1,1,The Woman in Me,243aa726ab7df2a7630a3a36c7d3a12f14e9d80cd3ab83...,5,"""There have been so many times when I was scar...",06-11-2023,many time scared speak afraid somebody would t...,positive,positive
2,2,The Woman in Me,eaea2ab37288945d63173beddf5680a39c37672c4386d6...,5,The media could not be loaded. I personally ha...,01-11-2023,medium could loaded personally exact situation...,positive,positive
3,3,The Woman in Me,9e554d1ebb53e03ec42b99ae5842c8a7309af90010bc51...,5,I have been a fan of Britney's music since the...,25-10-2023,fan britneys music since 00s couple year older...,positive,positive
4,4,The Woman in Me,4795e19c3660f232dd519252ac99d99fd53e23c7cf9a5e...,5,"Whether or not you’re a fan, it’s a great read...",01-11-2023,whether ’ fan ’ great read britney take world ...,positive,positive


In [20]:
# Number of non-null ratings per star-derived sentiment label
reviews_to_predict_df.groupby('sentiment_given')['reviewer_rating'].count()

sentiment_given
negative      4
neutral       8
positive    905
Name: reviewer_rating, dtype: int64

In [21]:
# Number of non-null ratings per predicted sentiment label
reviews_to_predict_df.groupby('sentiment_predicted')['reviewer_rating'].count()

sentiment_predicted
negative     12
neutral       5
positive    900
Name: reviewer_rating, dtype: int64

In [22]:
# (rows, columns) of the full prediction frame
reviews_to_predict_df.shape

(917, 9)

In [23]:
# (rows, columns) of the subset where the prediction agrees with the star-derived label
labels_agree = reviews_to_predict_df['sentiment_given'] == reviews_to_predict_df['sentiment_predicted']
reviews_to_predict_df[labels_agree].shape

(893, 9)

In [24]:
# Percentage of reviews whose predicted sentiment equals the star-derived label.
# Computed from the DataFrame instead of from row counts typed in by hand, so the
# figure stays correct when the data or the model changes.
labels_match = reviews_to_predict_df['sentiment_given'] == reviews_to_predict_df['sentiment_predicted']
# float(...) keeps the displayed value a plain Python float
prediction_accuracy_vs_actual = float(labels_match.mean() * 100)
prediction_accuracy_vs_actual

97.38276990185388

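Prediction accuracy is ~97% (893 of the 917 predicted labels match the label derived from the star rating). Note that 905 of the 917 reviews are labelled positive, so a model that always predicts "positive" would score ~98.7%; accuracy alone therefore says little about how well the neutral and negative classes are recognised.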

In [25]:
# Save the reviews together with both sentiment columns.
# index=False is the documented boolean way to leave the row index out of the CSV.
reviews_to_predict_df.to_csv('data/reviews_with_predicted_v_given_sentiment.csv', index=False)

In [26]:
# Show the reviews where the prediction and the star-derived label disagree
labels_differ = reviews_to_predict_df['sentiment_given'] != reviews_to_predict_df['sentiment_predicted']
columns_to_show = ['reviewer_rating', 'review_description', 'sentiment_given', 'sentiment_predicted']
reviews_to_predict_df.loc[labels_differ, columns_to_show]

Unnamed: 0,reviewer_rating,review_description,sentiment_given,sentiment_predicted
14,3,"So, I will review this as two people: A writer...",neutral,positive
119,5,This book is PACKED with interesting informati...,positive,negative
145,4,This is not your typical John Grisham book. Th...,positive,neutral
191,5,I bought the trilogy. All together they were t...,positive,negative
197,3,ஐBeauty and the Beast meets Faerie ஐ – I shoul...,neutral,positive
199,5,Okay so... I might lost some of my favorite cr...,positive,negative
245,5,ING antone else remember this from their child...,positive,neutral
269,5,I loved this book so I purchased 2 other books...,positive,negative
272,4,Okay so I liked it but didn't love it. It was ...,positive,negative
308,3,How to catch a mermaid was adorable - this boo...,neutral,positive


In [27]:
# Define the file paths where the fitted artefacts are saved
filename = 'models/svm_model.pkl'
vectorizer_filename = 'models/tfidf_vectorizer.pkl'

# Use the pickle.dump() method to save the model
with open(filename, 'wb') as file:
    pickle.dump(model, file)

# The SVM was trained on features produced by the fitted TF-IDF vectorizer, so
# new text must be transformed by that same vectorizer before model.predict().
# Save it next to the model; otherwise the pickled model cannot be applied to
# raw text in another session.
# NOTE(review): if the vectorizer is already persisted elsewhere, drop this second file.
with open(vectorizer_filename, 'wb') as file:
    pickle.dump(vectorizer, file)

In [28]:
# Number of held-out labels that will be exported below
y_test.shape

(2125,)

In [29]:
# True labels of the held-out split as a plain Python list
y_test_export = y_test.tolist()

In [30]:
# Predicted labels for the held-out split as a plain Python list
y_pred_export = list(y_pred)

In [31]:
# Review texts of the held-out split as a plain Python list
x_test_export = X_test.tolist()

In [32]:
# Collect the held-out texts with their true and predicted labels; the keys
# become the column names of the exported table.
# NOTE(review): the first key used to be 'y_tes' (typo) -- update any downstream
# reader of the exported CSV that still selects the old column name.
for_export = {'y_test' : y_test_export,
            'y_pred' : y_pred_export,
            'X_test' : x_test_export}

In [34]:
# One row per held-out review: true label, predicted label and review text
df = pd.DataFrame(data=for_export)

In [35]:
# Write the evaluation table to CSV; the row index is written as the first column
df.to_csv('data/svm_training_data.csv', index=True)