In [1]:
%matplotlib inline

In [14]:
# my imports
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import classification_report
import math

In [3]:
# Sample DataFrame
data = {
    'review': [
        "I loved the book!", 
        "It was okay.", 
        "Not my type of book.", 
        "Fantastic read, highly recommend!",
        "Hated it, very disappointing.",
        "A mediocre book, could be better."
    ],
    'stars_given': [5, 3, 2, 5, 1, 3]
}

df = pd.DataFrame(data)

In [4]:
# Convert star ratings to sentiment labels
df['sentiment'] = df['stars_given'].apply(lambda x: 'positive' if x > 3 else ('neutral' if x == 3 else 'negative'))


In [5]:
df

Unnamed: 0,review,stars_given,sentiment
0,I loved the book!,5,positive
1,It was okay.,3,neutral
2,Not my type of book.,2,negative
3,"Fantastic read, highly recommend!",5,positive
4,"Hated it, very disappointing.",1,negative
5,"A mediocre book, could be better.",3,neutral


In [6]:
# Features and Labels
X = df['review']
y = df['sentiment']

In [7]:
# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [8]:
print('X_train:',X_train)
print('X_test:',X_test)
print('y_train:',y_train)
print('y_test:',y_test)

X_train: 5    A mediocre book, could be better.
2                 Not my type of book.
4        Hated it, very disappointing.
3    Fantastic read, highly recommend!
Name: review, dtype: object
X_test: 0    I loved the book!
1         It was okay.
Name: review, dtype: object
y_train: 5     neutral
2    negative
4    negative
3    positive
Name: sentiment, dtype: object
y_test: 0    positive
1     neutral
Name: sentiment, dtype: object


In [9]:
# Convert text data to numerical data using TF-IDF
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

In [10]:
# Initialize and train the SVM classifier
model = SVC(kernel='linear')  # Linear kernel is often used for text classification
model.fit(X_train_tfidf, y_train)

In [11]:
# Make predictions
y_pred = model.predict(X_test_tfidf)

In [12]:
y_pred

array(['negative', 'negative'], dtype=object)

In [13]:
reviews_df = pd.read_csv('data/cleaned_reviews_for_training_scifi.csv')

In [15]:
# Convert stars to sentiment labels
reviews_df['sentiment'] = reviews_df['stars_given'].apply(lambda x: 'positive' if x > 3 else ('neutral' if x == 3 else 'negative'))

In [16]:
rows = len(reviews_df)

training_rows = math.floor(rows * 0.8)
training_rows

testing_rows = math.ceil(rows * 0.2)
testing_rows

2125

In [17]:
training_df = reviews_df.head(training_rows)

testing_df = reviews_df.tail(testing_rows)

In [18]:
X_train, y_train = training_df['review'], training_df['sentiment']

X_test, y_test = testing_df['review'], testing_df['sentiment']

In [19]:
# Convert text data to numerical data using TF-IDF
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

In [20]:
# Initialize and train the SVM classifier
model = SVC(kernel='linear')  # Linear kernel is often used for text classification
model.fit(X_train_tfidf, y_train)

In [21]:
# Make predictions
y_pred = model.predict(X_test_tfidf)

In [22]:
# Evaluate the model
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

    negative       0.59      0.24      0.34       246
     neutral       0.34      0.05      0.09       402
    positive       0.73      0.96      0.83      1477

    accuracy                           0.71      2125
   macro avg       0.55      0.42      0.42      2125
weighted avg       0.64      0.71      0.63      2125



In [23]:
print(y_pred)
print(y_test)

['positive' 'negative' 'positive' ... 'positive' 'positive' 'negative']
8499     positive
8500     negative
8501     positive
8502     positive
8503     positive
           ...   
10619    positive
10620    positive
10621    positive
10622    positive
10623     neutral
Name: sentiment, Length: 2125, dtype: object
