In [1]:
%matplotlib inline

In [31]:
# my imports
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.metrics import classification_report
import math

In [3]:
# Sample DataFrame
data = {
    'review': ["I loved the book!", "It was okay.", "Not my type of book.", "Fantastic read, highly recommend!"],
    'stars_given': [5, 3, 2, 5]  # Assuming stars 1-2 = Negative, 3 = Neutral, 4-5 = Positive
}

In [4]:
df = pd.DataFrame(data)

In [5]:
df

Unnamed: 0,review,stars_given
0,I loved the book!,5
1,It was okay.,3
2,Not my type of book.,2
3,"Fantastic read, highly recommend!",5


In [6]:
# Convert stars to sentiment labels
df['sentiment'] = df['stars_given'].apply(lambda x: 'positive' if x > 3 else ('neutral' if x == 3 else 'negative'))

In [7]:
df

Unnamed: 0,review,stars_given,sentiment
0,I loved the book!,5,positive
1,It was okay.,3,neutral
2,Not my type of book.,2,negative
3,"Fantastic read, highly recommend!",5,positive


In [8]:
# Split the data
X_train, X_test, y_train, y_test = train_test_split(df['review'], df['sentiment'], test_size=0.2, random_state=42)

In [9]:
print('X_train:',X_train)
print('X_test:',X_test)
print('y_train:',y_train)
print('y_test:',y_test)

X_train: 3    Fantastic read, highly recommend!
0                    I loved the book!
2                 Not my type of book.
Name: review, dtype: object
X_test: 1    It was okay.
Name: review, dtype: object
y_train: 3    positive
0    positive
2    negative
Name: sentiment, dtype: object
y_test: 1    neutral
Name: sentiment, dtype: object


In [10]:
# Create a pipeline with TF-IDF and Logistic Regression
pipeline = make_pipeline(
    TfidfVectorizer(),
    LogisticRegression()
)

In [11]:
# Train the model
pipeline.fit(X_train, y_train)

In [12]:
# Predict and evaluate
y_pred = pipeline.predict(X_test)

In [13]:
y_pred

array(['positive'], dtype=object)

In [16]:
reviews_df = pd.read_csv('data/cleaned_reviews_for_training_scifi.csv')

In [29]:
reviews_df.head()


Unnamed: 0,#,date,book_title,reviewer_anonymous,review,stars_given
0,0,2011-03-22,Dune,c5cdbdf3fc731351e44b01b9d786e167adf2c74434a318...,"In my head, the purpose of this review is very...",5.0
1,1,2014-04-29,Dune,0c9ab8c55d678ea513149008ce0ca19990aabc50f6bde8...,If this is the gold standard against which all...,1.0
2,2,2008-11-20,Dune,06db99a23f69d8ddd8212272bd26a7ea2c02b37a146bbf...,There's a characteristically witty essay by Bo...,4.0
3,3,2007-06-24,Dune,80dcb1d46270dfe2f2afd3349e4f3c3500ae8c853e8575...,No one should argue the importance Dune. It la...,2.0
4,4,2021-11-13,Dune,f917d3a786598500bb9dd905651958265a1ae3289c8964...,just when you thought 2021 couldn't get any we...,5.0


In [34]:
# Convert stars to sentiment labels
reviews_df['sentiment'] = reviews_df['stars_given'].apply(lambda x: 'positive' if x > 3 else ('neutral' if x == 3 else 'negative'))

In [35]:
rows = len(reviews_df)

In [36]:
training_rows = math.floor(rows * 0.8)
training_rows

8499

In [37]:
testing_rows = math.ceil(rows * 0.2)
testing_rows

2125

In [38]:
training_df = reviews_df.head(8499)

In [39]:
testing_df = reviews_df.tail(2125)

In [40]:
X_train, y_train = training_df['review'], training_df['sentiment']

In [41]:
X_test, y_test = testing_df['review'], testing_df['sentiment']

In [42]:
# Train the model
pipeline.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [47]:
# Predict and evaluate
y_pred = pipeline.predict(X_test)

In [51]:
print(y_pred)
print(y_test)

['positive' 'positive' 'positive' ... 'positive' 'positive' 'positive']
8499     positive
8500     negative
8501     positive
8502     positive
8503     positive
           ...   
10619    positive
10620    positive
10621    positive
10622    positive
10623     neutral
Name: sentiment, Length: 2125, dtype: object


In [49]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

    negative       0.62      0.13      0.22       246
     neutral       0.33      0.04      0.07       402
    positive       0.71      0.98      0.82      1477

    accuracy                           0.70      2125
   macro avg       0.56      0.38      0.37      2125
weighted avg       0.63      0.70      0.61      2125

