In [14]:
# 1️⃣ Imports
import pandas as pd
import joblib
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

# 2️⃣ Load data & vectorizer
df = pd.read_csv("../data/reviews_cleaned.csv")
tfidf = joblib.load("../data/tfidf.pkl")

X = df["reviewText"]
y = df["sentiment"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

X_train_tfidf = tfidf.transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

# 3️⃣ Random Forest
rf = RandomForestClassifier(n_estimators=200, n_jobs=-1)
rf.fit(X_train_tfidf, y_train)
rf_pred = rf.predict(X_test_tfidf)
rf_acc = accuracy_score(y_test, rf_pred)

# 4️⃣ XGBoost
xgb = XGBClassifier(
    eval_metric="logloss",
    use_label_encoder=False
)
xgb.fit(X_train_tfidf, y_train)
xgb_pred = xgb.predict(X_test_tfidf)
xgb_acc = accuracy_score(y_test, xgb_pred)

# 5️⃣ LightGBM
lgb = LGBMClassifier()
lgb.fit(X_train_tfidf, y_train)
lgb_pred = lgb.predict(X_test_tfidf)
lgb_acc = accuracy_score(y_test, lgb_pred)

# 6️⃣ Save results
joblib.dump(rf_acc, "../data/rf_acc.pkl")
joblib.dump(xgb_acc, "../data/xgb_acc.pkl")
joblib.dump(lgb_acc, "../data/lgb_acc.pkl")

print("RF:", rf_acc)
print("XGB:", xgb_acc)
print("LGB:", lgb_acc)


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


KeyboardInterrupt: 

In [6]:
df = df[['reviewText', 'overall']].dropna()
df.head()


Unnamed: 0,reviewText,overall
0,The portfolio is fine except for the fact that...,3.0
1,If you are a serious violin student on a budge...,5.0
2,This is and excellent edition and perfectly tr...,5.0
3,Perfect for someone who is an opera fan or a w...,5.0
4,How many Nocturnes does it contain? All of the...,1.0


In [7]:
df = df[df['reviewText'].str.len() > 20]


In [8]:
df = df[df['overall'] != 3]  # drop neutral 3-star reviews
df['sentiment'] = df['overall'].apply(lambda x: 1 if x >= 4 else 0)
df.head()


Unnamed: 0,reviewText,overall,sentiment
1,If you are a serious violin student on a budge...,5.0,1
2,This is and excellent edition and perfectly tr...,5.0,1
3,Perfect for someone who is an opera fan or a w...,5.0,1
4,How many Nocturnes does it contain? All of the...,1.0,0
5,"this was written for Carin Levine in 2008, but...",5.0,1


In [9]:
df.to_csv("../data/reviews_cleaned.csv", index=False)


In [10]:
df = df[['reviewText', 'overall']].dropna()
df.head()


Unnamed: 0,reviewText,overall
1,If you are a serious violin student on a budge...,5.0
2,This is and excellent edition and perfectly tr...,5.0
3,Perfect for someone who is an opera fan or a w...,5.0
4,How many Nocturnes does it contain? All of the...,1.0
5,"this was written for Carin Levine in 2008, but...",5.0


In [11]:
def to_sentiment(rating):
    if rating <= 2:
        return "negative"
    elif rating == 3:
        return "neutral"
    else:
        return "positive"

df['sentiment'] = df['overall'].apply(to_sentiment)
df.head()


Unnamed: 0,reviewText,overall,sentiment
1,If you are a serious violin student on a budge...,5.0,positive
2,This is and excellent edition and perfectly tr...,5.0,positive
3,Perfect for someone who is an opera fan or a w...,5.0,positive
4,How many Nocturnes does it contain? All of the...,1.0,negative
5,"this was written for Carin Levine in 2008, but...",5.0,positive


In [12]:
df['sentiment'].value_counts()


sentiment
positive    402340
negative     57428
Name: count, dtype: int64

In [13]:
import pandas as pd

df = pd.read_csv("../data/reviews_cleaned.csv")

# Create sentiment labels from rating
df['overall_sentiment'] = df['overall'].apply(lambda x: 1 if x >= 4 else 0)

# Save permanently
df.to_csv("../data/reviews_cleaned.csv", index=False)

df.head()


Unnamed: 0,reviewText,overall,sentiment,overall_sentiment
0,If you are a serious violin student on a budge...,5.0,1,1
1,This is and excellent edition and perfectly tr...,5.0,1,1
2,Perfect for someone who is an opera fan or a w...,5.0,1,1
3,How many Nocturnes does it contain? All of the...,1.0,0,0
4,"this was written for Carin Levine in 2008, but...",5.0,1,1
