In [2]:
import fasttext
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from imblearn.under_sampling import RandomUnderSampler
from collections import Counter


df = pd.read_csv("tripadvisor_hotel_reviews.csv")

TEXT_COL = "Review"
RATING_COL = "Rating"

def map_to_sentiment(r):
    if r in [1, 2]:
        return 0  # negativo
    elif r == 3:
        return 1  # neutro
    else:  # 4 o 5
        return 2  # positivo

df["sentiment3"] = df[RATING_COL].apply(map_to_sentiment)

texts = df[TEXT_COL].astype(str).tolist()
labels = df["sentiment3"].values


In [3]:
X_train_text, X_test_text, y_train, y_test = train_test_split(
    texts, labels, test_size=0.2, random_state=42, stratify=labels
)

# addestra modello fastText non supervisionato sui testi di train
with open("train_texts.txt", "w", encoding="utf-8") as f:
    for t in X_train_text:
        f.write(t.replace("\n", " ") + "\n")

ft_model = fasttext.train_unsupervised("train_texts.txt", model="skipgram",dim=100)

def doc_to_vec(text, model, dim=100):
    words = text.split()
    vecs = [model.get_word_vector(w) for w in words]
    if len(vecs) == 0:
        return np.zeros(dim)
    return np.mean(vecs, axis=0)

dim = 100
X_train_ft = np.vstack([doc_to_vec(t, ft_model, dim) for t in X_train_text])
X_test_ft  = np.vstack([doc_to_vec(t, ft_model, dim) for t in X_test_text])

sm = SMOTE(random_state=42)
X_train_res, y_train_res = sm.fit_resample(X_train_ft, y_train)

print("Dopo SMOTE:", Counter(y_train_res))

clf = LogisticRegression(max_iter=1000)
clf.fit(X_train_res, y_train_res)
y_pred = clf.predict(X_test_ft)
print(classification_report(y_test, y_pred))


Read 1M words
Number of words:  17146
Number of labels: 0
Progress: 100.0% words/sec/thread:   28868 lr:  0.000000 avg.loss:  2.276231 ETA:   0h 0m 0s


Dopo SMOTE: Counter({np.int64(2): 12074, np.int64(0): 12074, np.int64(1): 12074})
              precision    recall  f1-score   support

           0       0.77      0.79      0.78       643
           1       0.29      0.59      0.39       437
           2       0.96      0.80      0.87      3019

    accuracy                           0.78      4099
   macro avg       0.67      0.73      0.68      4099
weighted avg       0.86      0.78      0.81      4099



In [4]:

print("Prima:", Counter(y_train))

rus = RandomUnderSampler(
    sampling_strategy={2: 6000},  # tieni 8000 esempi della classe 2
    random_state=42
)
X_train_under, y_train_under = rus.fit_resample(X_train_ft, y_train)
print("Dopo undersampling:", Counter(y_train_under))


Prima: Counter({np.int64(2): 12074, np.int64(0): 2571, np.int64(1): 1747})
Dopo undersampling: Counter({np.int64(2): 6000, np.int64(0): 2571, np.int64(1): 1747})


In [15]:
from imblearn.over_sampling import SMOTE

sm = SMOTE(
    sampling_strategy={0: 6000, 1: 6000, 2: 6000},  # target finale
    random_state=42
)
X_train_bal, y_train_bal = sm.fit_resample(X_train_under, y_train_under)

from collections import Counter
print("Dopo undersampling + SMOTE:", Counter(y_train_bal))


Dopo undersampling + SMOTE: Counter({np.int64(0): 6000, np.int64(1): 6000, np.int64(2): 6000})


In [16]:
# 3) Riaddestri il classificatore
clf = LogisticRegression(max_iter=1000)
clf.fit(X_train_bal, y_train_bal)

# 4) Test sullo stesso test set di prima
y_pred = clf.predict(X_test_ft)
print(classification_report(y_test, y_pred,digits=3))

              precision    recall  f1-score   support

           0      0.773     0.782     0.777       643
           1      0.281     0.597     0.382       437
           2      0.957     0.798     0.870      3019

    accuracy                          0.774      4099
   macro avg      0.670     0.726     0.677      4099
weighted avg      0.856     0.774     0.804      4099



In [17]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC

# Logistic Regression
param_grid_lr = {
    "C": [0.01, 0.1, 1, 10],
    "penalty": ["l2"],
    "class_weight": [None, "balanced"]
}
lr = LogisticRegression(max_iter=2000)
grid_lr = GridSearchCV(
    lr, param_grid_lr,
    cv=3, scoring="f1_macro", n_jobs=-1
)
grid_lr.fit(X_train_bal, y_train_bal)
print("Best LR params:", grid_lr.best_params_)

# Linear SVM
param_grid_svm = {
    "C": [0.01, 0.1, 1, 10],
    "class_weight": [None, "balanced"]
}
svm = LinearSVC()
grid_svm = GridSearchCV(
    svm, param_grid_svm,
    cv=3, scoring="f1_macro", n_jobs=-1
)
grid_svm.fit(X_train_bal, y_train_bal)
print("Best SVM params:", grid_svm.best_params_)


Best LR params: {'C': 10, 'class_weight': None, 'penalty': 'l2'}
Best SVM params: {'C': 1, 'class_weight': None}


In [18]:
best_lr = LogisticRegression(
    C=10,
    class_weight="balanced",
    penalty="l2",
    max_iter=2000
)
best_lr.fit(X_train_bal, y_train_bal)

best_svm = LinearSVC(
    C=1,
    class_weight="balanced"
)
best_svm.fit(X_train_bal, y_train_bal)


0,1,2
,penalty,'l2'
,loss,'squared_hinge'
,dual,'auto'
,tol,0.0001
,C,1
,multi_class,'ovr'
,fit_intercept,True
,intercept_scaling,1
,class_weight,'balanced'
,verbose,0


In [19]:
from sklearn.metrics import classification_report

print("=== LR ===")
print(classification_report(y_test, best_lr.predict(X_test_ft)))

print("=== SVM ===")
print(classification_report(y_test, best_svm.predict(X_test_ft)))


=== LR ===
              precision    recall  f1-score   support

           0       0.78      0.79      0.78       643
           1       0.28      0.60      0.38       437
           2       0.96      0.80      0.87      3019

    accuracy                           0.78      4099
   macro avg       0.67      0.73      0.68      4099
weighted avg       0.86      0.78      0.81      4099

=== SVM ===
              precision    recall  f1-score   support

           0       0.72      0.81      0.76       643
           1       0.28      0.51      0.36       437
           2       0.95      0.82      0.88      3019

    accuracy                           0.78      4099
   macro avg       0.65      0.71      0.67      4099
weighted avg       0.84      0.78      0.81      4099

