In [20]:
import fasttext
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from imblearn.under_sampling import RandomUnderSampler
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
from nltk.corpus import stopwords
import re
from collections import defaultdict
from rake_nltk import Rake
import contractions
from tabulate import tabulate
import textwrap
import contractions
import textacy
import spacy

try:
    stopwords.words('english')
except LookupError:
    nltk.download('stopwords')

df = pd.read_csv("tripadvisor_hotel_reviews.csv")

TEXT_COL = "Review"
RATING_COL = "Rating"

def map_to_sentiment(r):
    if r in [1, 2]:
        return 0  # negativo
    elif r == 3:
        return 1  # neutro
    else:  # 4 o 5
        return 2  # positivo

df["sentiment3"] = df[RATING_COL].apply(map_to_sentiment)

texts = df[TEXT_COL].astype(str).tolist()
labels = df["sentiment3"].values


In [5]:
df_pos = df[df['Rating'] >= 4].copy()

df_neg_neu = df[df['Rating'] <= 1].copy()



def clean_text(text):
    text = contractions.fix(text)   # didn't → did not
    text = text.lower()
    text = re.sub(r'\d+', '', text)
    text = re.sub(r'[^a-z\s]', '', text)
    return text

review_pos = df_pos['Review_clean'] = df_pos['Review'].apply(clean_text)
review_neg_neu = df_neg_neu['Review_clean'] = df_neg_neu['Review'].apply(clean_text)

# --- Estrazione basata su TF-IDF (N-grammi di parole) ---

# Inizializza TfidfVectorizer per estrarre 2-grammi e 3-grammi
vectorizer = TfidfVectorizer(
    stop_words=stopwords.words('english'), 
    ngram_range=(2, 3), # Estrae frasi composte da 2 o 3 parole
    max_features=2000
)

tfidf_matrix_neg = vectorizer.fit_transform(review_neg_neu)
tfidf_matrix_pos = vectorizer.transform(review_pos)
feature_names = vectorizer.get_feature_names_out()

# Calcola i punteggi medi TF-IDF per tutti i documenti analizzati
avg_tfidf_scores_neg = tfidf_matrix_neg.mean(axis=0).tolist()[0]
avg_tfidf_scores_pos = tfidf_matrix_pos.mean(axis=0).tolist()[0]
# Crea un DataFrame di risultati
tfidf_results_neg = pd.DataFrame({
    'Frase Chiave Negative/Neutre': feature_names,
    'Punteggio Medio TF-IDF': avg_tfidf_scores_neg
})

tfidf_results_pos = pd.DataFrame({
    'Frase Chiave Positive': feature_names,
    'Punteggio Medio TF-IDF': avg_tfidf_scores_pos
})

# Ordina e mostra le top 10 frasi chiave
top_10_tfidf_neg = tfidf_results_neg.sort_values(
    by='Punteggio Medio TF-IDF', 
    ascending=False
).head(5)

top_10_tfidf_pos = tfidf_results_pos.sort_values(
    by='Punteggio Medio TF-IDF', 
    ascending=False
).head(5)

print("### TOP 10 Frasi Chiave (Metodo TF-IDF) - Recensioni Negative/Neutre ###")
print(top_10_tfidf_neg.to_string(index=False))
print("\n### TOP 10 Frasi Chiave (Metodo TF-IDF) - Recensioni Positive ###")
print(top_10_tfidf_pos.to_string(index=False))

### TOP 10 Frasi Chiave (Metodo TF-IDF) - Recensioni Negative/Neutre ###
Frase Chiave Negative/Neutre  Punteggio Medio TF-IDF
                  punta cana                0.014306
                  stay hotel                0.013076
                  star hotel                0.012816
                 credit card                0.012046
            customer service                0.011984

### TOP 10 Frasi Chiave (Metodo TF-IDF) - Recensioni Positive ###
Frase Chiave Positive  Punteggio Medio TF-IDF
       great location                0.037657
       staff friendly                0.031515
          great hotel                0.030357
     friendly helpful                0.028837
     walking distance                0.028348


In [2]:
df_neg_neu2 = df[df['Rating'] <= 1].copy()
df_pos = df[df['Rating'] >= 4].copy()

In [4]:
# --- Funzione di pulizia ---
def clean_text(text):
    text = contractions.fix(text)   # didn't → did not
    text = text.lower()
    text = re.sub(r'\d+', '', text)
    text = re.sub(r'[^a-z\s]', '', text)
    return text

# --- Stopwords custom ---
custom_stopwords = set(stopwords.words('english'))
custom_stopwords.update([
    'hotel', 'room', 'stay', 'stayed', 'recommend', 'would', 'could',
    'pretty', 'easy', 'know', 'things','nt', 'not','good', 'nice', 'bad', 'worst',
    'place', 'staff', 'star', 'stars', 'worth', 'money', 'make', 'sure', 'ask', 
    'time', 'course', 'returned', 'called', 'deserve'
])

# --- Pulizia recensioni ---
df_neg_neu2['Review_Clean'] = df_neg_neu2['Review'].apply(clean_text)
df_pos['Review_Clean'] = df_pos['Review'].apply(clean_text)

reviews_neg = df_neg_neu2['Review_Clean'].tolist()
reviews_pos = df_pos['Review_Clean'].tolist()

# --- Funzione RAKE generale ---
def extract_rake_phrases(reviews, stopwords_list, max_length=4, min_words=2):
    r = Rake(stopwords=list(stopwords_list), max_length=max_length)
    all_phrases = []
    
    for review in reviews:
        r.extract_keywords_from_text(review)
        for score, phrase in r.get_ranked_phrases_with_scores():
            # mantieni solo keyword con almeno min_words parole
            if len(phrase.split()) >= min_words:
                all_phrases.append({'Frase Chiave': phrase, 'Punteggio RAKE': score})
    
    # Aggregazione
    df = pd.DataFrame(all_phrases)
    if df.empty:
        return df
    
    agg_df = (
        df.groupby('Frase Chiave')
        .agg(
            Punteggio_Totale=('Punteggio RAKE', 'sum'),
            Frequenza=('Punteggio RAKE', 'count')
        )
        .reset_index()
        .sort_values(by=['Punteggio_Totale', 'Frequenza'], ascending=False)
    )
    return agg_df

# --- Estrazione frasi chiave ---
agg_rake_neg = extract_rake_phrases(reviews_neg, custom_stopwords)
agg_rake_pos = extract_rake_phrases(reviews_pos, custom_stopwords)

# --- Top 5 frasi chiave ---
top_5_neg = agg_rake_neg.head(5).copy()
top_5_pos = agg_rake_pos.head(5).copy()

# --- Wrapping per stampa ---
for df in [top_5_neg, top_5_pos]:
    df["Frase Chiave"] = df["Frase Chiave"].apply(
        lambda x: "\n".join(textwrap.wrap(x, width=150))
    )

# --- Stampa finale ---
print("### TOP 5 Frasi Chiave (Metodo RAKE) - Recensioni Negative ###")
print(tabulate(top_5_neg, headers='keys', tablefmt='psql', showindex=False))

print("\n### TOP 5 Frasi Chiave (Metodo RAKE) - Recensioni Positive ###")
print(tabulate(top_5_pos, headers='keys', tablefmt='psql', showindex=False))

### TOP 5 Frasi Chiave (Metodo RAKE) - Recensioni Negative ###
+--------------------------------+--------------------+-------------+
| Frase Chiave                   |   Punteggio_Totale |   Frequenza |
|--------------------------------+--------------------+-------------|
| customer service               |            40.3333 |          10 |
| shuttle took ridiculously long |            32      |           2 |
| air conditioning               |            30      |           7 |
| horrible experience            |            28      |           7 |
| hot water                      |            28      |           7 |
+--------------------------------+--------------------+-------------+

### TOP 5 Frasi Chiave (Metodo RAKE) - Recensioni Positive ###
+----------------------+--------------------+-------------+
| Frase Chiave         |   Punteggio_Totale |   Frequenza |
|----------------------+--------------------+-------------|
| great location       |           1184.25  |         290 |
| f

In [97]:
# 2. Definizione di Aspetti e Dizionari di Sentimento (Opinion Lexicons)

# Aspetti estratti dalle frasi chiave precedenti
ASPECTS = {
    'STAFF': ['staff', 'desk', 'service', 'personnel', 'employee', 'host', 'manager'],
    'LOCATION': ['location', 'place', 'downtown', 'area', 'neighborhood', 'street'],
    'ROOM': ['room', 'suite', 'bathroom', 'bed', 'shower']
}

# Dizionario di Sentimento
POSITIVE_WORDS = ['great', 'excellent', 'fantastic', 'friendly', 'wonderful', 'clean', 'good', 'perfect', 'nice', 'helpful', 'awesome']
NEGATIVE_WORDS = ['rude', 'terrible', 'dirty', 'poor', 'bad', 'slow', 'horrible', 'disappointed', 'awful', 'loud', 'noise']

# 3. Funzione per Eseguire ABSA basato su Regole (Proximity Analysis)
# Analizza se una parola di sentimento si trova vicino a un aspetto,
# e assegna la polarità a quell'aspetto per la recensione.
def analyze_absa_rule_based(review):
    results = defaultdict(lambda: 'Neutro')
    # Pulizia preliminare e tokenizzazione (divisione in parole)
    review_lower = review.lower()
    
    # Rimuovi la punteggiatura e dividi la recensione in token (parole)
    tokens = re.findall(r'\b\w+\b', review_lower)

    for aspect_category, aspect_keywords in ASPECTS.items():
        # Trova la posizione (indice) di tutte le parole di interesse
        aspect_indices = [i for i, token in enumerate(tokens) if token in aspect_keywords]
        pos_indices = [i for i, token in enumerate(tokens) if token in POSITIVE_WORDS]
        neg_indices = [i for i, token in enumerate(tokens) if token in NEGATIVE_WORDS]
        
        # Proximity Check (controlla se una parola di sentimento è vicina a una parola aspetto)
        # Definiamo la vicinanza come 5 token di distanza
        proximity_limit = 5 
        
        found_sentiment = False
        
        # Check Positivo
        for a_idx in aspect_indices:
            for p_idx in pos_indices:
                if abs(a_idx - p_idx) <= proximity_limit:
                    results[aspect_category] = 'Positivo'
                    found_sentiment = True
                    break
            if found_sentiment:
                break
        
        # Check Negativo (Sovrascrive il Positivo se più vicino o se trovato dopo)
        for a_idx in aspect_indices:
            for n_idx in neg_indices:
                if abs(a_idx - n_idx) <= proximity_limit:
                    # Se c'è un sentimento negativo vicino, sovrascrivi o decidi la polarità
                    # Qui, diamo la precedenza al negativo in caso di conflitto vicino.
                    results[aspect_category] = 'Negativo'
                    break
            if results[aspect_category] == 'Negativo':
                break

    return results

# 4. Applicazione dell'Analisi (Limita a 5000 recensioni per velocità)
df_sample = df.copy()
df_sample['ABSA_Results'] = df_sample['Review'].apply(analyze_absa_rule_based)

# 5. Aggregazione dei Risultati per la Visualizzazione
absa_summary = {
    'Aspect': [],
    'Positive': [],
    'Negative': [],
    'Neutral': []
}

for aspect in ASPECTS.keys():
    counts = df_sample['ABSA_Results'].apply(lambda x: x[aspect]).value_counts(normalize=True).mul(100).round(1)
    
    absa_summary['Aspect'].append(aspect)
    absa_summary['Positive'].append(counts.get('Positivo', 0.0))
    absa_summary['Negative'].append(counts.get('Negativo', 0.0))
    absa_summary['Neutral'].append(counts.get('Neutro', 0.0))
    
absa_df = pd.DataFrame(absa_summary)
absa_df['Total'] = absa_df['Positive'] + absa_df['Negative'] + absa_df['Neutral']


# 6. Output
print("### ABSA: Analisi del Sentimento Basata sull'Aspetto (Regole di Prossimità) ###")
print("Percentuale di recensioni che esprimono un sentimento (Positivo/Negativo) per l'aspetto:")
print(absa_df.to_string(index=False))

### ABSA: Analisi del Sentimento Basata sull'Aspetto (Regole di Prossimità) ###
Percentuale di recensioni che esprimono un sentimento (Positivo/Negativo) per l'aspetto:
  Aspect  Positive  Negative  Neutral  Total
   STAFF      52.8       8.0     39.2  100.0
LOCATION      43.3       6.6     50.0   99.9
    ROOM      41.7       9.3     49.0  100.0


1. Punto di Forza (Engagement): Lo STAFF è l'aspetto più discusso emotivamente (solo il $39.2\%$ delle recensioni è rimasto neutro).La polarità è schiacciante: il $52.8\%$ delle opinioni rilevate è Positiva, confermando che il personale è un punto di forza.

2. Punto di Debolezza (Criticità) - ROOM:
Sebbene le opinioni positive ($41.7\%$) siano quattro volte più numerose delle negative, la percentuale di sentimento Negativo ($9.3\%$) per le ROOM è la più alta tra tutti gli aspetti, indicando che i problemi più frequenti e critici (sporcizia, rumore, guasti) si concentrano in quest'area.

3. Contesto (LOCATION)La LOCATION è un forte punto di vendita (43.3% Positivo), ma anche l'aspetto con percentuale di recensioni Neutre ($50\%$), suggerendo che, per molti, la posizione è un dato di fatto e non suscita emozioni forti.

In [8]:
X_train_text, X_test_text, y_train, y_test = train_test_split(
    texts, labels, test_size=0.2, random_state=42, stratify=labels
)

# addestra modello fastText non supervisionato sui testi di train
with open("train_texts.txt", "w", encoding="utf-8") as f:
    for t in X_train_text:
        f.write(t.replace("\n", " ") + "\n")

ft_model = fasttext.train_unsupervised("train_texts.txt", model="skipgram",dim=100)

def doc_to_vec(text, model, dim=100):
    words = text.split()
    vecs = [model.get_word_vector(w) for w in words]
    if len(vecs) == 0:
        return np.zeros(dim)
    return np.mean(vecs, axis=0)

dim = 100
X_train_ft = np.vstack([doc_to_vec(t, ft_model, dim) for t in X_train_text])
X_test_ft  = np.vstack([doc_to_vec(t, ft_model, dim) for t in X_test_text])

sm = SMOTE(random_state=42)
X_train_res, y_train_res = sm.fit_resample(X_train_ft, y_train)

print("Dopo SMOTE:", Counter(y_train_res))


Read 1M words
Number of words:  17146
Number of labels: 0
Progress: 100.0% words/sec/thread:   47574 lr:  0.000000 avg.loss:  2.277305 ETA:   0h 0m 0s  3.9% words/sec/thread:   29640 lr:  0.048029 avg.loss:  2.510404 ETA:   0h 0m39s


Dopo SMOTE: Counter({2: 12074, 0: 12074, 1: 12074})


In [9]:

clf = LogisticRegression(max_iter=1000)
clf.fit(X_train_res, y_train_res)
y_pred = clf.predict(X_test_ft)
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.77      0.79      0.78       643
           1       0.29      0.58      0.39       437
           2       0.95      0.81      0.87      3019

    accuracy                           0.78      4099
   macro avg       0.67      0.73      0.68      4099
weighted avg       0.85      0.78      0.81      4099



In [10]:

clf = LogisticRegression(max_iter=1000)
clf.fit(X_train_res, y_train_res)
y_pred = clf.predict(X_test_ft)
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.77      0.79      0.78       643
           1       0.29      0.58      0.39       437
           2       0.95      0.81      0.87      3019

    accuracy                           0.78      4099
   macro avg       0.67      0.73      0.68      4099
weighted avg       0.85      0.78      0.81      4099



In [11]:

print("Prima:", Counter(y_train))

rus = RandomUnderSampler(
    sampling_strategy={2: 6000},  # tieni 8000 esempi della classe 2
    random_state=42
)
X_train_under, y_train_under = rus.fit_resample(X_train_ft, y_train)
print("Dopo undersampling:", Counter(y_train_under))


Prima: Counter({2: 12074, 0: 2571, 1: 1747})
Dopo undersampling: Counter({2: 6000, 0: 2571, 1: 1747})


In [12]:
from imblearn.over_sampling import SMOTE

sm = SMOTE(
    sampling_strategy={0: 6000, 1: 6000, 2: 6000},  # target finale
    random_state=42
)
X_train_bal, y_train_bal = sm.fit_resample(X_train_under, y_train_under)

from collections import Counter
print("Dopo undersampling + SMOTE:", Counter(y_train_bal))


Dopo undersampling + SMOTE: Counter({0: 6000, 1: 6000, 2: 6000})


In [13]:
# 3) Riaddestri il classificatore
clf = LogisticRegression(max_iter=1000)
clf.fit(X_train_bal, y_train_bal)

# 4) Test sullo stesso test set di prima
y_pred = clf.predict(X_test_ft)
print(classification_report(y_test, y_pred,digits=3))

              precision    recall  f1-score   support

           0      0.766     0.796     0.781       643
           1      0.280     0.590     0.380       437
           2      0.957     0.796     0.869      3019

    accuracy                          0.774      4099
   macro avg      0.668     0.727     0.677      4099
weighted avg      0.855     0.774     0.803      4099



In [14]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC

# Logistic Regression
param_grid_lr = {
    "C": [0.01, 0.1, 1, 10],
    "penalty": ["l2"],
    "class_weight": [None, "balanced"]
}
lr = LogisticRegression(max_iter=2000)
grid_lr = GridSearchCV(
    lr, param_grid_lr,
    cv=3, scoring="f1_macro", n_jobs=-1
)
grid_lr.fit(X_train_bal, y_train_bal)
print("Best LR params:", grid_lr.best_params_)

# Linear SVM
param_grid_svm = {
    "C": [0.01, 0.1, 1, 10],
    "class_weight": [None, "balanced"]
}
svm = LinearSVC()
grid_svm = GridSearchCV(
    svm, param_grid_svm,
    cv=3, scoring="f1_macro", n_jobs=-1
)
grid_svm.fit(X_train_bal, y_train_bal)
print("Best SVM params:", grid_svm.best_params_)


Best LR params: {'C': 10, 'class_weight': None, 'penalty': 'l2'}
Best SVM params: {'C': 10, 'class_weight': 'balanced'}


In [15]:
best_lr = LogisticRegression(
    C=10,
    class_weight=None,
    penalty="l2"
)
best_lr.fit(X_train_bal, y_train_bal)

best_svm = LinearSVC(
    C=1,
    class_weight="balanced"
)
best_svm.fit(X_train_bal, y_train_bal)


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=100).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0,1,2
,penalty,'l2'
,loss,'squared_hinge'
,dual,'auto'
,tol,0.0001
,C,1
,multi_class,'ovr'
,fit_intercept,True
,intercept_scaling,1
,class_weight,'balanced'
,verbose,0


In [16]:
from sklearn.metrics import classification_report

print("=== LR - FastText ===")
print(classification_report(y_test, best_lr.predict(X_test_ft)))

print("=== SVM - FastText ===")
print(classification_report(y_test, best_svm.predict(X_test_ft)))


=== LR - FastText ===
              precision    recall  f1-score   support

           0       0.77      0.79      0.78       643
           1       0.29      0.59      0.38       437
           2       0.96      0.80      0.87      3019

    accuracy                           0.78      4099
   macro avg       0.67      0.73      0.68      4099
weighted avg       0.86      0.78      0.81      4099

=== SVM - FastText ===
              precision    recall  f1-score   support

           0       0.72      0.81      0.76       643
           1       0.29      0.52      0.37       437
           2       0.95      0.81      0.88      3019

    accuracy                           0.78      4099
   macro avg       0.65      0.72      0.67      4099
weighted avg       0.84      0.78      0.81      4099



In [18]:
from gensim.models import Word2Vec
import numpy as np
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from collections import Counter

# 1. Split
X_train_text_w2v, X_test_text_w2v, y_train_w2v, y_test_w2v = train_test_split(
    texts, labels, test_size=0.2, random_state=42, stratify=labels
)


# 2. Tokenization for W2V
train_tokens = [t.split() for t in X_train_text_w2v]

# 3. Train Word2Vec
w2v_model = Word2Vec(
    train_tokens,
    vector_size=100,
    window=5,
    min_count=2,
    workers=4
)

def doc_to_vec_w2v(text, model, dim=100):
    words = text.split()
    vecs = [model.wv[w] for w in words if w in model.wv]
    if len(vecs) == 0:
        return np.zeros(dim)
    return np.mean(vecs, axis=0)

# 4. Create embeddings
dim = 100
X_train_w2v = np.vstack([doc_to_vec_w2v(t, w2v_model, dim) for t in X_train_text_w2v])
X_test_w2v  = np.vstack([doc_to_vec_w2v(t, w2v_model, dim) for t in X_test_text_w2v])

# 5. SMOTE
sm = SMOTE(random_state=42)
X_train_res_w2v, y_train_res_w2v = sm.fit_resample(X_train_w2v, y_train_w2v)

print("Dopo SMOTE:", Counter(y_train_res_w2v))

Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'


Dopo SMOTE: Counter({2: 12074, 0: 12074, 1: 12074})


In [19]:
print("Prima:", Counter(y_train_w2v))

rus = RandomUnderSampler(
    sampling_strategy={2: 6000},  # tieni 6000 esempi della classe 2
    random_state=42
)
X_train_under_w2v, y_train_under_w2v = rus.fit_resample(X_train_w2v, y_train_w2v)
print("Dopo undersampling:", Counter(y_train_under_w2v))

Prima: Counter({2: 12074, 0: 2571, 1: 1747})
Dopo undersampling: Counter({2: 6000, 0: 2571, 1: 1747})


In [20]:
X_train_bal_w2v, y_train_bal_w2v = sm.fit_resample(X_train_under_w2v, y_train_under_w2v)

from collections import Counter
print("Dopo undersampling + SMOTE:", Counter(y_train_bal_w2v))

Dopo undersampling + SMOTE: Counter({0: 6000, 1: 6000, 2: 6000})


In [21]:
# 3) Riaddestri il classificatore
clf = LogisticRegression(max_iter=1000)
clf.fit(X_train_bal_w2v, y_train_bal_w2v)

# 4) Test sullo stesso test set di prima
y_pred = clf.predict(X_test_w2v)
print(classification_report(y_test_w2v , y_pred,digits=3))

              precision    recall  f1-score   support

           0      0.682     0.748     0.714       643
           1      0.249     0.551     0.343       437
           2      0.948     0.762     0.845      3019

    accuracy                          0.737      4099
   macro avg      0.626     0.687     0.634      4099
weighted avg      0.832     0.737     0.771      4099



In [23]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC

# Logistic Regression
param_grid_lr_w2v = {
    "C": [0.01, 0.1, 1, 10],
    "penalty": ["l2"],
    "class_weight": [None, "balanced"]
}
lr_w2v = LogisticRegression(max_iter=2000)
grid_lr_w2v = GridSearchCV(
    lr_w2v, param_grid_lr_w2v,
    cv=3, scoring="f1_macro", n_jobs=-1
)
grid_lr_w2v.fit(X_train_bal_w2v, y_train_bal_w2v)
print("Best LR params:", grid_lr_w2v.best_params_)

# Linear SVM
param_grid_svm_w2v = {
    "C": [0.01, 0.1, 1, 10],
    "class_weight": [None, "balanced"]
}
svm_w2v = LinearSVC()
grid_svm_w2v = GridSearchCV(
    svm_w2v, param_grid_svm_w2v,
    cv=3, scoring="f1_macro", n_jobs=-1
)
grid_svm_w2v.fit(X_train_bal_w2v, y_train_bal_w2v)
print("Best SVM params:", grid_svm_w2v.best_params_)

Best LR params: {'C': 10, 'class_weight': None, 'penalty': 'l2'}
Best SVM params: {'C': 10, 'class_weight': 'balanced'}


In [24]:
best_lr_w2v = LogisticRegression(
    C=10,
    class_weight=None,
    penalty="l2"
)
best_lr_w2v.fit(X_train_bal_w2v, y_train_bal_w2v)

best_svm_w2v = LinearSVC(
    C=10,
    class_weight='balanced'
)
best_svm_w2v.fit(X_train_bal_w2v, y_train_bal_w2v)

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=100).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0,1,2
,penalty,'l2'
,loss,'squared_hinge'
,dual,'auto'
,tol,0.0001
,C,10
,multi_class,'ovr'
,fit_intercept,True
,intercept_scaling,1
,class_weight,'balanced'
,verbose,0


In [25]:
print("=== LR - Word2Vec ===")
print(classification_report(y_test_w2v, best_lr_w2v.predict(X_test_w2v)))

print("=== SVM - Word2Vec ===")
print(classification_report(y_test_w2v, best_svm_w2v.predict(X_test_w2v)))

=== LR - Word2Vec ===
              precision    recall  f1-score   support

           0       0.68      0.74      0.71       643
           1       0.24      0.53      0.33       437
           2       0.94      0.76      0.84      3019

    accuracy                           0.73      4099
   macro avg       0.62      0.68      0.63      4099
weighted avg       0.83      0.73      0.77      4099

=== SVM - Word2Vec ===
              precision    recall  f1-score   support

           0       0.65      0.77      0.70       643
           1       0.26      0.47      0.33       437
           2       0.94      0.79      0.86      3019

    accuracy                           0.75      4099
   macro avg       0.61      0.68      0.63      4099
weighted avg       0.82      0.75      0.78      4099



In [31]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical


max_words = 10000  # dimensione vocabolario
max_len   = 100    # lunghezza massima della sequenza

tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(texts)

X_seq = tokenizer.texts_to_sequences(texts)
X_pad = pad_sequences(X_seq, maxlen=max_len, padding='post', truncating='post')


le = LabelEncoder()
y_enc = le.fit_transform(labels)
y_train_int = y_enc



In [40]:
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.optimizers import Adam

embedding_dim = 100  # dipende dal tuo embedding
n_classes = len(np.unique(y_enc))

# Matrice embedding: parola_index → vettore
embedding_matrix = np.zeros((max_words, embedding_dim))
for word, i in tokenizer.word_index.items():
    if i >= max_words:
        continue
    if word in ft_model.words:  # FastText
        embedding_matrix[i] = ft_model.get_word_vector(word)

        
model = Sequential()
model.add(Embedding(input_dim=max_words,
                    output_dim=embedding_dim,
                    weights=[embedding_matrix],
                    input_length=max_len,
                    trainable=False))  # non aggiornare embeddings
model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(n_classes, activation='softmax'))

model.compile(loss='sparse_categorical_crossentropy', optimizer=Adam(learning_rate=0.0005), metrics=['accuracy'])




In [42]:
from sklearn.model_selection import train_test_split
from sklearn.utils import class_weight
import numpy as np
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

es = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
mc = ModelCheckpoint('best_model.h5', monitor='val_loss', save_best_only=True)

X_train, X_val, y_train_int, y_val_int = train_test_split(
    X_pad, y_enc, test_size=0.1, random_state=42, stratify=y_enc
)


classes = np.unique(y_train_int)
cw = class_weight.compute_class_weight(
    class_weight='balanced',
    classes=classes,
    y=y_train_int
)

# Trasforma in dizionario {classe: peso}
class_weight_dict = dict(zip(classes, cw))
print(class_weight_dict)

history = model.fit(
    X_train, y_train_int,
    validation_data=(X_val, y_val_int),
    epochs=10,
    batch_size=128,
    class_weight=class_weight_dict,
    #callbacks=[es, mc],
    verbose=2
)

{0: 2.12551867219917, 1: 3.126653102746694, 2: 0.4525509828462048}
Epoch 1/10
145/145 - 33s - 226ms/step - accuracy: 0.7281 - loss: 0.8971 - val_accuracy: 0.7995 - val_loss: 0.6905
Epoch 2/10
145/145 - 32s - 223ms/step - accuracy: 0.7509 - loss: 0.8605 - val_accuracy: 0.8137 - val_loss: 0.6411
Epoch 3/10
145/145 - 39s - 268ms/step - accuracy: 0.7610 - loss: 0.8490 - val_accuracy: 0.7907 - val_loss: 0.6813
Epoch 4/10
145/145 - 36s - 248ms/step - accuracy: 0.7365 - loss: 0.8456 - val_accuracy: 0.7322 - val_loss: 0.7169
Epoch 5/10
145/145 - 39s - 266ms/step - accuracy: 0.7244 - loss: 0.8254 - val_accuracy: 0.7976 - val_loss: 0.7015
Epoch 6/10
145/145 - 41s - 282ms/step - accuracy: 0.7288 - loss: 0.8267 - val_accuracy: 0.8073 - val_loss: 0.6072
Epoch 7/10
145/145 - 41s - 282ms/step - accuracy: 0.6123 - loss: 0.8395 - val_accuracy: 0.7990 - val_loss: 0.8327
Epoch 8/10
145/145 - 40s - 277ms/step - accuracy: 0.7102 - loss: 0.8227 - val_accuracy: 0.8005 - val_loss: 0.5867
Epoch 9/10
145/145 - 

In [45]:
from sklearn.metrics import classification_report
import numpy as np

# 1. Predizioni del modello (softmax output)
y_pred_prob = model.predict(X_test_w2v)

# 2. Converti in label intere (indice della classe con probabilità massima)
y_pred_int = np.argmax(y_pred_prob, axis=1)

# 3. Classification report
print(classification_report(y_val_int, y_pred_int, digits=4))


[1m129/129[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 30ms/step


ValueError: Found input variables with inconsistent numbers of samples: [2050, 4099]

In [24]:
try:
    nlp = spacy.load("en_core_web_sm", disable=["tagger", "parser", "lemmatizer"])
    SPACY_LOADED = True
except OSError:
    print("ERRORE: Modello SpaCy 'en_core_web_sm' non trovato. Saltando l'analisi NER.")
    SPACY_LOADED = False
    
if SPACY_LOADED:
    
    # 3. Funzione per Eseguire la NER
    def extract_entities_pipe(texts):
        results = []
        for doc in nlp.pipe(texts, batch_size=50, n_process=-1):
            entities = []
            for ent in doc.ents:
                if ent.label_ in ['ORG', 'GPE', 'LOC', 'FAC']:
                    cleaned_text = re.sub(r'[^a-zA-Z\s]', '', ent.text).strip()
                    if cleaned_text:
                        entities.append((cleaned_text, ent.label_))
            results.append(entities)
        return results



    # 4. Applicazione della NER (Limita a 2000 recensioni per velocità)
    df['Entities'] = extract_entities_pipe(df['Review'].astype(str))

    # 5. Aggregazione dei Risultati
    all_entities = [ent for sublist in df['Entities'] for ent in sublist]
    
    # Contiamo le entità per tipo
    entity_type_counts = Counter([label for text, label in all_entities])
    
    # Contiamo le top 10 entità in generale
    entity_text_counts = Counter([text for text, label in all_entities])
    
    
    # 6. Output dei Risultati
    print("### Named Entity Recognition (NER) Risultati ###")
    
    print("\nTop 5 Tipi di Entità Trovate:")
    entity_type_df = pd.DataFrame(entity_type_counts.most_common(5), columns=['Tipo Entità', 'Frequenza'])
    print(entity_type_df.to_string(index=False))
    
    print("\nTop 10 Entità Nominate (Generalizzate):")
    # Escludiamo Stop Words/Nomi Comuni per ottenere veri nomi
    top_entities = [(text, count) for text, count in entity_text_counts.most_common(50) if text.lower() not in ['hotel', 'room', 'staff', 'us', 'place', 'day', 'night', 'time']]
    top_entities_df = pd.DataFrame(top_entities[:10], columns=['Nome Entità', 'Frequenza'])

    print(top_entities_df.to_string(index=False))

    print("\nTop 5 Entità per Tipo:\n")

stop_entities = {'hotel', 'room', 'staff', 'us', 'place', 'day', 'night', 'time','inn','can'}

"""
for entity_type in ['ORG', 'GPE', 'LOC', 'FAC']:
    
    # Filtra solo le entità di quel tipo
    filtered = [
        text for text, label in all_entities
        if label == entity_type and text.lower() not in stop_entities
    ]
    
    if not filtered:
        continue

    counts = Counter(filtered)
    top_5 = counts.most_common(5)

    print(f"▶ {entity_type}")
    for text, count in top_5:
        print(f"  - {text}: {count}")
    print()
"""

rows = []

for entity_type in ['PERSON', 'ORG', 'GPE', 'LOC', 'FAC']:
    filtered = [
        text for text, label in all_entities
        if label == entity_type and text.lower() not in stop_entities
    ]
    counts = Counter(filtered)
    for text, count in counts.most_common(5):
        rows.append((entity_type, text, count))

top_by_type_df = pd.DataFrame(
    rows, columns=['Tipo Entità', 'Nome Entità', 'Frequenza']
)

print(top_by_type_df.to_string(index=False))

### Named Entity Recognition (NER) Risultati ###

Top 5 Tipi di Entità Trovate:
Tipo Entità  Frequenza
        GPE      14519
        ORG       8423
        FAC       1828
        LOC       1240

Top 10 Entità Nominate (Generalizzate):
  Nome Entità  Frequenza
        paris       1164
        metro        894
       london        719
     san juan        694
     new york        625
san francisco        508
    hong kong        412
  new orleans        398
       europe        385
        tokyo        369

Top 5 Entità per Tipo:

Tipo Entità        Nome Entità  Frequenza
        ORG             westin        252
        ORG               ritz        179
        ORG                 ac        105
        ORG                cnn         77
        ORG        continental         75
        GPE              paris       1161
        GPE             london        719
        GPE           san juan        684
        GPE           new york        625
        GPE      san francisco        505
  

In [28]:
nlp = spacy.load("en_core_web_sm")

all_keywords_counter = Counter()

for text in df['Review'].astype(str):
    doc = textacy.make_spacy_doc(text, lang=nlp)
    keywords = textacy.extract.keyterms.textrank(doc, topn=10)
    for term, score in keywords:
        all_keywords_counter[term] += score

top_5_global = all_keywords_counter.most_common(5)
print(top_5_global)


[('great hotel', 23.526087952468885), ('great location', 15.060921710538748), ('star hotel', 14.152137437381384), ('good hotel', 12.596507377714472), ('nice hotel', 11.192925535663923)]
