Basic Model

In [25]:
import pandas as pd
import numpy as np
import os
import re

In [26]:
df_en = pd.read_csv("artifacts/preprocessed_comments_all_eng.csv")
print("Saved clean dataset:", df_en.shape)

Saved clean dataset: (1166207, 6)


In [27]:
#segmentujacja na zdania

import re

def simple_sentence_split(text: str):
    parts = re.split(r'(?<=[.!?])\s+|\n+', str(text))
    parts = [p.strip() for p in parts if p and p.strip()]
    return parts

df_en["sentences"] = df_en["text_clean"].apply(simple_sentence_split)
df_sent = df_en.explode("sentences").rename(columns={"sentences":"sentence"}).reset_index(drop=True)

# filtr ultra kr√≥tkich zda≈Ñ
df_sent = df_sent[df_sent["sentence"].str.len() >= 10].reset_index(drop=True)
df_sent[["listing_id","sentence","date"]].head(10), df_sent.shape


(   listing_id                                           sentence        date
 0        2737  this spot was in a great and nice area,walking...  2014-12-26
 1        2737     the room, that i rented, was clean and warm :)  2014-12-26
 2        2737  elif was really very helpful, while arranging ...  2015-01-04
 3        2737  the room was clean and cosy, with all the furn...  2015-01-04
 4        2737  picchioni was really helpful and helped me to ...  2015-01-04
 5        2737  elif's flat is spacious, clean, and had everyt...  2015-04-12
 6        2737  i did not get to meet elif, but she was patien...  2015-04-12
 7        2737  i met daniele, her husband, who clearly loves ...  2015-04-12
 8        2737  excellent hosts that made me want to live in i...  2015-04-12
 9        2737  i had a wonderful time staying with elif and h...  2015-04-22,
 (4885977, 7))

In [28]:
df_doc = df_sent[["listing_id", "sentence", "date"]].copy()

df_doc["sentence"] = df_doc["sentence"].astype(str).str.strip()
df_doc = df_doc[df_doc["sentence"].str.len() >= 10].copy()
df_doc.shape, df_doc.head(5)


((4177443, 3),
    listing_id                                           sentence        date
 0        2737  this spot was in a great and nice area,walking...  2014-12-26
 1        2737     the room, that i rented, was clean and warm :)  2014-12-26
 2        2737  elif was really very helpful, while arranging ...  2015-01-04
 3        2737  the room was clean and cosy, with all the furn...  2015-01-04
 4        2737  picchioni was really helpful and helped me to ...  2015-01-04)

In [29]:
# N_SENT = 200_000  # na start
# df_doc_small = df_doc.sample(n=min(N_SENT, len(df_doc)), random_state=42).reset_index(drop=True)
# df_doc_small.shape
df_doc_small = df_doc

In [30]:
import os
os.makedirs("artifacts", exist_ok=True)
df_doc_small.to_csv("artifacts/sentences_for_aspects_all_eng.csv", index=False)

Read Data

In [1]:
import pandas as pd

# SAMPLE_PERCENT = 30 
SAMPLE_PERCENT = 100 

df_doc_full = pd.read_csv("artifacts/sentences_for_aspects_all_eng.csv")

if SAMPLE_PERCENT < 100:
    n_sample = int(len(df_doc_full) * SAMPLE_PERCENT / 100)
    df_doc_small = df_doc_full.sample(n=n_sample, random_state=42).reset_index(drop=True)
    print(f"Used {SAMPLE_PERCENT}% of data: {len(df_doc_small)} out of {len(df_doc_full)}")
else:
    df_doc_small = df_doc_full
    print(f"Used all data: {len(df_doc_small)} rows")

df_doc_small.head(5)
print(df_doc_small.shape)


Used all data: 4177443 rows
(4177443, 3)


Wektoryzacja

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(
    max_features=25000,
    ngram_range=(1, 2),
    min_df=5,
    stop_words="english"
)

X_tfidf = tfidf.fit_transform(df_doc_small["sentence"])
print(f"Dim of  TF-IDF: {X_tfidf.shape}")
print(f"Number of unique features: {len(tfidf.get_feature_names_out())}")

Dim of  TF-IDF: (4177443, 25000)
Number of unique features: 25000


In [3]:
from sklearn.cluster import KMeans
import time

K = 22

print(f"Train K-Means with {K} clusters")
start = time.time()

kmeans = KMeans(
    n_clusters=K,
    random_state=42,
    n_init=10,
    max_iter=1000,
    verbose=0
)

df_doc_small["cluster"] = kmeans.fit_predict(X_tfidf)
print(f"Training completed: {time.time() - start:.2f}s")
print(f"\nCluster distribution:")
print(df_doc_small["cluster"].value_counts().sort_index())

Train K-Means with 22 clusters
Training completed: 1079.11s

Cluster distribution:
cluster
0      119500
1       82837
2      139716
3      193006
4      199276
5       64121
6      126231
7      272016
8       54208
9      135087
10      68050
11      66504
12     103598
13      84838
14     167102
15      69423
16      87294
17    1565984
18     153030
19      84162
20     123819
21     217641
Name: count, dtype: int64


In [4]:
for c in range(K):
    print(f"CLUSTER {c} ({len(df_doc_small[df_doc_small['cluster'] == c])} sentences)")
    cluster_data = df_doc_small[df_doc_small["cluster"] == c]
    if len(cluster_data) > 0:
        examples = cluster_data["sentence"].sample(min(10, len(cluster_data)), random_state=42)
        for i, s in enumerate(examples, 1):
            print(f"{i}. {s}")

CLUSTER 0 (119500 sentences)
1. roberto‚Äôs place was very comfortable, it had everything you could need.
2. we found the apartment to be clean, comfortable and quite large for roma.
3. and the apartment was clean, comfortable, and felt perfect for our family.
4. the living space was wonderful: very clean and comfortable.
5. the apartment was clean, modern, comfortable and everything was very nice.
6. the apartment is super clean, very nicely designed and very comfortable to stay.
7. this can be due to the weather conditions but it was not really comfortable.
8. apartment spacious and comfortable.
9. it is a lovely apartment that is well furnished, spacious, very comfortable and in a quiet neighbourhood.
10. central location and comfortable space.
CLUSTER 1 (82837 sentences)
1. the apartment is beautiful and bed comfortable.
2. it was beautiful and quiet and so clean.
3. beautiful place, nicely decorated.
4. the place was clean, beautiful and safe.
5. it's a beautiful place to stay dur

In [5]:
import pandas as pd

CLUSTER_TO_ASPECT = {
    17: None, 
    3: "overall_positive",
    4: "overall_positive",
    10: "overall_positive",
    13: "overall_positive",
    15: "overall_positive", 
    20: "overall_positive", 
    21: "overall_positive", 
    8: "overall_positive",
    
    0: "cleanliness", 
    18: "cleanliness", 
    2: "dining_entertainment",
    6: "walkability",
    19: "walkability",
    16: "transport_connectivity",
    9: "location_close_center", 
    11: "apartment_capacity",
    1: "apartment_appearance",
    5: "atmosphere_ambiance",
    7: "host_and_property", 
    14: "host_communication", 
    12: "arrival_check_in_experience",
}

df_doc_small["aspect"] = df_doc_small["cluster"].map(CLUSTER_TO_ASPECT)
df_doc_small_clean = df_doc_small[df_doc_small["aspect"].notna()].copy()
aspect_counts = df_doc_small_clean["aspect"].value_counts().sort_values(ascending=False)
print(aspect_counts)

print(f"\nTotal aspects: {df_doc_small_clean['aspect'].nunique()}")
print(f"Total sentences kept: {len(df_doc_small_clean):,} ({len(df_doc_small_clean)/len(df_doc_small)*100:.1f}%)")

aspect
overall_positive               1010261
cleanliness                     272530
host_and_property               272016
walkability                     210393
host_communication              167102
dining_entertainment            139716
location_close_center           135087
arrival_check_in_experience     103598
transport_connectivity           87294
apartment_appearance             82837
apartment_capacity               66504
atmosphere_ambiance              64121
Name: count, dtype: int64

Total aspects: 12
Total sentences kept: 2,611,459 (62.5%)


In [6]:
df_doc_small.to_csv("artifacts/BASE_sentences_for_aspects_labeled_eng.csv", index=False)

Analiza sentymentu zda≈Ñ

In [7]:
import pandas as pd
df_doc_small = pd.read_csv("artifacts/BASE_sentences_for_aspects_labeled_eng.csv")


In [8]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from tqdm import tqdm

analyzer = SentimentIntensityAnalyzer()

def vader_sentiment(text):
    score = analyzer.polarity_scores(str(text))["compound"]
    if score >= 0.05:
        return "positive"
    elif score <= -0.05:
        return "negative"
    else:
        return "neutral"

print("VADER sentiment analysis")
tqdm.pandas(desc="VADER sentiment")
df_doc_small["sentiment"] = df_doc_small["sentence"].progress_apply(vader_sentiment)

print("\nSentiment distribution:")
print(df_doc_small["sentiment"].value_counts(normalize=True))
df_doc_small.to_csv("artifacts/BASE_sentences_for_aspects_eng_WITH_ASPECTS.csv", index=False)


VADER sentiment analysis


VADER sentiment: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 4177443/4177443 [06:34<00:00, 10587.28it/s]



Sentiment distribution:
sentiment
positive    0.757152
neutral     0.192300
negative    0.050547
Name: proportion, dtype: float64


Pivot

In [9]:
baseline_agg = (
    df_doc_small
    .dropna(subset=["aspect"])
    .groupby(["listing_id", "aspect", "sentiment", "date"])
    .size()
    .reset_index(name="count")
)

print(f"Baseline agregacja: {baseline_agg.shape}")
baseline_agg.head(10)

Baseline agregacja: (2254453, 5)


Unnamed: 0,listing_id,aspect,sentiment,date,count
0,2737,cleanliness,positive,2015-01-04,1
1,2737,cleanliness,positive,2015-04-22,1
2,2737,host_and_property,positive,2015-04-22,1
3,2737,host_communication,positive,2015-01-04,2
4,2737,walkability,positive,2014-12-26,1
5,3079,apartment_appearance,positive,2012-09-17,1
6,3079,arrival_check_in_experience,neutral,2012-04-13,1
7,3079,arrival_check_in_experience,positive,2014-06-03,1
8,3079,cleanliness,positive,2015-09-28,1
9,3079,dining_entertainment,negative,2013-10-14,1


In [10]:
baseline_pivot = (
    baseline_agg
    .pivot_table(
        index=["listing_id", "aspect", "date"],
        columns="sentiment",
        values="count",
        fill_value=0
    )
    .reset_index()
)

for col in ["positive", "neutral", "negative"]:
    if col not in baseline_pivot.columns:
        baseline_pivot[col] = 0

baseline_pivot["score"] = (
    baseline_pivot["positive"]
    - baseline_pivot["negative"]
)

baseline_pivot["total_mentions"] = (
    baseline_pivot["positive"]
    + baseline_pivot["neutral"]
    + baseline_pivot["negative"]
)

print(f"Baseline pivot: {baseline_pivot.shape}")
baseline_pivot.head(10)

Baseline pivot: (2150670, 8)


sentiment,listing_id,aspect,date,negative,neutral,positive,score,total_mentions
0,2737,cleanliness,2015-01-04,0.0,0.0,1.0,1.0,1.0
1,2737,cleanliness,2015-04-22,0.0,0.0,1.0,1.0,1.0
2,2737,host_and_property,2015-04-22,0.0,0.0,1.0,1.0,1.0
3,2737,host_communication,2015-01-04,0.0,0.0,2.0,2.0,2.0
4,2737,walkability,2014-12-26,0.0,0.0,1.0,1.0,1.0
5,3079,apartment_appearance,2012-09-17,0.0,0.0,1.0,1.0,1.0
6,3079,arrival_check_in_experience,2012-04-13,0.0,1.0,0.0,0.0,1.0
7,3079,arrival_check_in_experience,2014-06-03,0.0,0.0,1.0,1.0,1.0
8,3079,cleanliness,2015-09-28,0.0,0.0,1.0,1.0,1.0
9,3079,dining_entertainment,2013-10-14,1.0,0.0,0.0,-1.0,1.0


In [11]:
def ranking_for_listing(df, listing_id, top_k=3):
    sub = df[df["listing_id"] == listing_id]
    return (
        sub.sort_values("score", ascending=False).head(top_k),
        sub.sort_values("score", ascending=True).head(top_k)
    )

In [12]:
listing = df_doc_small["listing_id"].sample(1).iloc[0]
ranking_for_listing(baseline_pivot, listing)

(sentiment  listing_id            aspect        date  negative  neutral  \
 755703       10981189  overall_positive  2022-06-20       0.0      0.0   
 755555       10981189  overall_positive  2017-01-10       0.0      1.0   
 755598       10981189  overall_positive  2018-03-12       0.0      1.0   
 
 sentiment  positive  score  total_mentions  
 755703          6.0    6.0             6.0  
 755555          5.0    5.0             6.0  
 755598          5.0    5.0             6.0  ,
 sentiment  listing_id             aspect        date  negative  neutral  \
 755316       10981189  host_and_property  2016-09-08       1.0      0.0   
 755360       10981189  host_and_property  2019-10-22       1.0      0.0   
 755686       10981189   overall_positive  2021-12-19       2.0      0.0   
 
 sentiment  positive  score  total_mentions  
 755316          0.0   -1.0             1.0  
 755360          0.0   -1.0             1.0  
 755686          1.0   -1.0             3.0  )

In [13]:
# Save pivot tables
import os
os.makedirs("artifacts/ab_test", exist_ok=True)

baseline_pivot.to_csv("artifacts/ab_test/model_baseline.csv", index=False)

print("Saved model data for A/B testing:")
print("Model Baseline: artifacts/ab_test/model_baseline.csv")

Saved model data for A/B testing:
Model Baseline: artifacts/ab_test/model_baseline.csv


In [14]:
import os
pd.read_csv("artifacts/ab_test/model_baseline.csv").head()


Unnamed: 0,listing_id,aspect,date,negative,neutral,positive,score,total_mentions
0,2737,cleanliness,2015-01-04,0.0,0.0,1.0,1.0,1.0
1,2737,cleanliness,2015-04-22,0.0,0.0,1.0,1.0,1.0
2,2737,host_and_property,2015-04-22,0.0,0.0,1.0,1.0,1.0
3,2737,host_communication,2015-01-04,0.0,0.0,2.0,2.0,2.0
4,2737,walkability,2014-12-26,0.0,0.0,1.0,1.0,1.0
