In [1]:
# Run the numeric dataset creation and inspect results
from IPython.display import display
import pandas as pd

from ium_long_stay_patterns.src.helpers.create_numerical_dataset import create_numerical_dataset, merge_with_stats
from ium_long_stay_patterns.config import ProcessedCSV

[32m2026-01-03 18:00:08.097[0m | [1mINFO    [0m | [36mium_long_stay_patterns.config[0m:[36m<module>[0m:[36m12[0m - [1mPROJ_ROOT path is: /home/matimat/IUM/ium-long-stay-patterns[0m


In [5]:
# Build the numeric-only DataFrame from the processed listings CSV.
df_numeric = create_numerical_dataset(
    ProcessedCSV.LISTINGS.path,
)

# Report the table dimensions, then preview the first five rows.
print("Shape:", df_numeric.shape)
display(df_numeric.iloc[:5])


Shape: (1464, 19)


Unnamed: 0,id,host_id,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,host_total_listings_count,host_verifications,latitude,longitude,accommodates,bathrooms,bedrooms,beds,price,number_of_reviews,instant_bookable,calculated_host_listings_count,reviews_per_month
0,30419466,135482103,0.86,1.0,1,20,24,2,37.97251,23.72772,10,3.0,4.0,7.0,432.0,181,1,12,2.51
1,49982681,31290848,1.0,1.0,0,109,119,3,37.97536,23.73172,2,1.0,1.0,2.0,72.0,22,1,70,0.53
2,48800718,113548208,1.0,1.0,0,237,241,3,37.96341,23.71779,7,2.5,3.0,3.0,151.0,138,1,90,3.08
3,1101264403993187936,118181401,1.0,1.0,1,8,9,2,37.98059,23.71446,7,3.5,3.0,5.0,600.0,35,1,8,3.56
4,883790710439607743,4899687,1.0,0.99,1,3,3,2,37.978738,23.73873,4,2.0,2.0,2.0,207.0,74,1,1,3.76


In [8]:
# Plain-text dump of the first row; print() wraps the wide frame across several lines.
print(df_numeric.iloc[:1])

         id    host_id  host_response_rate  host_acceptance_rate  \
0  30419466  135482103                0.86                   1.0   

   host_is_superhost  host_listings_count  host_total_listings_count  \
0                  1                   20                         24   

   host_verifications  latitude  longitude  accommodates  bathrooms  bedrooms  \
0                   2  37.97251   23.72772            10        3.0       4.0   

   beds  price  number_of_reviews  instant_bookable  \
0   7.0  432.0                181                 1   

   calculated_host_listings_count  reviews_per_month  
0                              12               2.51  


In [7]:
# Missing-value audit of the numeric DataFrame, followed by summary statistics.

# Per-column NaN counts (largest first) and the same counts as a percentage of all rows.
nan_counts = df_numeric.isna().sum().sort_values(ascending=False)
nan_percent = nan_counts.div(len(df_numeric)).mul(100).round(2)

# Place both series side by side under descriptive column names.
nan_summary = pd.concat(
    [nan_counts, nan_percent],
    axis=1,
    keys=["n_missing", "pct_missing"],
)

print("--- Missing values per column (desc) ---")
display(nan_summary)

# Identifier and coordinate columns are left out of the descriptive statistics.
print("--- Basic stats ---")
cols_to_exclude = ["id", "host_id", "latitude", "longitude"]
stats_input = df_numeric.drop(columns=cols_to_exclude)
display(stats_input.describe())

--- Missing values per column (desc) ---


Unnamed: 0,n_missing,pct_missing
reviews_per_month,239,16.33
host_response_rate,176,12.02
beds,98,6.69
bathrooms,96,6.56
price,96,6.56
host_acceptance_rate,83,5.67
bedrooms,5,0.34
host_id,0,0.0
id,0,0.0
host_listings_count,0,0.0


--- Basic stats ---


Unnamed: 0,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,host_total_listings_count,host_verifications,accommodates,bathrooms,bedrooms,beds,price,number_of_reviews,instant_bookable,calculated_host_listings_count,reviews_per_month
count,1288.0,1381.0,1464.0,1464.0,1464.0,1464.0,1464.0,1368.0,1459.0,1366.0,1368.0,1464.0,1464.0,1464.0,1225.0
mean,0.981506,0.944757,0.411885,27.719945,38.612705,2.072404,3.689208,1.214181,1.413297,1.99634,100.635965,53.013661,0.659836,14.160519,1.757992
std,0.100442,0.172961,0.492343,77.887063,123.300974,0.442392,1.974823,0.497797,0.846109,1.40146,184.115213,90.577096,0.473926,22.611269,1.677437
min,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,15.0,0.0,0.0,1.0,0.01
25%,1.0,0.99,0.0,2.0,2.0,2.0,2.0,1.0,1.0,1.0,47.0,2.0,0.0,1.0,0.48
50%,1.0,1.0,0.0,6.0,7.0,2.0,4.0,1.0,1.0,2.0,68.0,15.0,1.0,4.0,1.14
75%,1.0,1.0,1.0,24.25,29.25,2.0,4.0,1.0,2.0,2.0,110.0,60.0,1.0,15.0,2.63
max,1.0,1.0,1.0,906.0,2058.0,3.0,16.0,5.0,7.0,18.0,5844.0,831.0,1.0,125.0,10.77


Część danych zawiera wartości NaN (z powodu braków w danych lub jawnego oznaczenia braku jako 'N/A'), więc odpowiednio je obsłużymy.



# Strategia
1. reviews_per_month -> fill(0)
2. host_response_rate & host_acceptance_rate -> average()
3. beds, bedrooms, bathrooms -> mediana
4. price -> drop, czyli usunięcie wierszy z brakującą ceną (bo cena jest dość mocno skorelowana ze zmienną celu, więc nie powinniśmy wprowadzać szumu)

In [4]:
# Build the numeric-only DataFrame again, this time with the NaN-handling
# strategy switched on.
df_numeric_v2 = create_numerical_dataset(
    ProcessedCSV.LISTINGS.path,
    strategy=True,
)

# Check whether any missing values are left: per-column NaN counts.
print(df_numeric_v2.isna().sum())


id                                0
host_id                           0
host_response_rate                0
host_acceptance_rate              0
host_is_superhost                 0
host_listings_count               0
host_total_listings_count         0
host_verifications                0
latitude                          0
longitude                         0
accommodates                      0
bathrooms                         0
bedrooms                          0
beds                              0
price                             0
number_of_reviews                 0
instant_bookable                  0
calculated_host_listings_count    0
reviews_per_month                 0
dtype: int64


# Łączymy atrybuty z listings ze statystykami

In [2]:
# 1. Build the NaN-handled numeric dataset. It is bound to `df_numeric_v2`
#    (the name already used for the strategy=True variant) rather than
#    `df_numeric`, so the raw frame that the earlier inspection cells read is
#    not silently replaced when cells are re-run or run out of order.
df_numeric_v2 = create_numerical_dataset(ProcessedCSV.LISTINGS.path, strategy=True)

# 2. Merge with the statistics and add the target column
df_final = merge_with_stats(df_numeric_v2)

# 3. Verification: resulting shape and value counts of the target
print(f"Kształt danych: {df_final.shape}")
print("\nRozkład długich pobytów:")
print(df_final['target'].value_counts())

# Lift the column-display limit so the preview shows every column
with pd.option_context('display.max_columns', None):
    display(df_final.head())

Kształt danych: (1368, 19)

Rozkład długich pobytów:
target
0    994
1    374
Name: count, dtype: int64


Unnamed: 0,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,host_total_listings_count,host_verifications,latitude,longitude,accommodates,bathrooms,bedrooms,beds,price,number_of_reviews,instant_bookable,calculated_host_listings_count,reviews_per_month,total_bookings,target
0,0.86,1.0,1,20,24,2,37.97251,23.72772,10,3.0,4.0,7.0,432.0,181,1,12,2.51,19,0
1,1.0,1.0,0,109,119,3,37.97536,23.73172,2,1.0,1.0,2.0,72.0,22,1,70,0.53,4,1
2,1.0,1.0,0,237,241,3,37.96341,23.71779,7,2.5,3.0,3.0,151.0,138,1,90,3.08,15,0
3,1.0,1.0,1,8,9,2,37.98059,23.71446,7,3.5,3.0,5.0,600.0,35,1,8,3.56,2,0
4,1.0,0.99,1,3,3,2,37.978738,23.73873,4,2.0,2.0,2.0,207.0,74,1,1,3.76,7,0
