In [8]:
import pandas as pd
import numpy as np
import json
import re
from collections import Counter

In [9]:
df = pd.read_csv('../../IUM25Z_Zad_07_01_v2/listings.csv/listings.csv')

print(f"Liczba rekordów: {len(df)}")
print(f"\nPrzykładowe amenities:")
print(df['amenities'].head(3))


Liczba rekordów: 1464

Przykładowe amenities:
0    ["Coffee", "Outlet covers", "Dishes and silver...
1    ["Heating - split type ductless system", "Hair...
2    ["Coffee", "Dishes and silverware", "Cleaning ...
Name: amenities, dtype: object


In [10]:
def parse_amenities(amenities_str):
    """Parsuje string z amenities do listy"""
    if pd.isna(amenities_str):
        return []
    try:
        # Usuń cudzysłowy i parsuj jako JSON
        amenities_list = json.loads(amenities_str)
        return [a.strip() for a in amenities_list if a.strip()]
    except:
        return []

df['amenities_list'] = df['amenities'].apply(parse_amenities)

In [11]:
all_amenities = []
for amen_list in df['amenities_list']:
    all_amenities.extend(amen_list)

amenities_counter = Counter(all_amenities)
print(f"\n{'='*60}")
print(f"STATYSTYKI AMENITIES")
print(f"{'='*60}")
print(f"Unikalne amenities: {len(amenities_counter)}")
print(f"\nTop 30 najpopularniejszych:")
for amenity, count in amenities_counter.most_common(30):
    print(f"{amenity:.<50} {count:>5} ({count/len(df)*100:.1f}%)")


STATYSTYKI AMENITIES
Unikalne amenities: 915

Top 30 najpopularniejszych:
Wifi..............................................  1333 (91.1%)
Kitchen...........................................  1296 (88.5%)
Hair dryer........................................  1285 (87.8%)
Hot water.........................................  1237 (84.5%)
Hangers...........................................  1211 (82.7%)
Iron..............................................  1201 (82.0%)
Dishes and silverware.............................  1175 (80.3%)
Bed linens........................................  1150 (78.6%)
Essentials........................................  1124 (76.8%)
Cooking basics....................................  1113 (76.0%)
Shampoo...........................................  1062 (72.5%)
Refrigerator......................................  1024 (69.9%)
Air conditioning..................................  1007 (68.8%)
Hot water kettle..................................   927 (63.3%)
TV.............

In [None]:
def standardize_amenity(amenity):
    """Standaryzuje nazwę amenity"""
    amenity = amenity.lower().strip()

    # Usunięcie szczegółów w nawiasach i po myślniku
    amenity = re.sub(r'\s*[\-–]\s*.*$', '', amenity)
    amenity = re.sub(r'\s*\(.*?\)', '', amenity)
    amenity = re.sub(r'\s*:.*$', '', amenity)

    # Mapowanie synonimiów
    synonyms = {
        'wifi': ['wi-fi', 'wireless internet', 'internet', 'fast wifi'],
        'tv': ['television', 'hdtv', 'smart tv', 'tv with'],
        'ac': ['air conditioning', 'ac ', 'window ac', 'central air'],
        'heating': ['heat', 'central heating', 'heating '],
        'kitchen': ['full kitchen', 'kitchenette'],
        'washer': ['washing machine', 'free washer'],
        'dryer': ['free dryer', 'drying machine'],
        'parking': ['free parking', 'paid parking', 'street parking', 'garage'],
        'pool': ['swimming pool', 'shared pool', 'private pool'],
        'gym': ['fitness center', 'workout room'],
        'elevator': ['lift'],
        'workspace': ['dedicated workspace', 'desk'],
    }

    for standard, variants in synonyms.items():
        if any(var in amenity for var in variants):
            return standard

    return amenity

In [None]:
CATEGORIES = {
    'connectivity': ['wifi', 'ethernet', 'fast wifi'],
    'entertainment': ['tv', 'netflix', 'cable', 'sound system', 'bluetooth'],
    'kitchen': ['kitchen', 'refrigerator', 'microwave', 'oven', 'stove',
                'dishwasher', 'coffee maker', 'toaster', 'blender', 'freezer',
                'cooking basics', 'dishes', 'wine glasses'],
    'laundry': ['washer', 'dryer', 'drying rack', 'iron', 'laundromat'],
    'climate': ['ac', 'heating', 'fan', 'ceiling fan'],
    'bathroom': ['hair dryer', 'shampoo', 'shower gel', 'body soap',
                 'conditioner', 'bathtub', 'essentials'],
    'sleeping': ['bed linens', 'extra pillows', 'hangers', 'clothing storage'],
    'outdoor': ['patio', 'balcony', 'garden', 'bbq', 'outdoor furniture', 'backyard'],
    'parking': ['parking', 'garage'],
    'family': ['crib', 'high chair', 'children', 'pack', 'outlet covers'],
    'workspace': ['workspace', 'desk', 'dedicated workspace'],
    'accessibility': ['elevator', 'single level', 'accessible'],
    'safety': ['smoke alarm', 'carbon monoxide', 'first aid', 'fire extinguisher', 'safe'],
    'convenience': ['self check-in', 'lockbox', 'private entrance', 'luggage dropoff'],
    'premium': ['pool', 'gym', 'hot tub', 'fireplace', 'view']
}

def categorize_amenity(amenity):
    """Przypisuje amenity do kategorii"""
    amenity_lower = amenity.lower()
    categories_found = []

    for category, keywords in CATEGORIES.items():
        if any(keyword in amenity_lower for keyword in keywords):
            categories_found.append(category)

    return categories_found if categories_found else ['other']


In [None]:
# Standaryzacja wszystkich amenities
df['amenities_standardized'] = df['amenities_list'].apply(
    lambda x: [standardize_amenity(a) for a in x]
)

# Liczba amenities
df['amenities_count'] = df['amenities_list'].apply(len)

# Binary features dla TOP amenities (występujących w >20% ofert)
threshold = len(df) * 0.2
top_amenities = [a for a, count in amenities_counter.most_common() if count > threshold]

print(f"\n{'='*60}")
print(f"TWORZENIE BINARY FEATURES")
print(f"{'='*60}")
print(f"Amenities występujące w >20% ofert: {len(top_amenities)}")

for amenity in top_amenities:
    column_name = f'has_{standardize_amenity(amenity).replace(" ", "_")}'
    df[column_name] = df['amenities_list'].apply(
        lambda x: 1 if any(amenity.lower() in a.lower() for a in x) else 0
    )

# Kategorie (liczba amenities w każdej kategorii)
for category in CATEGORIES.keys():
    df[f'amenities_{category}_count'] = df['amenities_standardized'].apply(
        lambda amen_list: sum(
            1 for a in amen_list
            if category in categorize_amenity(a)
        )
    )

# Binary: czy ma przynajmniej 1 amenity z kategorii
for category in CATEGORIES.keys():
    df[f'has_{category}'] = (df[f'amenities_{category}_count'] > 0).astype(int)


TWORZENIE BINARY FEATURES
Amenities występujące w >20% ofert: 47


In [15]:
luxury_keywords = ['pool', 'hot tub', 'gym', 'concierge', 'sauna', 'wine', 'netflix']
df['luxury_amenities_count'] = df['amenities_list'].apply(
    lambda x: sum(1 for a in x if any(kw in a.lower() for kw in luxury_keywords))
)

# Rodzinne amenities
family_keywords = ['crib', 'high chair', 'children', 'pack', 'playground']
df['family_amenities_count'] = df['amenities_list'].apply(
    lambda x: sum(1 for a in x if any(kw in a.lower() for kw in family_keywords))
)

In [16]:

print(f"\n{'='*60}")
print(f"PODSUMOWANIE STWORZONYCH FEATURES")
print(f"{'='*60}")

# Kolumny z amenities
amenities_columns = [col for col in df.columns if 'amenities' in col or 'has_' in col]
print(f"\nStworzono {len(amenities_columns)} nowych kolumn:")
print(f"  - amenities_count: liczba wszystkich amenities")
print(f"  - has_[amenity]: {len([c for c in amenities_columns if c.startswith('has_')])} binary features")
print(f"  - amenities_[category]_count: liczba amenities w kategorii")
print(f"  - luxury_amenities_count, family_amenities_count")

# Statystyki
print(f"\n{'='*60}")
print(f"STATYSTYKI OPISOWE")
print(f"{'='*60}")
print(df[['amenities_count', 'luxury_amenities_count', 'family_amenities_count']].describe())

# Top korelacje z liczbą amenities
numeric_cols = df.select_dtypes(include=[np.number]).columns
if 'price' in df.columns:
    df['price_numeric'] = df['price'].str.replace('$', '').str.replace(',', '').astype(float)
    print(f"\nKorelacja amenities_count z ceną: {df['amenities_count'].corr(df['price_numeric']):.3f}")



PODSUMOWANIE STWORZONYCH FEATURES

Stworzono 81 nowych kolumn:
  - amenities_count: liczba wszystkich amenities
  - has_[amenity]: 59 binary features
  - amenities_[category]_count: liczba amenities w kategorii
  - luxury_amenities_count, family_amenities_count

STATYSTYKI OPISOWE
       amenities_count  luxury_amenities_count  family_amenities_count
count      1464.000000             1464.000000             1464.000000
mean         33.285519                0.739754                0.693306
std          13.980375                0.779017                1.085949
min           0.000000                0.000000                0.000000
25%          24.000000                0.000000                0.000000
50%          34.000000                1.000000                0.000000
75%          43.000000                1.000000                1.000000
max          77.000000                5.000000                5.000000

Korelacja amenities_count z ceną: 0.060


In [None]:
df_final = df.drop(['amenities_list', 'amenities_standardized'], axis=1)

# Zapis
df_final.to_csv('listings_with_amenities.csv', index=False)

print(f"\n{'='*60}")
print(f"✓ Dane zapisane do: listings_with_amenities.csv")
print(f"✓ Liczba rekordów: {len(df_final)}")
print(f"✓ Liczba kolumn: {len(df_final.columns)} (było: {len(df.columns)})")
print(f"{'='*60}")

# Przykładowe wyniki
print(f"\nPrzykładowe wartości dla pierwszego rekordu:")
example_cols = ['id', 'amenities_count', 'has_wifi', 'has_kitchen',
                'amenities_connectivity_count', 'luxury_amenities_count']
print(df_final[example_cols].head(1).T)


✓ Dane zapisane do: listings_with_amenities.csv
✓ Liczba rekordów: 1464
✓ Liczba kolumn: 152 (było: 154)

Przykładowe wartości dla pierwszego rekordu:
                                     0
id                            30419466
amenities_count                     59
has_wifi                             1
has_kitchen                          1
amenities_connectivity_count         2
luxury_amenities_count               1
