## Get to know listings data

In [1]:
import pandas as pd

In [None]:
columns_to_drop = [
    "listing_url", "scrape_id", "last_scraped", "source", "description","neighborhood_overview", "picture_url",
    "host_url", "host_name", "host_about", "host_thumbnail_url",
    "host_picture_url", "host_neighbourhood", "host_listings_count", "host_verifications", "host_has_profile_pic",
    "neighbourhood_group_cleansed", "bathrooms_text", "amenities", "minimum_minimum_nights",
    "maximum_minimum_nights", "minimum_maximum_nights", "maximum_maximum_nights", "minimum_nights_avg_ntm",
    "maximum_nights_avg_ntm","calendar_last_scraped", "number_of_reviews_l30d",
    "calculated_host_listings_count", "calculated_host_listings_count_entire_homes",
    "calculated_host_listings_count_private_rooms", "calculated_host_listings_count_shared_rooms", "neighbourhood",
]

In [3]:
listings = pd.read_csv("../data/raw/listings.csv.gz").drop(columns=columns_to_drop)

In [20]:
import os
#os.getcwd()
os.chdir("..")

In [4]:
listings.shape

(4932, 48)

In [5]:
listings.columns

Index(['id', 'name', 'host_id', 'host_since', 'host_location',
       'host_response_time', 'host_response_rate', 'host_acceptance_rate',
       'host_is_superhost', 'host_total_listings_count',
       'host_identity_verified', 'neighbourhood', 'neighbourhood_cleansed',
       'latitude', 'longitude', 'property_type', 'room_type', 'accommodates',
       'bathrooms', 'bedrooms', 'beds', 'price', 'minimum_nights',
       'maximum_nights', 'calendar_updated', 'has_availability',
       'availability_30', 'availability_60', 'availability_90',
       'availability_365', 'number_of_reviews', 'number_of_reviews_ltm',
       'availability_eoy', 'number_of_reviews_ly', 'estimated_occupancy_l365d',
       'estimated_revenue_l365d', 'first_review', 'last_review',
       'review_scores_rating', 'review_scores_accuracy',
       'review_scores_cleanliness', 'review_scores_checkin',
       'review_scores_communication', 'review_scores_location',
       'review_scores_value', 'license', 'instant_booka

In [7]:
listings.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4932 entries, 0 to 4931
Data columns (total 42 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   id                           4932 non-null   int64  
 1   name                         4932 non-null   object 
 2   host_id                      4932 non-null   int64  
 3   host_since                   4682 non-null   object 
 4   host_location                3267 non-null   object 
 5   host_acceptance_rate         4378 non-null   object 
 6   host_is_superhost            4560 non-null   object 
 7   host_total_listings_count    4682 non-null   float64
 8   host_identity_verified       4682 non-null   object 
 9   neighbourhood                1821 non-null   object 
 10  neighbourhood_cleansed       4932 non-null   object 
 11  latitude                     4932 non-null   float64
 12  longitude                    4932 non-null   float64
 13  property_type     

In [8]:
listings["host_acceptance_rate"] = listings["host_acceptance_rate"].str.rstrip('%').astype(float) / 100.0
listings["host_is_superhost"] = listings["host_is_superhost"].map({'t': True, 'f': False})
listings["host_identity_verified"] = listings["host_identity_verified"].map({'t': True, 'f': False})
listings["price"] = listings["price"].str.lstrip('$').str.replace(',', '').astype(float)
listings["has_availability"] = listings["has_availability"].map({'t': True, 'f': False})
listings["instant_bookable"] = listings["instant_bookable"].map({'t': True, 'f': False})

In [9]:
type_dict = {
    "id": "object",
    "host_id": "object",
    "host_since": "datetime64[ns]",
    "host_total_listings_count": "Int16",
    "neighbourhood": "category",
    "neighbourhood_cleansed": "category",
    "property_type": "category",
    "room_type": "category",
    "accommodates": "Int16",
    "bedrooms": "Int16",
    "beds": "Int16",
    "minimum_nights": "Int16",
    "maximum_nights": "Int16",
}

In [10]:
listings = listings.astype(type_dict)

In [11]:
listings.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4932 entries, 0 to 4931
Data columns (total 42 columns):
 #   Column                       Non-Null Count  Dtype         
---  ------                       --------------  -----         
 0   id                           4932 non-null   object        
 1   name                         4932 non-null   object        
 2   host_id                      4932 non-null   object        
 3   host_since                   4682 non-null   datetime64[ns]
 4   host_location                3267 non-null   object        
 5   host_acceptance_rate         4378 non-null   float64       
 6   host_is_superhost            4560 non-null   object        
 7   host_total_listings_count    4682 non-null   Int16         
 8   host_identity_verified       4682 non-null   object        
 9   neighbourhood                1821 non-null   category      
 10  neighbourhood_cleansed       4932 non-null   category      
 11  latitude                     4932 non-null 

In [6]:
listings.loc[listings["host_since"] > "2025-06-23"].shape[0]

0

In [12]:
listings.to_parquet("../data/processed/listings_cleaned.parquet", engine="pyarrow", index=False)

In [4]:
import os

os.chdir("..")
#os.getcwd()

In [8]:
listings.sample().T

Unnamed: 0,3343
id,1163771498162327165
name,Nice & Quiet Toumba Area / 4P
host_id,579415261
host_since,2024-05-24
host_location,
host_response_time,within a day
host_response_rate,100%
host_acceptance_rate,100%
host_is_superhost,t
host_total_listings_count,1.0


In [9]:
listings["host_response_time"].unique()

array([nan, 'a few days or more', 'within an hour', 'within a day',
       'within a few hours'], dtype=object)

In [13]:
listings["property_type"].unique()

array(['Entire rental unit', 'Entire condo', 'Entire serviced apartment',
       'Private room in condo', 'Entire loft',
       'Private room in rental unit', 'Entire home', 'Tiny home',
       'Entire place', 'Entire townhouse', 'Entire villa', 'Dome',
       'Camper/RV', 'Room in boutique hotel',
       'Private room in bed and breakfast', 'Private room in home',
       'Entire guest suite', 'Entire guesthouse', 'Room in aparthotel',
       'Boat', 'Private room in guest suite', 'Entire vacation home',
       'Shared room in condo', 'Room in hotel', 'Earthen home',
       'Private room in guesthouse', 'Private room in villa',
       'Private room in serviced apartment', 'Tent',
       'Shared room in hostel', 'Barn', 'Dammuso'], dtype=object)

In [15]:
listings["room_type"].unique()

array(['Entire home/apt', 'Private room', 'Shared room'], dtype=object)

<hr>

### Anonymize id

In [10]:
import hashlib
import pandas as pd

def anonymize_listing_id(series):
    """
    Anonymize listing IDs using deterministic hashing.
    
    Preserves: Uniqueness, consistency for joins
    Removes: Searchability, reverse-lookup capability
    
    Args:
        series (pd.Series): Original listing ID column
        
    Returns:
        pd.Series: Anonymized IDs in format 'PROP_XXXX'
        
    Example:
        12345678 -> PROP_A7B3
        98765432 -> PROP_C2F9
    """
    def hash_id(listing_id):
        if pd.isna(listing_id):
            return None
        # Create deterministic hash (same ID always gets same result)
        hash_obj = hashlib.md5(str(int(listing_id)).encode())
        # Take first 4 characters of hex digest for readability
        short_hash = hash_obj.hexdigest()[:4].upper()
        return f"PROP_{short_hash}"
    
    return series.apply(hash_id)


In [11]:
samp = listings["id"].sample(20)
samp

2754    1036173665804429226
259                20346631
1047               43624182
3387    1171657554360267464
1592     609031784891139796
523                28867054
2027     822392579788069436
2187     873760221403457515
1812     717185950314433874
1355               52663564
2365     923057846185650256
821                37224718
1912     760788421735036104
2830    1056625308216830035
1611     622452817517411408
210                18369648
4049    1300637998940058578
1810     714497434021397158
141                14333612
1928     772311892391467827
Name: id, dtype: int64

In [12]:
samp_anon = anonymize_listing_id(samp)
samp_anon

2754    PROP_96B7
259     PROP_AC7C
1047    PROP_041F
3387    PROP_3AB2
1592    PROP_F355
523     PROP_A18A
2027    PROP_E178
2187    PROP_7A58
1812    PROP_DCC5
1355    PROP_0D12
2365    PROP_3B39
821     PROP_73C8
1912    PROP_C710
2830    PROP_084F
1611    PROP_640C
210     PROP_3E07
4049    PROP_4531
1810    PROP_97A7
141     PROP_4879
1928    PROP_8C4A
Name: id, dtype: object

<hr>

### Anonymize listing names

In [26]:
def anonymize_listing_name(df, name_col='name', property_type_col='property_type', 
                           room_type_col='room_type'):
    """
    Generate anonymous but descriptive listing names.
    
    Preserves: Property character for analysis/storytelling
    Removes: Personal identifiers, searchable text
    
    Args:
        df (pd.DataFrame): Dataset with listing info
        name_col: Column name for listing names
        property_type_col: Column for property type (e.g., 'Apartment', 'House')
        room_type_col: Column for room type (e.g., 'Entire home/apt')
        
    Returns:
        pd.Series: Anonymized names in format 'Apartment - Entire Home #0042'
    """
    
    # Clean property type for missing values
    property_clean = df[property_type_col].fillna('Property')
    room_clean = df[room_type_col].fillna('Listing')
    
    # Create base descriptor combining property and room type
    base_descriptor = df[name_col].str[:5] + ' - ' + property_clean.str[:4] + ' - ' + room_clean.str[:2]
    
    # Add sequential number within each category for uniqueness
    sequential_num = df.groupby([property_type_col, room_type_col]).cumcount() + 1
    
    # Format with zero-padding for sorting
    anonymous_names = base_descriptor + ' #' + sequential_num.astype(str).str.zfill(4)
    
    return anonymous_names


In [17]:
samp1 = listings[['id', 'name', 'property_type', 'room_type']].sample(10)
samp1

Unnamed: 0,id,name,property_type,room_type
3429,1178869102910326803,"Stylish City Flat ‚Äì 300Mbps WiFi ,Walk Everywh...",Entire condo,Entire home/apt
650,32583262,Unique Loft- Thessaloniki Down Town,Entire rental unit,Entire home/apt
55,8214869,Downtown Central Cozy3BD - #8Design Flat,Entire rental unit,Entire home/apt
4447,1356454237139632068,Esther Suites | Family Loft Room,Entire loft,Entire home/apt
754,35365070,Happy Rabbit 2,Entire rental unit,Entire home/apt
3957,1288274440313123965,Hidden Boho Nest,Entire condo,Entire home/apt
506,28403705,"Quiet, delux flat with panoramic views",Entire condo,Entire home/apt
2895,1069560357276167097,"Pantheon 4.6 | Studio, Nilie Hospitality MGMT",Entire rental unit,Entire home/apt
3666,1236200262379307269,Iasonidou Resort,Entire rental unit,Entire home/apt
924,39678767,Thess unique Studio,Entire rental unit,Entire home/apt


In [21]:
samp1.name[:4] + ' - ' + samp1.property_type[:4] + ' - ' + samp1.room_type[:4]

3429    Stylish City Flat ‚Äì 300Mbps WiFi ,Walk Everywh...
650     Unique Loft- Thessaloniki Down Town - Entire r...
55      Downtown Central Cozy3BD - #8Design Flat - Ent...
4447    Esther Suites | Family Loft Room - Entire loft...
dtype: object

In [27]:
samp1_anon = anonymize_listing_name(samp1, name_col='name', property_type_col='property_type', room_type_col='room_type')
samp1_anon

3429    Styli - Enti - En #0001
650     Uniqu - Enti - En #0001
55      Downt - Enti - En #0002
4447    Esthe - Enti - En #0001
754     Happy - Enti - En #0003
3957    Hidde - Enti - En #0002
506     Quiet - Enti - En #0003
2895    Panth - Enti - En #0004
3666    Iason - Enti - En #0005
924     Thess - Enti - En #0006
dtype: object

<hr>

### Coordinates rounding

In [None]:
def anonymize_coordinates(df, lat_col='latitude', lon_col='longitude', precision=3):
    """
    Reduce coordinate precision to prevent property identification.
    
    Preserves: Neighborhood clustering, relative distances, geographic patterns
    Removes: Exact property location, building-level identification
    
    Args:
        df (pd.DataFrame): Dataset with coordinates
        lat_col: Column name for latitude
        lon_col: Column name for longitude
        precision: Decimal places to keep (default=3 for ~111m accuracy)
        
    Returns:
        tuple: (anonymized_lat, anonymized_lon) as pd.Series
        
    Precision levels:
        2 decimals = ~1.1 km
        3 decimals = ~111 m 
        4 decimals = ~11 m 
    """
    
    lat_anon = df[lat_col].round(precision)
    lon_anon = df[lon_col].round(precision)
    
    # Validation: Check for NaN values
    if lat_anon.isna().any() or lon_anon.isna().any():
        missing_count = lat_anon.isna().sum()
        print(f"‚ö†Ô∏è  Warning: {missing_count} listings have missing coordinates")
    
    # Validation: Check coordinate ranges for Thessaloniki
    # Thessaloniki: ~40.6¬∞N, ~22.9¬∞E
    lat_valid = (lat_anon >= 40.4) & (lat_anon <= 40.8)
    lon_valid = (lon_anon >= 22.7) & (lon_anon <= 23.1)
    
    invalid_count = (~(lat_valid & lon_valid) & lat_anon.notna()).sum()
    if invalid_count > 0:
        print(f"‚ö†Ô∏è  Warning: {invalid_count} listings have coordinates outside Thessaloniki bounds")
    
    return lat_anon, lon_anon


In [29]:
samp2 = listings[['id', 'latitude', 'longitude']].sample(10)
samp2

Unnamed: 0,id,latitude,longitude
4348,1341244681403059100,40.632081,22.951074
3552,1206404883939918068,40.62897,22.94873
4851,1434179355105100886,40.572815,22.969795
4775,1419714625167873476,40.63707,22.939011
4295,1332662858179247245,40.633354,22.944295
1521,575168826809441793,40.64166,22.93245
3205,1136954193165112592,40.606961,22.963729
4309,1334556410280136827,40.645236,22.919742
3443,1181892512492874814,40.642859,22.958602
2303,905080328404157392,40.636643,22.950991


In [34]:
samp2_anon = anonymize_coordinates(samp2, precision=4)
pd.DataFrame(samp2_anon).T

Unnamed: 0,latitude,longitude
4348,40.6321,22.9511
3552,40.629,22.9487
4851,40.5728,22.9698
4775,40.6371,22.939
4295,40.6334,22.9443
1521,40.6417,22.9324
3205,40.607,22.9637
4309,40.6452,22.9197
3443,40.6429,22.9586
2303,40.6366,22.951


<hr>

### Anonymize license

In [57]:
def anonymize_license(series):
    """
    Anonymize license numbers while preserving licensing status categories.
    
    Preserves: 
        - Licensed (numbered licenses -> hashed)
        - Exempt (remains 'exempt')
        - Unlicensed (null/empty -> remains null)
        - Duplicate licenses (same number -> same hash)
    
    Removes: Actual license numbers, registry searchability
    
    Args:
        series (pd.Series): Original license column
        
    Returns:
        pd.Series: Anonymized licenses with three categories:
            - 'LIC_XXXXXX' for numbered licenses
            - 'exempt' for exempt properties
            - NaN for unlicensed properties
            
    Example:
        '123456789'  -> 'LIC_A7B3C2'
        '123456789'  -> 'LIC_A7B3C2'  (duplicate preserved)
        '987654321'  -> 'LIC_F9E8D1'  (different hash)
        'exempt'     -> 'exempt'
        NaN          -> NaN
        ''           -> NaN
    """
    def hash_license(value):
        # Handle missing/empty values -> Unlicensed
        if pd.isna(value) or value == '':
            return None
        
        # Convert to string and strip whitespace
        value_str = str(value).strip().lower()
        
        # Preserve 'exempt' status as-is
        if value_str == 'exempt':
            return 'exempt'
        
        # For numbered licenses, create deterministic hash
        # MD5 ensures same license always gets same hash (duplicates preserved)
        hash_obj = hashlib.md5(str(value).encode())
        short_hash = hash_obj.hexdigest()[:6].upper()
        return f"LIC_{short_hash}"
    
    anonymized = series.apply(hash_license)
    #
    return anonymized


In [None]:
# add on "#" if necessary
    
    # Generate detailed status report
    licensed_count = anonymized.str.startswith('LIC_', na=False).sum()
    exempt_count = (anonymized == 'exempt').sum()
    unlicensed_count = anonymized.isna().sum()
    total = len(anonymized)
    
    print(f"üìä License Anonymization Summary:")
    print(f"   Licensed (numbered):  {licensed_count:4d} ({licensed_count/total*100:5.1f}%) -> Hashed")
    print(f"   Exempt:               {exempt_count:4d} ({exempt_count/total*100:5.1f}%) -> Preserved")
    print(f"   Unlicensed:           {unlicensed_count:4d} ({unlicensed_count/total*100:5.1f}%) -> Null")
    print(f"   {'‚îÄ'*60}")
    print(f"   Total:                {total:4d} (100.0%)")
    
    # Validate duplicate preservation
    original_unique = series.dropna().nunique()
    # Exclude 'exempt' and NaN from unique count for numbered licenses only
    numbered_licenses = anonymized[anonymized.str.startswith('LIC_', na=False)]
    anonymized_unique = numbered_licenses.nunique()
    
    print(f"\nüîç Duplicate Preservation Check:")
    print(f"   Original numbered licenses (unique): {original_unique}")
    print(f"   Anonymized numbered licenses (unique): {anonymized_unique}")
    if original_unique == anonymized_unique:
        print(f"   ‚úÖ All duplicates preserved correctly")
    else:
        print(f"   ‚ö†Ô∏è  Warning: Unique count mismatch - check for data issues")
    

In [58]:
samp3 = listings['license'].sample(500)
samp3

2481    00002062924
460     00000103287
3814    00002960054
2570        2374902
4397    00003198470
           ...     
965     00000924747
2764    00002376013
4699    00000907700
4391    00003185020
4834    00002472751
Name: license, Length: 500, dtype: object

In [59]:
samp3_anon = anonymize_license(samp3)
samp3_anon

2481    LIC_AF1142
460     LIC_730A7D
3814    LIC_AEBD7A
2570    LIC_26B323
4397    LIC_75D3FA
           ...    
965     LIC_DC09AA
2764    LIC_35BF10
4699    LIC_98DE81
4391    LIC_BC58C0
4834    LIC_3B8673
Name: license, Length: 500, dtype: object

In [60]:
samp3.duplicated().sum()

np.int64(17)

In [61]:
samp3_anon.duplicated().sum()

np.int64(17)

In [62]:
samp3.shape, samp3_anon.shape

((500,), (500,))

In [63]:
samp3.loc[samp3.duplicated()]

4867             1237062
1509         00002978637
3113             1307676
4326             1381448
4445    0933Œö274Œë0818300
3780              Exempt
3549              Exempt
836              1144498
205              1037314
22                   NaN
2610             1381448
2399              Exempt
35                   NaN
2596             1381448
883               Exempt
748          00000685900
1470         00002978637
Name: license, dtype: object

In [64]:
samp3_anon.loc[samp3.duplicated()]

4867    LIC_971125
1509    LIC_7C3F33
3113    LIC_092023
4326    LIC_43A271
4445    LIC_878475
3780        exempt
3549        exempt
836     LIC_BDC2D3
205     LIC_CE1F25
22            None
2610    LIC_43A271
2399        exempt
35            None
2596    LIC_43A271
883         exempt
748     LIC_EC5C2D
1470    LIC_7C3F33
Name: license, dtype: object

In [65]:
samp3.loc[samp3=="00002978637"]

1130    00002978637
1509    00002978637
1470    00002978637
Name: license, dtype: object

In [66]:
samp3_anon.loc[samp3_anon=="LIC_7C3F33"]

1130    LIC_7C3F33
1509    LIC_7C3F33
1470    LIC_7C3F33
Name: license, dtype: object

In [68]:
def validate_license_anonymization(df_original, df_anonymized):
    """
    Validate that license anonymization preserved all analytical properties.
    
    Checks:
        1. Exempt status preserved
        2. Unlicensed status preserved  
        3. Duplicates mapped consistently
        4. No original license numbers remain
    """
    
    print("üîç License Anonymization Validation\n")
    
    # Check 1: Exempt count matches
    orig_exempt = (df_original['license'].astype(str).str.lower() == 'exempt').sum()
    anon_exempt = (df_anonymized['license'] == 'exempt').sum()
    assert orig_exempt == anon_exempt, f"Exempt count mismatch! {orig_exempt} vs {anon_exempt}"
    print(f"‚úÖ Exempt properties preserved: {anon_exempt}")
    
    # Check 2: Unlicensed count matches
    orig_unlicensed = df_original['license'].isna().sum()
    anon_unlicensed = df_anonymized['license'].isna().sum()
    assert orig_unlicensed == anon_unlicensed, f"Unlicensed count mismatch!"
    print(f"‚úÖ Unlicensed properties preserved: {anon_unlicensed}")
    
    # Check 3: Duplicate preservation for numbered licenses
    # Get only numbered licenses (not exempt, not null)
    orig_numbered = df_original['license'][
        df_original['license'].notna() & 
        (df_original['license'].astype(str).str.lower() != 'exempt')
    ]
    anon_numbered = df_anonymized['license'][
        df_anonymized['license'].notna() & 
        (df_anonymized['license'] != 'exempt')
    ]
    
    # Check if duplicates are preserved
    orig_duplicates = orig_numbered[orig_numbered.duplicated(keep=False)]
    if len(orig_duplicates) > 0:
        print(f"‚úÖ Original dataset has {len(orig_duplicates)} duplicate numbered licenses")
        
        # Verify each original duplicate maps to same anonymized value
        for orig_lic in orig_duplicates.unique():
            orig_indices = df_original[df_original['license'] == orig_lic].index
            anon_values = df_anonymized.loc[orig_indices, 'license'].unique()
            assert len(anon_values) == 1, f"Duplicate license {orig_lic} mapped to multiple hashes!"
        print(f"‚úÖ All duplicate licenses map to consistent anonymized values")
    
    # Check 4: No original numbered licenses appear in anonymized data
    # (exempt is allowed to appear as-is)
    numbered_licenses_original = set(orig_numbered.unique())
    numbered_licenses_anon = set(anon_numbered.unique())
    
    overlap = numbered_licenses_original.intersection(numbered_licenses_anon)
    assert len(overlap) == 0, f"Original license numbers found in anonymized data: {overlap}"
    print(f"‚úÖ No original numbered licenses present in anonymized dataset")
    
    # Check 5: All anonymized numbered licenses follow format
    invalid_format = anon_numbered[~anon_numbered.str.match(r'^LIC_[A-F0-9]{6}$')]
    assert len(invalid_format) == 0, f"Invalid license format found: {invalid_format.unique()}"
    print(f"‚úÖ All anonymized licenses follow LIC_XXXXXX format")
    
    print(f"\n‚úÖ All license anonymization validation checks passed!")


# Usage:
validate_license_anonymization(pd.DataFrame(samp3), pd.DataFrame(samp3_anon))

üîç License Anonymization Validation

‚úÖ Exempt properties preserved: 5
‚úÖ Unlicensed properties preserved: 3
‚úÖ Original dataset has 19 duplicate numbered licenses
‚úÖ All duplicate licenses map to consistent anonymized values
‚úÖ No original numbered licenses present in anonymized dataset
‚úÖ All anonymized licenses follow LIC_XXXXXX format

‚úÖ All license anonymization validation checks passed!
