In [1]:
# Importing Libraries
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import os
from datetime import datetime
from sklearn.preprocessing import MultiLabelBinarizer
from collections import Counter
import math
import matplotlib.ticker as mtick
import warnings


In [2]:
# Notebook-wide display settings: lift pandas' truncation limits so that
# every row and every column is shown when a frame is displayed.
for option_name in ('display.max_rows', 'display.max_columns'):
    pd.set_option(option_name, None)

# Silence all warnings for the rest of the session
warnings.filterwarnings("ignore")

In [3]:
# Read the raw test-set features from disk (path is relative to the notebook)
raw_test_path = "../data/raw/airbnb_test_x.csv"
raw_test_x = pd.read_csv(raw_test_path)

In [4]:
# Sanity check: (rows, columns) of the freshly loaded test frame
raw_test_x.shape

(10000, 61)

In [5]:
# Columns removed from the raw test frame before any feature engineering.
drop_list = [
    "access", "host_about", "host_acceptance_rate", "interaction",
    "jurisdiction_names", "license", "monthly_price", "neighborhood_group",
    "neighborhood_overview", "notes", "security_deposit", "square_feet",
    "weekly_price", "name", "summary", "space",
    "description", "experiences_offered", "host_name", "host_location",
    "host_neighbourhood", "street", "neighborhood", "state",
    "zipcode", "market", "smart_location", "country_code",
    "house_rules", "transit", "country",
]

# `columns=` already selects the column axis, so no separate `axis` argument
# is required. A KeyError is raised if any listed column is absent.
raw_test_x = raw_test_x.drop(columns=drop_list)

In [6]:
# Split the column names into numeric and non-numeric groups.
# "Numeric" here means exactly float64 or int64; every other dtype
# (object, bool, datetime, ...) lands in cat_features.
numeric_dtypes = ('float64', 'int64')

num_features = [name for name, dtype in raw_test_x.dtypes.items() if dtype in numeric_dtypes]
numeric_names = set(num_features)
cat_features = [name for name in raw_test_x.columns if name not in numeric_names]

In [7]:
# 1. Date features - host_since and first_review
#
# Both columns are parsed as dates (values that cannot be parsed become NaT)
# and replaced by the number of days elapsed up to a reference date.
# Rows with NaT end up as NaN in the derived columns.

# Reference date for the elapsed-day counts.
#   None         -> the date on which the notebook is executed (previous behaviour)
#   "YYYY-MM-DD" -> a fixed date, giving identical output on every run
# NOTE(review): with None the derived values grow by one for every day that
# passes between runs; presumably this should be pinned to the date used when
# the training features were built - confirm.
REFERENCE_DATE = None

if REFERENCE_DATE is None:
    today = pd.to_datetime(datetime.today().date())
else:
    # normalize() drops any time-of-day part so the counts are whole days
    today = pd.Timestamp(REFERENCE_DATE).normalize()

# source date column -> derived "days since" column
date_feature_names = {
    'host_since': 'Days_since_host_joined',
    'first_review': 'Days_since_first_review',
}

# Convert date strings to datetime objects
for date_col in date_feature_names:
    raw_test_x[date_col] = pd.to_datetime(raw_test_x[date_col], errors='coerce')

# Create the new numerical features
for date_col, days_col in date_feature_names.items():
    raw_test_x[days_col] = (today - raw_test_x[date_col]).dt.days

# Drop the original date columns
raw_test_x = raw_test_x.drop(columns=list(date_feature_names))

In [8]:
# 2. amenities
# Turn the comma-separated amenities string into 0/1 indicator columns for
# the 20 amenities that occur most often in this frame.
# NOTE(review): the top-20 vocabulary is computed from the test frame itself;
# it presumably has to coincide with the amenity columns of the training
# data - confirm.

# Tokenise each row: missing values give an empty list, blank tokens are dropped
amenity_lists = raw_test_x['amenities'].fillna('').map(
    lambda raw: [token.strip() for token in raw.split(',') if token.strip()]
)

# Flatten all rows into one list and keep the 20 most frequent amenities
all_amenities = []
for row_tokens in amenity_lists:
    all_amenities.extend(row_tokens)
top_20_amenities = {name for name, _count in Counter(all_amenities).most_common(20)}

# Restrict every row to the retained amenities
amenities_kept = amenity_lists.map(
    lambda tokens: [token for token in tokens if token in top_20_amenities]
)

# One indicator column per retained amenity, prefixed with "amenity_"
mlb = MultiLabelBinarizer()
amenity_matrix = mlb.fit_transform(amenities_kept)
amenities_encoded = pd.DataFrame(
    amenity_matrix,
    index=raw_test_x.index,
    columns=[f'amenity_{a}' for a in mlb.classes_],
)

# Swap the raw string column for the indicator columns
raw_test_x = pd.concat([raw_test_x.drop(columns=['amenities']), amenities_encoded], axis=1)

In [9]:
# 3. host_verifications
# Same treatment as the amenities column: split, keep the most frequent
# values, one-hot encode them with a "verif_" prefix.

def _split_verifications(raw):
    """Split a comma-separated string into stripped, non-empty tokens."""
    stripped = (piece.strip() for piece in raw.split(','))
    return [piece for piece in stripped if piece]

# Missing values become an empty list
verification_lists = raw_test_x['host_verifications'].fillna('').apply(_split_verifications)

# Frequency of every verification method across all rows
all_verifications = []
for row_tokens in verification_lists:
    all_verifications.extend(row_tokens)
top_n = 10
top_verifications = Counter(all_verifications).most_common(top_n)

# Names only, as a set for fast membership tests
top_verif_set = {method for method, _count in top_verifications}

# Drop everything outside the top N from each row
verifications_kept = verification_lists.apply(
    lambda tokens: [token for token in tokens if token in top_verif_set]
)

# One-hot encode
mlb = MultiLabelBinarizer()
verif_matrix = mlb.fit_transform(verifications_kept)
verif_encoded = pd.DataFrame(
    verif_matrix,
    index=raw_test_x.index,
    columns=[f'verif_{v}' for v in mlb.classes_],
)

# Replace the raw string column with the indicator columns
raw_test_x = pd.concat(
    [raw_test_x.drop(columns=['host_verifications']), verif_encoded],
    axis=1,
)

In [10]:
# 4. features
# Split the comma-separated "features" string, keep the 8 most frequent
# values and encode them as 0/1 columns prefixed with "feature_".

# One token list per row (empty list for missing values)
feature_lists = pd.Series(
    [
        [part.strip() for part in raw.split(',') if part.strip()]
        for raw in raw_test_x['features'].fillna('')
    ],
    index=raw_test_x.index,
    dtype=object,
)

# Flatten and count
all_features = [token for tokens in feature_lists for token in tokens]
top_n = 8
top_features = Counter(all_features).most_common(top_n)

# Top N as a set
top_feature_set = {label for label, _count in top_features}

# Keep only the top N values in every row
features_kept = [
    [token for token in tokens if token in top_feature_set]
    for tokens in feature_lists
]

# One-hot encode
mlb = MultiLabelBinarizer()
features_encoded = pd.DataFrame(
    mlb.fit_transform(features_kept),
    index=raw_test_x.index,
    columns=[f'feature_{f}' for f in mlb.classes_],
)

# Replace the raw string column with the indicator columns
raw_test_x = pd.concat(
    [raw_test_x.drop(columns=['features']), features_encoded],
    axis=1,
)

In [11]:
# Sanity check: (rows, columns) after the date and multi-label encodings above
raw_test_x.shape

(10000, 65)

In [12]:
# Continue on an independent deep copy so the frame built so far stays untouched
raw_test_x_2 = raw_test_x.copy(deep=True)

In [13]:
# Normalise column names: spaces become underscores, everything lower-case
raw_test_x_2.columns = [
    column_name.replace(' ', '_').lower() for column_name in raw_test_x_2.columns
]

In [14]:
# Recompute the numeric / non-numeric column split on the working copy.
# Only float64 and int64 count as numeric; all other dtypes go to cat_features.
numeric_dtypes = ('float64', 'int64')

num_features = [name for name, dtype in raw_test_x_2.dtypes.items() if dtype in numeric_dtypes]
numeric_names = set(num_features)
cat_features = [name for name in raw_test_x_2.columns if name not in numeric_names]

In [15]:
# 5. host_response_time
# Ordinal scale: the faster the response, the higher the code.
# Values that are missing or not listed here become NaN.
response_time_mapping = {
    "within an hour": 3,
    "within a few hours": 2,
    "within a day": 1,
    "a few days or more": 0
}

# pop() removes the text column from the frame and hands it over for mapping
raw_test_x_2['host_response_time_encoded'] = (
    raw_test_x_2.pop('host_response_time').map(response_time_mapping)
)

In [16]:
# 6. bed_type
# Binary flag: 1 for 'Real Bed', 0 for anything else (including missing values).
# pop() removes the original text column while returning it for the comparison.
raw_test_x_2['bed_type_encoded'] = (
    raw_test_x_2.pop('bed_type').eq('Real Bed').astype('int64')
)

In [17]:
# 7. property_type
# Integer code per listed property type; any type that is not listed
# (missing values included) is treated as 'Other'.
property_type_mapping = {
    'Other': 0,
    'Condominium': 1,
    'Townhouse': 2,
    'Loft': 3,
    'House': 4,
    'Apartment': 5
}

# Unlisted values come out of map() as NaN and are then given the 'Other' code;
# pop() drops the original text column in the same step.
raw_test_x_2['property_type_encoded'] = (
    raw_test_x_2.pop('property_type')
    .map(property_type_mapping)
    .fillna(property_type_mapping['Other'])
    .astype('int64')
)

In [18]:
# 8. room_type
# One-hot encode as integer 0/1 columns, leaving out the first category.
# NOTE(review): the omitted category is the first one present in this frame;
# presumably it must be the same one omitted in the training data - confirm.
room_type_dummies = pd.get_dummies(
    raw_test_x_2['room_type'],
    prefix='room_type',
    drop_first=True,
    dtype=int,
)

# Replace the original column with its indicator columns (appended at the end)
raw_test_x_2 = pd.concat(
    [raw_test_x_2.drop(columns=['room_type']), room_type_dummies],
    axis=1,
)

In [19]:
# Re-normalise column names (the room_type indicator columns may contain
# spaces and capitals): spaces -> underscores, then lower-case.
raw_test_x_2 = raw_test_x_2.rename(
    columns=lambda column_name: column_name.replace(' ', '_').lower()
)

In [20]:

# 9. cancellation_policy
# Ordinal code per policy; the rare policies share the code of 'other'.
# Policies that appear in neither list (and missing values) become NaN.

# Policies that are folded into 'other'
rare_policies = ['no_refunds', 'super_strict_30', 'super_strict_60']

# Manual ordinal encoding
cancellation_policy_mapping = {
    'other': 0,
    'flexible': 1,
    'moderate': 2,
    'strict': 3
}

# Lookup table that sends each rare policy straight to the 'other' code
policy_lookup = dict(cancellation_policy_mapping)
for rare_policy in rare_policies:
    policy_lookup[rare_policy] = cancellation_policy_mapping['other']

# pop() removes the original text column while returning it for mapping
raw_test_x_2['cancellation_policy_encoded'] = (
    raw_test_x_2.pop('cancellation_policy').map(policy_lookup)
)

In [21]:
# 10. city
# Group the city column into its 10 most frequent values plus 'other' and
# one-hot encode the result as integer 0/1 columns prefixed with "city_".

# Remove spelling noise (like "brooklyn", "Brooklyn", "BROOKLYN")
city_clean = raw_test_x_2['city'].str.strip().str.lower()

# Top 10 cities by frequency in this frame
# NOTE(review): this ranking comes from the test frame; cities that are top-10
# in the training data but not here are grouped under 'other' - confirm that
# this is acceptable.
top_10_cities = city_clean.value_counts().head(10).index.tolist()

# Everything outside the top 10 (missing values included) becomes 'other'
city_grouped = city_clean.where(city_clean.isin(top_10_cities), 'other')

# Create an indicator for EVERY group; do not use drop_first here.
# Which city acts as the omitted baseline is decided by the training columns:
# the alignment cell that follows removes any city_ column the training data
# does not have. With drop_first=True the alphabetically first city of *this*
# frame was removed instead, and if the training data had a column for that
# city the alignment cell re-created it filled with zeros, silently losing
# the information for those rows.
city_dummies = pd.get_dummies(city_grouped, prefix='city').astype(int)

# Replace the original column with its indicator columns (appended at the end)
raw_test_x_2 = pd.concat([raw_test_x_2.drop(columns=['city']), city_dummies], axis=1)

In [22]:
# Align the test frame with the schema of the processed training data.

# Processed training frame; only its column names are used below
train_df = pd.read_csv("../data/processed/train_exp_ready_no_feature_scaling.csv")

# City indicator columns known to the training data
city_columns = [col for col in train_df.columns if col.startswith('city_')]

# City columns the test frame lacks are added in one go, filled with 0
missing_city_cols = [col for col in city_columns if col not in raw_test_x_2.columns]
raw_test_x_2 = raw_test_x_2.assign(**dict.fromkeys(missing_city_cols, 0))

# City columns unknown to the training data are removed in one go
extra_city_cols = [
    col for col in raw_test_x_2.columns
    if col.startswith('city_') and col not in city_columns
]
raw_test_x_2 = raw_test_x_2.drop(columns=extra_city_cols)

# Same columns, same order as the training frame minus the target column.
# Raises KeyError if any training column is still absent from the test frame.
columns_order = train_df.columns[train_df.columns != 'high_booking_rate'].tolist()
raw_test_x_2 = raw_test_x_2.loc[:, columns_order]

In [23]:
# Freeze the aligned frame under its final name (independent deep copy)
processed_test_x = raw_test_x_2.copy(deep=True)

In [24]:
# Write the processed test features to CSV (row index is not written)
processed_test_path = '../data/processed/processed_test_x.csv'
processed_test_x.to_csv(processed_test_path, index=False)