- loading data
- handle missing or inconsistent values
- format amenities into one-hot encoded variables (1 if the amenity is available, 0 if not)
- keep only the 250 most frequent amenities (out of 3864 distinct amenities in this dataset)

In [43]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

In [44]:
# Load the raw listings table from the CSV file in the working directory
# and report its (rows, columns) shape.
data = pd.read_csv('listings.csv')
print(data.shape)

(24264, 79)


In [45]:
# Preview the first rows to inspect the available columns.
data.head()

Unnamed: 0,id,listing_url,scrape_id,last_scraped,source,name,description,neighborhood_overview,picture_url,host_id,...,review_scores_communication,review_scores_location,review_scores_value,license,instant_bookable,calculated_host_listings_count,calculated_host_listings_count_entire_homes,calculated_host_listings_count_private_rooms,calculated_host_listings_count_shared_rooms,reviews_per_month
0,6499,https://www.airbnb.com/rooms/6499,20250308054758,2025-03-16,city scrape,Belém 1 Bedroom Historical Apartment,"This apartment is all about Location, next to ...","To get to the city center, tram 15 stops in f...",https://a0.muscache.com/pictures/6422ee92-c84e...,14455,...,4.89,4.84,4.4,,t,1,1,0,0,0.64
1,25659,https://www.airbnb.com/rooms/25659,20250308054758,2025-03-09,city scrape,Heart of Alfama - Le cœur d'Alfama (3 people),Charming apartment in Lisbon's historic Alfama...,"The Alfama neighborhood, is the oldest and mos...",https://a0.muscache.com/pictures/miso/Hosting-...,107347,...,4.95,4.87,4.82,56539/AL.,t,1,1,0,0,1.62
2,29396,https://www.airbnb.com/rooms/29396,20250308054758,2025-03-11,city scrape,Alfama Hill - Boutique apartment,Feel at home in the historic centre of Lisbon.,Great neighborhood in the old part of town. Am...,https://a0.muscache.com/pictures/163913/7d622c...,126415,...,4.92,4.87,4.73,28737/AL,t,1,1,0,0,2.67
3,29720,https://www.airbnb.com/rooms/29720,20250308054758,2025-03-09,city scrape,TheHOUSE - Your luxury home,"A house at the top of an anonimous building, T...","Location in the city's Embassies area, near en...",https://a0.muscache.com/pictures/7c977dcc-57d0...,128075,...,4.97,4.87,4.7,55695/AL,f,1,1,0,0,0.82
4,29915,https://www.airbnb.com/rooms/29915,20250308054758,2025-03-16,city scrape,Modern and Spacious Apartment in Lisboa,Non-smoking modern and equipped apartment. Qui...,"Location:<br />Entrecampos, Avenida das Forças...",https://a0.muscache.com/pictures/9b572932-8d23...,128890,...,4.63,4.61,4.54,85851/AL.,f,1,1,0,0,0.34


In [46]:
from sklearn.model_selection import train_test_split

# Row labels 0..n-1, one per listing; used with .loc below.
idx = np.arange(len(data))
# Name of the column that holds the value to predict.
target_clm = 'price'

# Selecting columns through a boolean mask (instead of a single label)
# keeps both results two-dimensional: one row per listing, one column.
X = data.loc[idx, data.columns.isin(['amenities'])].to_numpy()
y = data.loc[idx, data.columns.isin([target_clm])].to_numpy()

In [47]:
# One-hot encode the amenities of every listing.
import ast

from sklearn.preprocessing import MultiLabelBinarizer


def _parse_amenities(raw):
    """Return the amenities of one listing as a list.

    Parameters
    ----------
    raw : str, list, tuple, set or missing value
        Either the textual form of a Python list literal (as stored in the
        CSV), an already parsed collection, or a missing value such as NaN.

    Returns
    -------
    list
        The parsed amenities; an empty list for missing or blank input.
    """
    if isinstance(raw, str):
        # Blank strings are not valid literals; treat them as "no amenities".
        if not raw.strip():
            return []
        return list(ast.literal_eval(raw))
    if isinstance(raw, (list, tuple, set)):
        # Already parsed, e.g. because this cell was executed before.
        return list(raw)
    # NaN / None: the listing has no amenities information.
    return []


# Convert the amenities column from string to list. The helper makes this
# step idempotent, so re-running the cell no longer raises on parsed values.
data['amenities'] = data['amenities'].apply(_parse_amenities)

print(type(data['amenities'].iloc[0]))

mlb = MultiLabelBinarizer()
one_hot = mlb.fit_transform(data['amenities'])

# One 0/1 column per distinct amenity. Reusing data's index keeps the rows
# aligned with the listings they were derived from.
X = pd.DataFrame(one_hot, columns=mlb.classes_, index=data.index)

print(X.head())
print(X['Dishes and silverware'])

<class 'list'>
    Balay stainless steel oven   Scandinavian White body soap  \
0                            0                              0   
1                            0                              0   
2                            0                              0   
3                            0                              0   
4                            0                              0   

    Scandinavian White shampoo   mitsubishi  refrigerator  \
0                            0                          0   
1                            0                          0   
2                            0                          0   
3                            0                          0   
4                            0                          0   

   ( First Day amenities provided) body soap  \
0                                          0   
1                                          0   
2                                          0   
3                                  

In [48]:
# Number of listings offering each amenity, most common first.
amenity_counts = X.sum(axis=0).sort_values(ascending=False)
print(amenity_counts)

# Names of the 250 most frequent amenities.
top_250_amenities = list(amenity_counts.index[:250])

# Restrict the feature matrix to those columns only.
X_reduced = X.loc[:, top_250_amenities]
X = X_reduced

Wifi                    22232
Kitchen                 22157
Hot water               19796
Hair dryer              19423
Essentials              19329
                        ...  
Fast wifi – 466 Mbps        1
Fast wifi – 469 Mbps        1
Fast wifi – 470 Mbps        1
Fast wifi – 471 Mbps        1
其他 induction stove          1
Length: 3864, dtype: int64


In [49]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

# Column to predict and the fraction of rows held out for testing.
target_clm = 'price'
test_size = 0.2

# A boolean column mask (rather than a single label) keeps y two-dimensional:
# one row per listing, one column.
y = data.loc[idx, data.columns.isin([target_clm])].to_numpy()

In [50]:
# Shape of the target array: the column was selected with a boolean mask,
# so it is 2-D with a single column.
y.shape

(24264, 1)

In [51]:
# Split features and target into train and test parts; test_size is the
# held-out fraction and the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=test_size,random_state=42)

In [52]:
# Shape of the held-out target split (rows, 1).
y_test.shape

(4853, 1)

In [53]:
# Peek at the first raw targets: prices are formatted as strings with a
# leading '$'. Slicing (instead of indexing with range(10)) avoids an
# IndexError when the test split has fewer than 10 rows.
for price_row in y_test[:10]:
    print(price_row)


['$127.00']
['$91.00']
['$260.00']
['$28.00']
['$52.00']
['$49.00']
['$75.00']
['$23.00']
['$51.00']
['$57.00']


In [54]:
# Sanity check: training features and training targets must have the same
# number of rows. (The former bare `type(y_test)` expression was dropped: its
# value was discarded because only a cell's last expression is displayed.)
print(X_train.shape[0], len(y_train))

19411 19411


In [55]:
def _clean_price_split(features, prices):
    """Convert raw price strings to floats and drop rows without a usable price.

    Parameters
    ----------
    features : pandas.DataFrame or numpy.ndarray
        Feature rows of one split.
    prices : array-like
        Raw prices of the same split, one value per feature row, typically
        strings such as '$127.00' or '$1,250.00'. Any shape that flattens to
        one value per row is accepted.

    Returns
    -------
    tuple
        ``(features_kept, prices_kept)``: the feature rows whose price could
        be parsed, and the matching prices as a 1-D float array.
    """
    flat_prices = np.asarray(prices, dtype=object).ravel()

    # Reuse the feature index so the boolean mask aligns by label with .loc.
    row_index = features.index if isinstance(features, pd.DataFrame) else None
    raw_series = pd.Series(flat_prices, index=row_index)

    # Strip the currency sign, thousands separators and whitespace. Removing
    # only '$' would turn '$1,250.00' into NaN and silently drop every
    # listing priced at 1,000 or more.
    stripped = raw_series.astype(str).str.replace(r'[$,\s]', '', regex=True)
    numeric = pd.to_numeric(stripped, errors='coerce')

    # Rows whose price is missing or unparseable are excluded.
    has_price = numeric.notna()
    prices_kept = numeric[has_price].to_numpy()

    if isinstance(features, pd.DataFrame):
        features_kept = features.loc[has_price]
    else:
        features_kept = features[has_price.to_numpy()]
    return features_kept, prices_kept


# Clean the training and the test split with the same rules.
X_train_final, y_train_final = _clean_price_split(X_train, y_train)
X_test_final, y_test_final = _clean_price_split(X_test, y_test)


In [56]:
import os

# Make sure the output directory 'data' exists before anything is written.
os.makedirs('data', exist_ok=True)

# Feature matrices are written without their index column.
X_train_final.to_csv("data/X_train_final.csv", index=False)
X_test_final.to_csv("data/X_test_final.csv", index=False)
X.to_csv("data/X.csv", index=False)

# The targets are plain arrays, so each is wrapped in a DataFrame to reuse
# the same CSV writer.
for target_values, target_path in (
    (y_train_final, "data/y_train_final.csv"),
    (y_test_final, "data/y_test_final.csv"),
):
    df = pd.DataFrame(target_values)
    df.to_csv(target_path, index=False)