In [2]:
import pandas as pd 
import numpy as np 
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [3]:
# Load the cleaned Airbnb listings from a CSV given by relative path.
# NOTE(review): the dtypes listing below shows an 'Unnamed: 0' int64 column, which
# looks like a row index written out by an earlier to_csv call — confirm, and
# consider index_col=0 (check downstream cells that depend on the column set first).
df = pd.read_csv('AirbnbData_Cleaned.csv')

In [4]:
# Show each column's dtype to see which columns are numeric and which are object.
df.dtypes

Unnamed: 0                          int64
id                                  int64
name                               object
host_id                             int64
host_name                          object
neighbourhood_group                object
neighbourhood                      object
latitude                          float64
longitude                         float64
room_type                          object
price                               int64
minimum_nights                      int64
number_of_reviews                   int64
last_review                        object
reviews_per_month                 float64
calculated_host_listings_count      int64
availability_365                    int64
city                               object
dtype: object

In [5]:
# One-hot encode the 'city' and 'room_type' columns. drop_first=True omits the first
# level of each, and both sets of indicator columns share the 'DM' prefix.
# NOTE(review): none of the visible later cells read `dummy` — scaling and the
# train/test split are built from `df` — confirm whether that is intended.
dummy = pd.get_dummies(
    df,
    prefix='DM',
    columns=['city', 'room_type'],
    drop_first=True,
)
print(dummy.columns)

Index(['Unnamed: 0', 'id', 'name', 'host_id', 'host_name',
       'neighbourhood_group', 'neighbourhood', 'latitude', 'longitude',
       'price', 'minimum_nights', 'number_of_reviews', 'last_review',
       'reviews_per_month', 'calculated_host_listings_count',
       'availability_365', 'DM_Los Angeles', 'DM_New York City',
       'DM_Rhode Island', 'DM_Seattle', 'DM_Hotel room', 'DM_Private room',
       'DM_Shared room'],
      dtype='object')


In [6]:
# Create a StandardScaler with default arguments; it is fitted in a later cell.
scaler = StandardScaler()

In [7]:
# Keep only the int64/float64 columns of df for scaling.
# NOTE(review): per the column listing in the next cell, this keeps identifier-like
# columns ('Unnamed: 0', 'id', 'host_id') and the 'price' column that is later used
# as the target — confirm these should be standardized alongside the features.
numeric_df = df.select_dtypes(include=['int64', 'float64'])

In [8]:
# List the columns that passed the numeric dtype filter.
numeric_df.columns

Index(['Unnamed: 0', 'id', 'host_id', 'latitude', 'longitude', 'price',
       'minimum_nights', 'number_of_reviews', 'reviews_per_month',
       'calculated_host_listings_count', 'availability_365'],
      dtype='object')

In [9]:
# Fit the scaler on all rows of numeric_df and transform them in one call.
# NOTE(review): the fit happens before the train/test split performed in a later
# cell, so test rows contribute to the scaling statistics — consider fitting on
# the training rows only.
scaled_df = scaler.fit_transform(numeric_df)

In [10]:
# Re-wrap the array returned by scaler.fit_transform as a DataFrame. Column labels
# and row index are taken from numeric_df (the frame that was scaled) instead of a
# hand-typed list, so the labels cannot drift out of sync with the selected columns.
scaled_df = pd.DataFrame(scaled_df, columns=numeric_df.columns, index=numeric_df.index)

In [11]:
# Preview the first rows of the scaled frame.
scaled_df.head()

Unnamed: 0.1,Unnamed: 0,id,host_id,latitude,longitude,price,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365
0,-1.507086,-1.762305,-0.909332,-1.896695,-1.52159,-0.298959,-0.34303,0.070068,-0.478757,-0.281507,1.464175
1,-1.507063,-1.762289,-0.909328,-1.89837,-1.529925,-0.167746,0.847045,-0.449823,-0.722354,-0.198516,0.728599
2,-1.50704,-1.76228,-0.909326,-1.972065,-1.535455,-0.298959,-0.215522,2.117138,0.096401,-0.253843,0.594215
3,-1.507017,-1.76228,-0.909326,-1.955633,-1.538012,0.219167,-0.173019,-0.287357,-0.620855,-0.253843,0.912493
4,-1.506994,-1.76228,-0.909325,-1.669,-1.646581,-0.275408,-0.300527,1.710973,-0.086297,-0.30917,-0.296964


In [12]:
# Hold out 30% of the rows for testing, with a fixed seed so the split is repeatable.
# 'price' is the target; every other column of scaled_df is a feature.
# NOTE(review): the features therefore include 'Unnamed: 0', 'id' and 'host_id',
# and the target is the standardized price — confirm both are intended.
X_train, X_test, y_train, y_test = train_test_split(
    scaled_df.drop(columns='price'),
    scaled_df['price'],
    test_size=0.3,
    random_state=47,
)

In [13]:
# Check the row/column counts of the two feature splits.
X_train.shape, X_test.shape

((59600, 10), (25544, 10))

In [14]:
# Check the lengths of the two target splits.
y_train.shape, y_test.shape

((59600,), (25544,))

In [15]:
# Confirm the dtypes of the training feature columns.
X_train.dtypes

Unnamed: 0                        float64
id                                float64
host_id                           float64
latitude                          float64
longitude                         float64
minimum_nights                    float64
number_of_reviews                 float64
reviews_per_month                 float64
calculated_host_listings_count    float64
availability_365                  float64
dtype: object

In [16]:
# Confirm the dtypes of the test feature columns.
X_test.dtypes

Unnamed: 0                        float64
id                                float64
host_id                           float64
latitude                          float64
longitude                         float64
minimum_nights                    float64
number_of_reviews                 float64
reviews_per_month                 float64
calculated_host_listings_count    float64
availability_365                  float64
dtype: object

In [17]:
# Inspect the training target (the scaled 'price' values).
y_train

62356   -0.349426
33342   -0.477276
76208   -0.430173
71673   -0.406622
6267     0.239353
           ...   
80592   -0.359520
25267   -0.450360
23112   -0.372977
11528   -0.113914
51078    0.084588
Name: price, Length: 59600, dtype: float64

In [18]:
# Persist the standardized numeric frame. index=False keeps the row index out of
# the file; written by default, it comes back on the next read_csv as an extra
# unnamed column (the input frame's 'Unnamed: 0' column looks like exactly that —
# confirm).
scaled_df.to_csv('Scaled Cleaned Airbnb Data.csv', index=False)