In [1]:
import pandas as pd 
import numpy as np 
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [2]:
df = pd.read_csv('AirbnbData_Cleaned.csv')

In [3]:
df.dtypes

Unnamed: 0                          int64
id                                  int64
name                               object
host_id                             int64
host_name                          object
neighbourhood_group                object
neighbourhood                      object
latitude                          float64
longitude                         float64
room_type                          object
price                               int64
minimum_nights                      int64
number_of_reviews                   int64
last_review                        object
reviews_per_month                 float64
calculated_host_listings_count      int64
availability_365                    int64
city                               object
dtype: object

In [4]:
dummy = pd.get_dummies(df, columns=['city', 'room_type'], drop_first=True, prefix='DM')
print(dummy.columns)

Index(['Unnamed: 0', 'id', 'name', 'host_id', 'host_name',
       'neighbourhood_group', 'neighbourhood', 'latitude', 'longitude',
       'price', 'minimum_nights', 'number_of_reviews', 'last_review',
       'reviews_per_month', 'calculated_host_listings_count',
       'availability_365', 'DM_Los Angeles', 'DM_New York City',
       'DM_Rhode Island', 'DM_Seattle', 'DM_Hotel room', 'DM_Private room',
       'DM_Shared room'],
      dtype='object')


In [5]:
dummy.head()

Unnamed: 0.1,Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,price,...,reviews_per_month,calculated_host_listings_count,availability_365,DM_Los Angeles,DM_New York City,DM_Rhode Island,DM_Seattle,DM_Hotel room,DM_Private room,DM_Shared room
0,48150,5065,MAUKA BB,7257,Wayne,Hawaii,Hamakua,20.04095,-155.43251,85,...,0.45,2,365,0,0,0,0,0,0,0
1,48151,5269,Upcountry Hospitality in the 'Auwai Suite,7620,Lea & Pat,Hawaii,South Kohala,20.0274,-155.702,124,...,0.09,5,261,0,0,0,0,0,0,0
2,48152,5387,Hale Koa Studio & 1 Bedroom Units!!,7878,Edward,Hawaii,South Kona,19.43119,-155.88079,85,...,1.3,3,242,0,0,0,0,0,0,0
3,48153,5389,Keauhou Villa,7878,Edward,Hawaii,North Kona,19.56413,-155.96347,239,...,0.24,3,287,0,0,0,0,0,0,0
4,48154,5390,STAY AT PRINCE KUHIO!,7887,Todd,Kauai,Koloa-Poipu,21.88305,-159.47372,92,...,1.03,1,116,0,0,0,0,0,0,0


In [6]:
scaler = StandardScaler()

In [7]:
numeric_df = df.select_dtypes(include=['int64', 'float64'])

In [8]:
numeric_df.columns

Index(['Unnamed: 0', 'id', 'host_id', 'latitude', 'longitude', 'price',
       'minimum_nights', 'number_of_reviews', 'reviews_per_month',
       'calculated_host_listings_count', 'availability_365'],
      dtype='object')

In [9]:
numeric_df = numeric_df.drop(columns=['Unnamed: 0', 'id', 'host_id', 'latitude', 'longitude'], axis=1)

In [10]:
numeric_df.columns

Index(['price', 'minimum_nights', 'number_of_reviews', 'reviews_per_month',
       'calculated_host_listings_count', 'availability_365'],
      dtype='object')

In [11]:
d = dummy.iloc[:, 16:]
d.head()

Unnamed: 0,DM_Los Angeles,DM_New York City,DM_Rhode Island,DM_Seattle,DM_Hotel room,DM_Private room,DM_Shared room
0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0


In [12]:
y = numeric_df.iloc[:,0]
scaled_df = scaler.fit_transform(numeric_df.iloc[:,1:])

In [13]:
s = pd.DataFrame(scaled_df, dtype='object', columns=['minimum_nights', 'number_of_reviews', 'reviews_per_month',
       'calculated_host_listings_count', 'availability_365'])
u = pd.DataFrame(d, columns = ['DM_Los Angeles', 'DM_New York City',
       'DM_Rhode Island', 'DM_Seattle', 'DM_Hotel room', 'DM_Private room',
       'DM_Shared room'])
s.head()

Unnamed: 0,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365
0,-0.34303,0.070068,-0.478757,-0.281507,1.464175
1,0.847045,-0.449823,-0.722354,-0.198516,0.728599
2,-0.215522,2.117138,0.096401,-0.253843,0.594215
3,-0.173019,-0.287357,-0.620855,-0.253843,0.912493
4,-0.300527,1.710973,-0.086297,-0.30917,-0.296964


In [14]:
u.head()

Unnamed: 0,DM_Los Angeles,DM_New York City,DM_Rhode Island,DM_Seattle,DM_Hotel room,DM_Private room,DM_Shared room
0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0


In [15]:
u.shape

(85144, 7)

In [16]:
s.shape

(85144, 5)

In [17]:
X = [s, u]
X = pd.concat(X)
X.head()

Unnamed: 0,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365,DM_Los Angeles,DM_New York City,DM_Rhode Island,DM_Seattle,DM_Hotel room,DM_Private room,DM_Shared room
0,-0.34303,0.070068,-0.478757,-0.281507,1.464175,,,,,,,
1,0.847045,-0.449823,-0.722354,-0.198516,0.728599,,,,,,,
2,-0.215522,2.117138,0.096401,-0.253843,0.594215,,,,,,,
3,-0.173019,-0.287357,-0.620855,-0.253843,0.912493,,,,,,,
4,-0.300527,1.710973,-0.086297,-0.30917,-0.296964,,,,,,,


In [18]:
X.shape

(170288, 12)

In [19]:
X = X.replace(np.nan, 0)

In [20]:
X = X.iloc[:85144,:]
X

Unnamed: 0,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365,DM_Los Angeles,DM_New York City,DM_Rhode Island,DM_Seattle,DM_Hotel room,DM_Private room,DM_Shared room
0,-0.343030,0.070068,-0.478757,-0.281507,1.464175,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.847045,-0.449823,-0.722354,-0.198516,0.728599,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,-0.215522,2.117138,0.096401,-0.253843,0.594215,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,-0.173019,-0.287357,-0.620855,-0.253843,0.912493,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,-0.300527,1.710973,-0.086297,-0.309170,-0.296964,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
85139,-0.300527,-0.596043,-0.106596,-0.309170,-0.417203,0.0,0.0,0.0,0.0,0.0,0.0,0.0
85140,-0.385533,-0.579796,0.570060,-0.253843,1.266135,0.0,0.0,0.0,0.0,0.0,0.0,0.0
85141,-0.385533,-0.596043,-0.106596,-0.253843,1.365155,0.0,0.0,0.0,0.0,0.0,0.0,0.0
85142,-0.385533,-0.579796,0.570060,-0.309170,1.216625,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [21]:
X_train, X_test, y_train, y_test= train_test_split(X, y, test_size=0.3, random_state=47)

In [22]:
X_train.shape, X_test.shape

((59600, 12), (25544, 12))

In [23]:
y_train.shape, y_test.shape

((59600,), (25544,))

In [24]:
X_train.dtypes

minimum_nights                    float64
number_of_reviews                 float64
reviews_per_month                 float64
calculated_host_listings_count    float64
availability_365                  float64
DM_Los Angeles                    float64
DM_New York City                  float64
DM_Rhode Island                   float64
DM_Seattle                        float64
DM_Hotel room                     float64
DM_Private room                   float64
DM_Shared room                    float64
dtype: object

In [25]:
X_test.dtypes

minimum_nights                    float64
number_of_reviews                 float64
reviews_per_month                 float64
calculated_host_listings_count    float64
availability_365                  float64
DM_Los Angeles                    float64
DM_New York City                  float64
DM_Rhode Island                   float64
DM_Seattle                        float64
DM_Hotel room                     float64
DM_Private room                   float64
DM_Shared room                    float64
dtype: object

In [26]:
y_train

62356     70
33342     32
76208     46
71673     53
6267     245
        ... 
80592     67
25267     40
23112     63
11528    140
51078    199
Name: price, Length: 59600, dtype: int64

In [31]:
scaled_df.to_csv('Scaled Cleaned Airbnb Data.csv')

In [32]:
numeric_df.to_csv('Numeric Data.csv')

In [27]:
X.to_csv('X Data.csv')