In [1]:
import pandas as pd 
import numpy as np 
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [2]:
df = pd.read_csv('Trimmed Data.csv')

In [3]:
df.dtypes

Unnamed: 0                          int64
id                                  int64
name                               object
host_id                             int64
host_name                          object
neighbourhood_group                object
neighbourhood                      object
latitude                          float64
longitude                         float64
room_type                          object
price                               int64
minimum_nights                      int64
number_of_reviews                   int64
last_review                        object
reviews_per_month                 float64
calculated_host_listings_count      int64
availability_365                    int64
city                               object
dtype: object

In [4]:
dummy = pd.get_dummies(df, columns=['city', 'room_type'], prefix='DM')
print(dummy.columns)

Index(['Unnamed: 0', 'id', 'name', 'host_id', 'host_name',
       'neighbourhood_group', 'neighbourhood', 'latitude', 'longitude',
       'price', 'minimum_nights', 'number_of_reviews', 'last_review',
       'reviews_per_month', 'calculated_host_listings_count',
       'availability_365', 'DM_Hawaii', 'DM_Los Angeles', 'DM_New York City',
       'DM_Rhode Island', 'DM_Seattle', 'DM_Entire home/apt', 'DM_Hotel room',
       'DM_Private room', 'DM_Shared room'],
      dtype='object')


In [5]:
dummy.head()

Unnamed: 0.1,Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,price,...,availability_365,DM_Hawaii,DM_Los Angeles,DM_New York City,DM_Rhode Island,DM_Seattle,DM_Entire home/apt,DM_Hotel room,DM_Private room,DM_Shared room
0,1,5269,Upcountry Hospitality in the 'Auwai Suite,7620,Lea & Pat,Hawaii,South Kohala,20.0274,-155.702,124,...,261,1,0,0,0,0,1,0,0,0
1,3,5389,Keauhou Villa,7878,Edward,Hawaii,North Kona,19.56413,-155.96347,239,...,287,1,0,0,0,0,1,0,0,0
2,6,7896,Beachfront Maui Sunset Condo,21844,Caroline,Maui,Kihei-Makena,20.7547,-156.45666,120,...,344,1,0,0,0,0,1,0,0,0
3,8,9877,Keolamauloa Homestead,33179,Kaye,Hawaii,Hamakua,20.02889,-155.40747,120,...,49,1,0,0,0,0,1,0,0,0
4,10,13523,"All Inclusive that has it all, Sleeps 8, Hot Tub",52931,Mark,Honolulu,Koolaupoko,21.3393,-157.70216,221,...,345,1,0,0,0,0,1,0,0,0


In [6]:
dummy.sample(5)

Unnamed: 0.1,Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,price,...,availability_365,DM_Hawaii,DM_Los Angeles,DM_New York City,DM_Rhode Island,DM_Seattle,DM_Entire home/apt,DM_Hotel room,DM_Private room,DM_Shared room
56285,75732,43547204,Chic Two Bedroom Soho Loft,25597316,Mike,Manhattan,SoHo,40.72056,-73.99956,225,...,351,0,0,1,0,0,1,0,0,0
31300,45740,4549555,"Studio Space, 10 minutes to Central Park and r...",6571805,Agata,Manhattan,Upper West Side,40.79407,-73.97679,45,...,364,0,0,1,0,0,0,0,0,1
45755,63142,26744721,"Gorgeous studio ""Hamptons style"" in West Village",8305252,Lauren,Manhattan,West Village,40.73611,-74.00922,150,...,0,0,0,1,0,0,1,0,0,0
12946,19963,6515714,Charming Studio w/lots of character,34070047,Levi,City of Los Angeles,Koreatown,34.0761,-118.29572,95,...,0,0,1,0,0,0,1,0,0,0
62080,84110,34841635,The Blue room in Seattle,55604923,Melinda,Lake City,Matthews Beach,47.71243,-122.28373,57,...,89,0,0,0,0,1,0,0,1,0


In [7]:
dummy.shape

(62808, 25)

In [8]:
scaler = StandardScaler()

In [9]:
numeric_df = df.select_dtypes(include=['int64', 'float64'])

In [10]:
numeric_df.columns

Index(['Unnamed: 0', 'id', 'host_id', 'latitude', 'longitude', 'price',
       'minimum_nights', 'number_of_reviews', 'reviews_per_month',
       'calculated_host_listings_count', 'availability_365'],
      dtype='object')

In [11]:
numeric_df = numeric_df.drop(columns=['Unnamed: 0', 'id', 'host_id', 'latitude', 'longitude'], axis=1)

In [12]:
numeric_df.columns

Index(['price', 'minimum_nights', 'number_of_reviews', 'reviews_per_month',
       'calculated_host_listings_count', 'availability_365'],
      dtype='object')

In [15]:
d = dummy.iloc[:, 16:]
d.head()

Unnamed: 0,DM_Hawaii,DM_Los Angeles,DM_New York City,DM_Rhode Island,DM_Seattle,DM_Entire home/apt,DM_Hotel room,DM_Private room,DM_Shared room
0,1,0,0,0,0,1,0,0,0
1,1,0,0,0,0,1,0,0,0
2,1,0,0,0,0,1,0,0,0
3,1,0,0,0,0,1,0,0,0
4,1,0,0,0,0,1,0,0,0


In [16]:
y = numeric_df.iloc[:,0]
scaled_df = scaler.fit_transform(numeric_df.iloc[:,1:])

In [17]:
s = pd.DataFrame(scaled_df, dtype='object', columns=['minimum_nights', 'number_of_reviews', 'reviews_per_month',
       'calculated_host_listings_count', 'availability_365'])
u = pd.DataFrame(d, columns = ['DM_Hawaii', 'DM_Los Angeles', 'DM_New York City',
       'DM_Rhode Island', 'DM_Seattle', 'DM_Entire home/apt', 'DM_Hotel room', 'DM_Private room',
       'DM_Shared room'])
s.head()

Unnamed: 0,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365
0,1.80504,-0.405302,-0.877451,-0.009224,0.944019
1,-0.260837,0.021252,-0.660446,-0.204222,1.135798
2,-0.432994,0.362494,-0.674913,-0.39922,1.556235
3,-0.519072,0.362494,-0.284304,-0.39922,-0.619712
4,-0.174759,0.191873,-0.371106,0.57577,1.563611


In [18]:
u.head()

Unnamed: 0,DM_Hawaii,DM_Los Angeles,DM_New York City,DM_Rhode Island,DM_Seattle,DM_Entire home/apt,DM_Hotel room,DM_Private room,DM_Shared room
0,1,0,0,0,0,1,0,0,0
1,1,0,0,0,0,1,0,0,0
2,1,0,0,0,0,1,0,0,0
3,1,0,0,0,0,1,0,0,0
4,1,0,0,0,0,1,0,0,0


In [19]:
u.shape

(62808, 9)

In [20]:
s.shape

(62808, 5)

In [21]:
X = pd.merge(s, u, right_index = True, left_index = True)

In [22]:
X.head()

Unnamed: 0,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365,DM_Hawaii,DM_Los Angeles,DM_New York City,DM_Rhode Island,DM_Seattle,DM_Entire home/apt,DM_Hotel room,DM_Private room,DM_Shared room
0,1.80504,-0.405302,-0.877451,-0.009224,0.944019,1,0,0,0,0,1,0,0,0
1,-0.260837,0.021252,-0.660446,-0.204222,1.135798,1,0,0,0,0,1,0,0,0
2,-0.432994,0.362494,-0.674913,-0.39922,1.556235,1,0,0,0,0,1,0,0,0
3,-0.519072,0.362494,-0.284304,-0.39922,-0.619712,1,0,0,0,0,1,0,0,0
4,-0.174759,0.191873,-0.371106,0.57577,1.563611,1,0,0,0,0,1,0,0,0


In [23]:
X.shape

(62808, 14)

In [24]:
X.isnull().sum()

minimum_nights                    0
number_of_reviews                 0
reviews_per_month                 0
calculated_host_listings_count    0
availability_365                  0
DM_Hawaii                         0
DM_Los Angeles                    0
DM_New York City                  0
DM_Rhode Island                   0
DM_Seattle                        0
DM_Entire home/apt                0
DM_Hotel room                     0
DM_Private room                   0
DM_Shared room                    0
dtype: int64

In [25]:
X.columns

Index(['minimum_nights', 'number_of_reviews', 'reviews_per_month',
       'calculated_host_listings_count', 'availability_365', 'DM_Hawaii',
       'DM_Los Angeles', 'DM_New York City', 'DM_Rhode Island', 'DM_Seattle',
       'DM_Entire home/apt', 'DM_Hotel room', 'DM_Private room',
       'DM_Shared room'],
      dtype='object')

In [26]:
X['DM_Los Angeles'].value_counts()

0    45776
1    17032
Name: DM_Los Angeles, dtype: int64

In [27]:
X_train, X_test, y_train, y_test= train_test_split(X, y, test_size=0.3, random_state=47)

In [28]:
X_train.shape, X_test.shape

((43965, 14), (18843, 14))

In [29]:
y_train.shape, y_test.shape

((43965,), (18843,))

In [30]:
X_train.dtypes

minimum_nights                    object
number_of_reviews                 object
reviews_per_month                 object
calculated_host_listings_count    object
availability_365                  object
DM_Hawaii                          uint8
DM_Los Angeles                     uint8
DM_New York City                   uint8
DM_Rhode Island                    uint8
DM_Seattle                         uint8
DM_Entire home/apt                 uint8
DM_Hotel room                      uint8
DM_Private room                    uint8
DM_Shared room                     uint8
dtype: object

In [31]:
X_test.dtypes

minimum_nights                    object
number_of_reviews                 object
reviews_per_month                 object
calculated_host_listings_count    object
availability_365                  object
DM_Hawaii                          uint8
DM_Los Angeles                     uint8
DM_New York City                   uint8
DM_Rhode Island                    uint8
DM_Seattle                         uint8
DM_Entire home/apt                 uint8
DM_Hotel room                      uint8
DM_Private room                    uint8
DM_Shared room                     uint8
dtype: object

In [32]:
y_train

2158     150
54126    215
54600    120
12314    150
4325     235
        ... 
23112     40
11528    120
47431    150
51078    100
38023     85
Name: price, Length: 43965, dtype: int64

In [33]:
numeric_df.to_csv('Numeric Data.csv')

In [34]:
X.to_csv('X Data.csv')