In [1]:
import pandas as pd 
import numpy as np 
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [2]:
df = pd.read_csv('Trimmed Data.csv')

In [3]:
df.dtypes

Unnamed: 0                          int64
id                                  int64
name                               object
host_id                             int64
host_name                          object
neighbourhood_group                object
neighbourhood                      object
latitude                          float64
longitude                         float64
room_type                          object
price                               int64
minimum_nights                      int64
number_of_reviews                   int64
last_review                        object
reviews_per_month                 float64
calculated_host_listings_count      int64
availability_365                    int64
city                               object
dtype: object

In [4]:
dummy = pd.get_dummies(df, columns=['city', 'room_type'], drop_first=True, prefix='DM')
print(dummy.columns)

Index(['Unnamed: 0', 'id', 'name', 'host_id', 'host_name',
       'neighbourhood_group', 'neighbourhood', 'latitude', 'longitude',
       'price', 'minimum_nights', 'number_of_reviews', 'last_review',
       'reviews_per_month', 'calculated_host_listings_count',
       'availability_365', 'DM_Los Angeles', 'DM_New York City',
       'DM_Rhode Island', 'DM_Seattle', 'DM_Hotel room', 'DM_Private room',
       'DM_Shared room'],
      dtype='object')


In [5]:
dummy.head()

Unnamed: 0.1,Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,price,...,reviews_per_month,calculated_host_listings_count,availability_365,DM_Los Angeles,DM_New York City,DM_Rhode Island,DM_Seattle,DM_Hotel room,DM_Private room,DM_Shared room
0,1,5269,Upcountry Hospitality in the 'Auwai Suite,7620,Lea & Pat,Hawaii,South Kohala,20.0274,-155.702,124,...,0.09,5,261,0,0,0,0,0,0,0
1,3,5389,Keauhou Villa,7878,Edward,Hawaii,North Kona,19.56413,-155.96347,239,...,0.24,3,287,0,0,0,0,0,0,0
2,6,7896,Beachfront Maui Sunset Condo,21844,Caroline,Maui,Kihei-Makena,20.7547,-156.45666,120,...,0.23,1,344,0,0,0,0,0,0,0
3,8,9877,Keolamauloa Homestead,33179,Kaye,Hawaii,Hamakua,20.02889,-155.40747,120,...,0.5,1,49,0,0,0,0,0,0,0
4,10,13523,"All Inclusive that has it all, Sleeps 8, Hot Tub",52931,Mark,Honolulu,Koolaupoko,21.3393,-157.70216,221,...,0.44,11,345,0,0,0,0,0,0,0


In [6]:
scaler = StandardScaler()

In [7]:
numeric_df = df.select_dtypes(include=['int64', 'float64'])

In [8]:
numeric_df.columns

Index(['Unnamed: 0', 'id', 'host_id', 'latitude', 'longitude', 'price',
       'minimum_nights', 'number_of_reviews', 'reviews_per_month',
       'calculated_host_listings_count', 'availability_365'],
      dtype='object')

In [9]:
numeric_df = numeric_df.drop(columns=['Unnamed: 0', 'id', 'host_id', 'latitude', 'longitude'], axis=1)

In [10]:
numeric_df.columns

Index(['price', 'minimum_nights', 'number_of_reviews', 'reviews_per_month',
       'calculated_host_listings_count', 'availability_365'],
      dtype='object')

In [11]:
d = dummy.iloc[:, 16:]
d.head()

Unnamed: 0,DM_Los Angeles,DM_New York City,DM_Rhode Island,DM_Seattle,DM_Hotel room,DM_Private room,DM_Shared room
0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0


In [12]:
y = numeric_df.iloc[:,0]
scaled_df = scaler.fit_transform(numeric_df.iloc[:,1:])

In [13]:
s = pd.DataFrame(scaled_df, dtype='object', columns=['minimum_nights', 'number_of_reviews', 'reviews_per_month',
       'calculated_host_listings_count', 'availability_365'])
u = pd.DataFrame(d, columns = ['DM_Los Angeles', 'DM_New York City',
       'DM_Rhode Island', 'DM_Seattle', 'DM_Hotel room', 'DM_Private room',
       'DM_Shared room'])
s.head()

Unnamed: 0,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365
0,1.80504,-0.405302,-0.877451,-0.009224,0.944019
1,-0.260837,0.021252,-0.660446,-0.204222,1.135798
2,-0.432994,0.362494,-0.674913,-0.39922,1.556235
3,-0.519072,0.362494,-0.284304,-0.39922,-0.619712
4,-0.174759,0.191873,-0.371106,0.57577,1.563611


In [14]:
u.head()

Unnamed: 0,DM_Los Angeles,DM_New York City,DM_Rhode Island,DM_Seattle,DM_Hotel room,DM_Private room,DM_Shared room
0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0


In [15]:
u.shape

(62808, 7)

In [16]:
s.shape

(62808, 5)

In [17]:
X = [s, u]
X = pd.concat(X)
X.head()

Unnamed: 0,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365,DM_Los Angeles,DM_New York City,DM_Rhode Island,DM_Seattle,DM_Hotel room,DM_Private room,DM_Shared room
0,1.80504,-0.405302,-0.877451,-0.009224,0.944019,,,,,,,
1,-0.260837,0.021252,-0.660446,-0.204222,1.135798,,,,,,,
2,-0.432994,0.362494,-0.674913,-0.39922,1.556235,,,,,,,
3,-0.519072,0.362494,-0.284304,-0.39922,-0.619712,,,,,,,
4,-0.174759,0.191873,-0.371106,0.57577,1.563611,,,,,,,


In [18]:
X.shape

(125616, 12)

In [19]:
X = X.replace(np.nan, 0)

In [20]:
X = X.iloc[:62808,:]
X

Unnamed: 0,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365,DM_Los Angeles,DM_New York City,DM_Rhode Island,DM_Seattle,DM_Hotel room,DM_Private room,DM_Shared room
0,1.805040,-0.405302,-0.877451,-0.009224,0.944019,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,-0.260837,0.021252,-0.660446,-0.204222,1.135798,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,-0.432994,0.362494,-0.674913,-0.399220,1.556235,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,-0.519072,0.362494,-0.284304,-0.399220,-0.619712,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,-0.174759,0.191873,-0.371106,0.575770,1.563611,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
62803,-0.519072,-0.789200,0.439047,-0.399220,-0.250907,0.0,0.0,0.0,0.0,0.0,0.0,0.0
62804,-0.691228,-0.746545,1.885747,-0.204222,1.504602,0.0,0.0,0.0,0.0,0.0,0.0,0.0
62805,-0.691228,-0.789200,0.439047,-0.204222,1.607867,0.0,0.0,0.0,0.0,0.0,0.0,0.0
62806,-0.691228,-0.746545,1.885747,-0.399220,1.452969,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [21]:
X_train, X_test, y_train, y_test= train_test_split(X, y, test_size=0.3, random_state=47)

In [22]:
X_train.shape, X_test.shape

((43965, 12), (18843, 12))

In [23]:
y_train.shape, y_test.shape

((43965,), (18843,))

In [24]:
X_train.dtypes

minimum_nights                    float64
number_of_reviews                 float64
reviews_per_month                 float64
calculated_host_listings_count    float64
availability_365                  float64
DM_Los Angeles                    float64
DM_New York City                  float64
DM_Rhode Island                   float64
DM_Seattle                        float64
DM_Hotel room                     float64
DM_Private room                   float64
DM_Shared room                    float64
dtype: object

In [25]:
X_test.dtypes

minimum_nights                    float64
number_of_reviews                 float64
reviews_per_month                 float64
calculated_host_listings_count    float64
availability_365                  float64
DM_Los Angeles                    float64
DM_New York City                  float64
DM_Rhode Island                   float64
DM_Seattle                        float64
DM_Hotel room                     float64
DM_Private room                   float64
DM_Shared room                    float64
dtype: object

In [26]:
y_train

2158     150
54126    215
54600    120
12314    150
4325     235
        ... 
23112     40
11528    120
47431    150
51078    100
38023     85
Name: price, Length: 43965, dtype: int64

In [28]:
numeric_df.to_csv('Numeric Data.csv')

In [29]:
X.to_csv('X Data.csv')