# Importing Relevant Python Libraries

In [1]:
# Data Analysis and Wrangling
import pandas as pd
import numpy as np
import random as rnd
import math

# Visualization
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# Machine Learning
from sklearn import linear_model
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split

# Extracting Data

In [2]:
# Read both listing tables from CSV files located in the working directory
list1, list2 = (pd.read_csv(csv_name) for csv_name in ('listings.csv', 'listings2.csv'))

# Data Size Metrics

In [3]:
# (rows, columns) of the first listings table.
# To list the column names instead: print(list1.columns.values)
print(list1.shape)

(3818, 16)


In [4]:
# (rows, columns) of the second listings table.
# To list the column names instead: print(list2.columns.values)
print(list2.shape)

(3818, 92)


In [65]:
# Build the feature frame X by placing the two listing tables side by side.
# NOTE(review): a column-wise concat pairs rows by index label, not by a listing
# key -- confirm both CSV files list the same records in the same order.
listing_frames = [list1, list2]
X = pd.concat(listing_frames, axis=1)
X.shape  # dimensions of the combined, unfiltered frame

(3818, 108)

# Feature Engineering

### Property Type

In [69]:
# Keep only the listings that have a property type.  The explicit .copy() makes X
# an independent frame, so the indicator columns assigned to it afterwards do not
# trigger pandas' SettingWithCopyWarning.
X = X[X['property_type'].notnull()].copy()
# Number of listings per property type
X['property_type'].value_counts()

House              1733
Apartment          1708
Townhouse           118
Condominium          91
Loft                 40
Bed & Breakfast      37
Other                22
Cabin                21
Bungalow             13
Camper/RV            13
Boat                  8
Tent                  5
Treehouse             3
Dorm                  2
Chalet                2
Yurt                  1
Name: property_type, dtype: int64

In [7]:
# Re-check the frame's dimensions after dropping rows with a missing property_type
X.shape

(3817, 108)

In [35]:
# Increasing the feature space: add one 0/1 indicator column per property type.
# unique() is evaluated once and each category is compared directly, rather than
# re-computing unique() and indexing it with a manual counter on every pass.
for item in X['property_type'].unique():
    # int64 keeps the new column eligible for dtype-based selection of integer columns
    X[item] = (X['property_type'] == item).astype(np.int64)
X.shape

(3817, 124)

### Bed Type

In [67]:
# Keep only the listings that have a bed type.  The explicit .copy() makes X an
# independent frame, so the indicator columns assigned to it afterwards do not
# trigger pandas' SettingWithCopyWarning.
X = X[X['bed_type'].notnull()].copy()
# Distinct bed types present in the data
X['bed_type'].unique()

array(['Real Bed', 'Futon', 'Pull-out Sofa', 'Airbed', 'Couch'], dtype=object)

In [68]:
# Increasing the feature space: add one 0/1 indicator column per bed type.
# unique() is evaluated once and each category is compared directly, rather than
# re-computing unique() and indexing it with a manual counter on every pass.
for item in X['bed_type'].unique():
    # int64 keeps the new column eligible for dtype-based selection of integer columns
    X[item] = (X['bed_type'] == item).astype(np.int64)
X.shape

(3818, 130)

### Amenities

In [38]:
# Gather the numeric columns of X: the float64 columns first, then the int64 ones
column_dtypes = X.dtypes
Z1 = X.loc[:, column_dtypes == np.float64]  # columns holding float64 values
Z2 = X.loc[:, column_dtypes == np.int64]    # columns holding int64 values
X_numeric = pd.concat([Z1, Z2], axis=1)

In [39]:
# X_numeric repeats some column names; keep a single column per name.
# np.unique yields the names in sorted order along with the position at which
# each name first appears, and those positions select the columns to keep.
unique_names, first_positions = np.unique(X_numeric.columns, return_index=True)
X_Num_Cov = X_numeric.iloc[:, first_positions]
# Save the de-duplicated numeric feature space next to the notebook
X_Num_Cov.to_csv('Numerical_FS.csv')
X_Num_Cov.shape

(3817, 52)

In [40]:
# Columns used for modelling: a fixed set of numeric fields followed by one
# indicator column per property type
base_columns = ['accommodates', 'bathrooms', 'bedrooms', 'beds', 'guests_included', 'price', 'reviews_per_month']
space = base_columns + X['property_type'].unique().tolist()

In [41]:
# Select certain numerical features and drop every row that has a missing value
X_select = X_Num_Cov[space].dropna()
# Output space: the target is taken out before it is removed from the inputs
Y = X_select['reviews_per_month']
X_select = X_select.drop(['reviews_per_month'], axis=1)
# info() writes its report itself and returns None, so it is called directly;
# wrapping it in print() would append a stray "None" to the output.
X_select.info()
X_select.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3173 entries, 0 to 3814
Data columns (total 22 columns):
accommodates       3173 non-null int64
bathrooms          3173 non-null float64
bedrooms           3173 non-null float64
beds               3173 non-null float64
guests_included    3173 non-null int64
price              3173 non-null int64
Apartment          3173 non-null int64
House              3173 non-null int64
Cabin              3173 non-null int64
Condominium        3173 non-null int64
Camper/RV          3173 non-null int64
Bungalow           3173 non-null int64
Townhouse          3173 non-null int64
Loft               3173 non-null int64
Boat               3173 non-null int64
Bed & Breakfast    3173 non-null int64
Other              3173 non-null int64
Dorm               3173 non-null int64
Treehouse          3173 non-null int64
Yurt               3173 non-null int64
Chalet             3173 non-null int64
Tent               3173 non-null int64
dtypes: float64(3), int64(19)

Unnamed: 0,accommodates,bathrooms,bedrooms,beds,guests_included,price,Apartment,House,Cabin,Condominium,...,Townhouse,Loft,Boat,Bed & Breakfast,Other,Dorm,Treehouse,Yurt,Chalet,Tent
0,4,1.0,1.0,1.0,2,85,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,4,1.0,1.0,1.0,1,150,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,11,4.5,5.0,7.0,10,975,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,6,2.0,3.0,3.0,6,450,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
5,2,1.0,1.0,1.0,1,120,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


# Data Visualization

# Preliminary Model Analysis

### Splitting data into training and testing sets

In [42]:
# Hold out a fraction of the rows for testing; the fixed seed makes the split repeatable
num_test = 0.20
split_parts = train_test_split(X_select, Y, test_size=num_test, random_state=100)
X_train, X_test, y_train, y_test = split_parts

### Ridge Regression

In [43]:
from sklearn.linear_model import Ridge

# Fit a ridge regressor on the training split; fit() returns the estimator itself,
# so the fitted model is bound to clf and then displayed.
clf = Ridge(alpha=1.0).fit(X_train, y_train)
clf

Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001)

In [44]:
# Predict the target for the held-out rows and show how many predictions were made
y_test_predict = clf.predict(X_test)
y_test_predict.size

635

In [45]:
# Predict the target for the training rows and show how many predictions were made
y_train_predict = clf.predict(X_train)
y_train_predict.size

2538

In [46]:
from math import sqrt
from sklearn.metrics import mean_squared_error

# Root-mean-squared error of the predictions on the test split
mse_test = mean_squared_error(y_test, y_test_predict)
rms_test = sqrt(mse_test)
rms_test

1.7569040285795503

In [47]:
# Root-mean-squared error of the predictions on the training split
mse_train = mean_squared_error(y_train, y_train_predict)
rms_train = math.sqrt(mse_train)
rms_train

1.738389226693472

In [48]:
# Goodness of fit on the test split; for scikit-learn regressors score() is documented as R^2
clf.score(X_test, y_test)

0.075958115429713202

In [49]:
# Goodness of fit on the training split; for scikit-learn regressors score() is documented as R^2
clf.score(X_train, y_train)

0.088700612419382874

### Support Vector Machine

In [71]:
from sklearn import svm
