### Imports

In [160]:
# Note to self -> Use numpy somewhere
import numpy as np
import pandas as pd
import matplotlib as plt
import seaborn as sb
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error, median_absolute_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression

In [161]:
# Importing data into pandas dataframes
listings = pd.read_csv('./listings_sea.csv')
calendar = pd.read_csv('./calendar_sea.csv')
reviews = pd.read_csv('./reviews_sea.csv')

**Data Preparation**

In [191]:
calendar.shape

(1393570, 4)

In [163]:
# Select the columns that are needed - Listings
listings_data = listings.loc[:,('id',
                                'review_scores_rating',
                                'square_feet',
                                'bathrooms',
                                'bedrooms',
                                'price',
                                'property_type',
                                'neighbourhood_group_cleansed')]

In [None]:
# Merge with calendar dataset because there's more data in it???

In [164]:
# Impute for null values
listings_data.at[listings_data['property_type'].isnull(), 'property_type'] = 'Other'
listings_data.at[listings_data['review_scores_rating'].isnull(), 'review_scores_rating'] = 0

In [165]:
# Fix Data Types
listings_data['price'] = listings_data['price'].str.replace("[$, ]", "").astype("float")

In [166]:
# Check dtypes
listings_data.dtypes

id                                int64
review_scores_rating            float64
square_feet                     float64
bathrooms                       float64
bedrooms                        float64
price                           float64
property_type                    object
neighbourhood_group_cleansed     object
dtype: object

In [167]:
listings_data_categorical = pd.get_dummies(listings_data, drop_first=True);

In [168]:
# Check dtypes
listings_data_categorical.dtypes

id                                                    int64
review_scores_rating                                float64
square_feet                                         float64
bathrooms                                           float64
bedrooms                                            float64
price                                               float64
property_type_Bed & Breakfast                         uint8
property_type_Boat                                    uint8
property_type_Bungalow                                uint8
property_type_Cabin                                   uint8
property_type_Camper/RV                               uint8
property_type_Chalet                                  uint8
property_type_Condominium                             uint8
property_type_Dorm                                    uint8
property_type_House                                   uint8
property_type_Loft                                    uint8
property_type_Other                     

In [177]:
# Check dataframe for null values
listings_data_categorical.isnull().sum()

id                                                     0
review_scores_rating                                   0
square_feet                                         3721
bathrooms                                             16
bedrooms                                               6
price                                                  0
property_type_Bed & Breakfast                          0
property_type_Boat                                     0
property_type_Bungalow                                 0
property_type_Cabin                                    0
property_type_Camper/RV                                0
property_type_Chalet                                   0
property_type_Condominium                              0
property_type_Dorm                                     0
property_type_House                                    0
property_type_Loft                                     0
property_type_Other                                    0
property_type_Tent             

In [178]:
listings_data_categorical_clean = listings_data_categorical.dropna(axis = 0)

**Start Model**

In [179]:
# Consider only numerical variables
feature_set = listings_data_categorical_clean.drop(['id','price'], axis=1)

X = feature_set
y = listings_data_categorical_clean['price']

In [180]:
X.shape[0]

94

In [181]:
y.shape[0]

94

In [182]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .30, random_state=42)

**Four steps:**

In [183]:
#Instantiate
lm_model = LinearRegression(normalize=True) 

In [184]:
#Fit
lm_model.fit(X_train, y_train) 

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=True)

In [185]:
#Predict
y_test_preds = lm_model.predict(X_test)
y_train_preds = lm_model.predict(X_train)

In [186]:
#Score
test_score = r2_score(y_test, y_test_preds)
train_score = r2_score(y_train, y_train_preds)

# return test_score, train_score, lm_model, X_train, X_test, y_train, y_test

In [187]:
#Print training and testing score
print("The rsquared on the training data was {}.  The rsquared on the test data was {}.".format(train_score, test_score))

The rsquared on the training data was 0.8071694180391565.  The rsquared on the test data was -2.2341135972167197e+28.


In [None]:
# Try RandomForestRegressor -> Found here: http://localhost:8888/notebooks/Documents/py_proj/udacity-ds-blog-post-sample-3/Seattle_airbnb.ipynb

In [None]:
# Do that correlation coeffecient thing -> Found here: http://localhost:8888/notebooks/Documents/py_proj/udacity-ds-blog-post-sample-3/Seattle_airbnb.ipynb