<h1>Using Renthop data to find whether a rental listing is of High, Medium or low interest</h1>

In [1]:
#begin with standard imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

#import modeling algorithms
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.cross_validation import train_test_split , StratifiedKFold
from sklearn.feature_selection import RFECV
from sklearn.metrics import log_loss



In [25]:
train_df = pd.read_json('renthop/train.json/train.json', encoding='ascii')
test_df = pd.read_json('renthop/test.json/test.json', encoding = 'ascii')
full_df = train_df.append(test_df, ignore_index = True)
df = full_df[:49352]
print('The test dataset:', test_df.shape, 'The train dataset', train_df.shape, 'The whole dataset: ', full_df.shape)

('The test dataset:', (74659, 14), 'The train dataset', (49352, 15), 'The whole dataset: ', (124011, 15))


<h4>The test dataset has one less column because the train dataset includes the interest level</h4>

In [18]:
df.shape

(49352, 15)

<h3>Convert the 'created' column to date time for easier analysis</h3>

In [15]:
full_df['created']=pd.to_datetime(df['created'])
full_df['year']=full_df['created'].dt.year
full_df['month'] = full_df['created'].dt.month
full_df['day'] = full_df['created'].dt.day

<h3>Since our dataframe is full of categorical variables the easiest way to fit those into a model algorithm is to simply take the amount of those variables i.e. the more 'features' the better</h3> 

In [4]:
full_df['len_photos'] = full_df['photos'].apply(len)
full_df['len_features'] = full_df['features'].apply(len)
full_df['len_desc'] = df['description'].apply(lambda x: len(x.split(" ")))

In [6]:
cols_to_keep = ['bathrooms', 'bedrooms', 'latitude', 
                'longitude', 'price', 'len_photos', 
                'len_features', 'len_desc', 'year', 
                'month', 'day']
df = full_df[:49352]

In [7]:
X = df[cols_to_keep]
y = df['interest_level']
X_train, X_val, y_train, y_val = train_test_split(X,y, test_size=0.33)

In [31]:
model = RandomForestClassifier(n_estimators=1000)
model.fit(X_train, y_train)
y_val_pred = model.predict_proba(X_val)
log_loss(y_val, y_val_pred)

0.63200451267855529

In [32]:
model = GradientBoostingClassifier()
clf.model(X_train, y_train)
y_val_pred = clf.predict_proba(X_val)
log_loss(y_val, y_val_pred)

0.64302378233253077

In [33]:
model = GaussianNB()
model.fit(X_train, y_train)
y_val_pred = model.predict_proba(X_val)
log_loss(y_val, y_val_pred)

1.9071312968953897

In [34]:
model = LogisticRegression()
model.fit(X_train, y_train)
y_val_pred = model.predict_proba(X_val)
log_loss(y_val, y_val_pred)

  np.exp(prob, prob)


0.72439762583433065

In [35]:
model = KNeighborsClassifier(n_neighbors = 3)
model.fit(X_train, y_train)
y_val_pred = model.predict_proba(X_val)
log_loss(y_val, y_val_pred)

6.0017451126330039

<h1>Now that we have a baseline Logloss let's do some feature engineering</h1>

In [26]:
full_df['features'].head(10)

0                                                   []
1    [Doorman, Elevator, Fitness Center, Cats Allow...
2    [Laundry In Building, Dishwasher, Hardwood Flo...
3                            [Hardwood Floors, No Fee]
4                                            [Pre-War]
5                                                   []
6    [prewar, elevator, Dogs Allowed, Cats Allowed,...
7    [Doorman, Elevator, Pre-War, Terrace, Laundry ...
8    [Cats Allowed, Dogs Allowed, Elevator, Laundry...
9                        [Dishwasher, Hardwood Floors]
Name: features, dtype: object

In [23]:
type(full_df['features'])

pandas.core.series.Series

In [49]:
def clean(s):
    for x in s:
        x = x.replace("-", " ")
        x = x.replace("_", " ")
        x = x.replace("&", "and")
        x = x.replace("24/7", "24")
        x = x.replace("24hr", "24")
        x = x.replace("24hour", "24")
        x = x.replace("24 hour", "24")
        x = x.replace("24-hour", "24")
        x = x.replace("a/c", "aircon")
        x = x.replace("air conditioner", "aircon")
        x = x.replace("bicycle", "bike")
        x = x.replace("concierge service", "doorman")
        x = x.replace("concierge", "doorman")
        x = x.replace("counter tops", "counters")
        x = x.replace("countertops", "counters")
        x = x.replace("granite kitchen", "granite counters")
        x = x.replace("dish washer", "dishwasher")
        x = x.replace("full time", "ft")
        x = x.replace("full-time", "ft")
        x = x.replace("indoor swimming pool", "indoor pool")
        x = x.replace("laundry on every floor", "laundry on floor")
        x = x.replace("media screening room", "media room")
        x = x.replace("no fee", "")
        x = x.replace("one month free rent", "one month free")
        x = x.replace("prewar", "pre war")
        x = x.replace("roofdeck", "roof deck")
        x = x.replace("ss appliance", "stainless")
        x = x.replace("storage facilities", "storage")
        x = x.replace("twenty four hour", "24")
        x = x.replace("washer and dryer", "washer/dryer")
        x = x.replace("wi fi", "wifi")
        return x


In [44]:
features = full_df['features'].str.lower()

In [45]:
features.head()

0   NaN
1   NaN
2   NaN
3   NaN
4   NaN
Name: features, dtype: float64

In [None]:
for x in full_df['features']:
    full_df['featured'] = clean(x)

In [None]:
full_df['features'].head()