<h1>Using Renthop data to find whether a rental listing is of High, Medium or low interest</h1>

In [40]:
#begin with standard imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import sparse
import operator
import os
import sys
%matplotlib inline

#import modeling algorithms
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.cross_validation import train_test_split , StratifiedKFold
from sklearn.feature_selection import RFECV
from sklearn.metrics import log_loss
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

In [2]:
train_df = pd.read_json('renthop/train.json/train.json')
test_df = pd.read_json('renthop/test.json/test.json')
full_df = train_df.append(test_df, ignore_index = True)
df = full_df[:49352]
print('The test dataset:', test_df.shape, 'The train dataset', train_df.shape, 'The whole dataset: ', full_df.shape)

('The test dataset:', (74659, 14), 'The train dataset', (49352, 15), 'The whole dataset: ', (124011, 15))


<h4>The test dataset has one less column because the train dataset includes the interest level</h4>

In [4]:
df.shape

(49352, 15)

<h3>Convert the 'created' column to date time for easier analysis</h3>

In [3]:
full_df['created']=pd.to_datetime(df['created'])
full_df['month'] = full_df['created'].dt.month
full_df['day'] = full_df['created'].dt.day

<h3>Helper function to reduce the amount of unique words in the features columns</h3>

In [4]:
def clean(a):
    a = str(a)
    a = a.replace('-', ' ')#etc.etc.
    a = a.replace('_', ' ')
    a = a.replace('&', 'and')
    a = a.replace('24/7', '24')
    a = a.replace('24hr', '24')
    a = a.replace('24hour', '24')
    a = a.replace('24 hour', '24')
    a = a.replace('a/c', 'aircon')
    a = a.replace('air conditioner', 'aircon')
    a = a.replace('bicycle', 'bike')
    a = a.replace('concierge', 'doorman')
    a = a.replace('concierge service', 'doorman')
    a = a.replace('counter tops', 'counters')
    a = a.replace('countertops', 'counters')
    a = a.replace('granite kitchen', 'granite counters')
    a = a.replace('dish washer', 'dishwasher')
    a = a.replace('full time', 'ft')
    a = a.replace('indoor swimming pool', 'indoor pool')
    a = a.replace('laundry on every floor', 'laundry on floor')
    a = a.replace('media screening room', 'media room')
    a = a.replace('one month free rent', 'one month free')
    a = a.replace('prewar', 'pre war')
    a = a.replace('roofdeck', 'roof deck')
    a = a.replace('ss appliance', 'stainless')
    a = a.replace('storage facilities', 'storage')
    a = a.replace('twenty four hour', '24')
    a = a.replace('washer and dryer', 'washer/dryer')
    a = a.replace('wi fi', 'wifi')
    return a   

<h3>Since our dataframe is full of categorical variables the easiest way to fit those into a model algorithm is to simply take the amount of those variables i.e. the more 'features' the better</h3> 

In [5]:
full_df['len_photos'] = full_df['photos'].apply(len)
full_df['len_features'] = full_df['features'].apply(len)
full_df['len_desc'] = df['description'].apply(lambda x: len(x.split(" ")))

In [40]:
cols_to_keep = ['bathrooms', 'bedrooms', 'latitude', 
                'longitude', 'price', 'len_photos', 
                'len_features', 'len_desc',  
                'month', 'day']
df = full_df[:49352]

<h3>Split the training dataframe into further sections for tuning our models</h3>

In [30]:
X = df[cols_to_keep]
y = df['interest_level']
X_train, X_val, y_train, y_val = train_test_split(X,y, test_size=0.33)

In [9]:
model = RandomForestClassifier(n_estimators=1000)
model.fit(X_train, y_train)
y_val_pred = model.predict_proba(X_val)
log_loss(y_val, y_val_pred)

0.64024243797863556

In [34]:
model = GradientBoostingClassifier()
model.fit(X_train, y_train)
y_val_pred = model.predict_proba(X_val)
log_loss(y_val, y_val_pred)

0.64382598228289722

In [36]:
model = LogisticRegression()
model.fit(X_train, y_train)
y_val_pred = model.predict_proba(X_val)
log_loss(y_val, y_val_pred)

0.71865436526725646

<h1>Now that we have a baseline Logloss let's do some feature engineering</h1>

In [6]:
import unicodedata

full_df['featured'] = full_df['features'].map (lambda each_row: [unicodedata.normalize('NFKD', x).encode('ascii','ignore') for x in each_row])

In [7]:
full_df[['featured']] = full_df[["featured"]].apply(
    lambda _: [list(map(str.strip, map(str.lower, x))) for x in _])

In [8]:
full_df['featured'] = full_df['featured'].apply(lambda x: clean(x))#MM

In [9]:
df = full_df[:49352]

In [10]:
new_cols_to_keep = ['bathrooms', 'bedrooms', 'latitude',
                'longitude', 'price', 'len_photos', 
                'len_desc','month', 'day']

In [11]:
df.head()

Unnamed: 0,bathrooms,bedrooms,building_id,created,description,display_address,features,interest_level,latitude,listing_id,...,manager_id,photos,price,street_address,month,day,len_photos,len_features,len_desc,featured
0,1.5,3,53a5b119ba8f7b61d4e010512e0dfc85,2016-06-24 07:54:24,A Brand New 3 Bedroom 1.5 bath ApartmentEnjoy ...,Metropolitan Avenue,[],medium,40.7145,7211212,...,5ba989232d0489da1b5f2c45f6688adc,[https://photos.renthop.com/2/7211212_1ed4542e...,3000,792 Metropolitan Avenue,6.0,24.0,5,0,95.0,[]
1,1.0,2,c5c8a357cba207596b04d1afd1e4f130,2016-06-12 12:19:27,,Columbus Avenue,"[Doorman, Elevator, Fitness Center, Cats Allow...",low,40.7947,7150865,...,7533621a882f71e25173b27e3139d83d,[https://photos.renthop.com/2/7150865_be3306c5...,5465,808 Columbus Avenue,6.0,12.0,11,5,9.0,"['doorman', 'elevator', 'fitness center', 'cat..."
2,1.0,1,c3ba40552e2120b0acfc3cb5730bb2aa,2016-04-17 03:26:41,"Top Top West Village location, beautiful Pre-w...",W 13 Street,"[Laundry In Building, Dishwasher, Hardwood Flo...",high,40.7388,6887163,...,d9039c43983f6e564b1482b273bd7b01,[https://photos.renthop.com/2/6887163_de85c427...,2850,241 W 13 Street,4.0,17.0,8,4,94.0,"['laundry in building', 'dishwasher', 'hardwoo..."
3,1.0,1,28d9ad350afeaab8027513a3e52ac8d5,2016-04-18 02:22:02,Building Amenities - Garage - Garden - fitness...,East 49th Street,"[Hardwood Floors, No Fee]",low,40.7539,6888711,...,1067e078446a7897d2da493d2f741316,[https://photos.renthop.com/2/6888711_6e660cee...,3275,333 East 49th Street,4.0,18.0,3,2,80.0,"['hardwood floors', 'no fee']"
4,1.0,4,0,2016-04-28 01:32:41,Beautifully renovated 3 bedroom flex 4 bedroom...,West 143rd Street,[Pre-War],low,40.8241,6934781,...,98e13ad4b495b9613cef886d79a6291f,[https://photos.renthop.com/2/6934781_1fa4b41a...,3350,500 West 143rd Street,4.0,28.0,3,1,68.0,['pre war']


In [13]:
featuref = df['featured'].fillna('')
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(max_features = 1000,
                            ngram_range = (1, 2),
                            stop_words = 'english',
                            binary = True)

#fit learns vocabulary of Featured column
vectorizer.fit(featuref)
#transform into a matrix
X = vectorizer.transform(featuref)

<h3>Split the model data again to train the model in this case Random Forest performed best so we will test that with our new vectorized features</h3>

In [27]:
from sklearn.ensemble import RandomForestClassifier
from scipy.sparse import hstack
from sklearn.cross_validation import cross_val_score

model = RandomForestClassifier(n_estimators = 20)

vectorizer.fit(featuref)

X_features = vectorizer.transform(featuref)
other_features = df[new_cols_to_keep]
X = hstack((X_features, other_features)).toarray()
y = df['interest_level']

X_train, X_val, y_train, y_val = train_test_split(X,y, test_size=0.33)
model.fit(X_train, y_train)
y_val_pred = model.predict_proba(X_val)
log_loss(y_val, y_val_pred)

1.2198210054351168

In [30]:
all_feature_names = vectorizer.get_feature_names() + new_cols_to_keep
feature_importances = pd.DataFrame({'Features' : all_feature_names, 'Importance Score': model.feature_importances_})
feature_importances.sort_values('Importance Score', ascending=False).head()

Unnamed: 0,Features,Importance Score
1004,price,0.114305
1008,day,0.089031
1006,len_desc,0.085884
1003,longitude,0.084521
1002,latitude,0.084402


In [32]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(max_features = 1000,
                            ngram_range = (1, 2),
                            binary = True)

#fit learn's vocabulary of Featured column
vectorizer.fit(featuref)
#transform into a matrix
X = vectorizer.transform(featuref)

model = RandomForestClassifier(n_estimators = 20)

vectorizer.fit(featuref)

X_tf_features = vectorizer.transform(featuref)
other_features = df[new_cols_to_keep]
X = hstack((X_tf_features, other_features)).toarray()
y = df['interest_level']

X_train, X_val, y_train, y_val = train_test_split(X,y, test_size=0.33)
model.fit(X_train, y_train)
y_val_pred = model.predict_proba(X_val)
log_loss(y_val, y_val_pred)

1.3717348085212038

In [33]:
#got a little better performance with CountVectorizer let's try it with description

vectorizer = CountVectorizer(max_features = 1000,
                            ngram_range = (1, 2),
                            stop_words = 'english',
                            binary = True)

descriptive = df['description'].fillna('')

#fit learn's vocabulary of Featured column
vectorizer.fit(descriptive)
#transform into a matrix
X = vectorizer.transform(descriptive)

model = RandomForestClassifier(n_estimators = 20)

vectorizer.fit(descriptive)

X_desc_features = vectorizer.transform(descriptive)
other_features = df[new_cols_to_keep]
X = hstack((X_desc_features, other_features)).toarray()
y = df['interest_level']

X_train, X_val, y_train, y_val = train_test_split(X,y, test_size=0.33)
model.fit(X_train, y_train)
y_val_pred = model.predict_proba(X_val)
log_loss(y_val, y_val_pred)

1.2849345070269527

<h3>Trying another approach to vectorizing features</h3>

In [34]:
from sklearn_pandas import DataFrameMapper
mapper = DataFrameMapper([
    ('featured', CountVectorizer(binary=True, ngram_range=(1, 2)))
    
])
features_sparse=mapper.fit_transform(df)

In [35]:
X = sparse.hstack([df[new_cols_to_keep], features_sparse]).tocsr()
y = df['interest_level']
X_train, X_val, y_train, y_val = train_test_split(X,y, test_size=0.33)

In [36]:
model = RandomForestClassifier(n_estimators=1000)
model.fit(X_train, y_train)
y_val_pred = model.predict_proba(X_val)
log_loss(y_val, y_val_pred)

0.66724980942022849

In [38]:
model = LogisticRegression()
model.fit(X_train, y_train)
y_val_pred = model.predict_proba(X_val)
log_loss(y_val, y_val_pred)

  np.exp(prob, prob)


0.68080468520226967

In [39]:
model = GradientBoostingClassifier()
model.fit(X_train, y_train)
y_val_pred = model.predict_proba(X_val.toarray())
log_loss(y_val, y_val_pred)

0.62762756094982386

<h3>Our log loss went up again but not as much as the other vectorized variables</h3>

In [49]:
newer_cols_to_keep = ['bathrooms', 'bedrooms', 'latitude',
                'longitude', 'price', 'len_photos', 
                'day']

In [50]:
X = sparse.hstack([df[new_cols_to_keep], features_sparse]).tocsr()
y = df['interest_level']
X_train, X_val, y_train, y_val = train_test_split(X,y, test_size=0.33)

In [51]:
model = GradientBoostingClassifier()
model.fit(X_train, y_train)
y_val_pred = model.predict_proba(X_val.toarray())
log_loss(y_val, y_val_pred)

0.6262994932289272

<h1>Our Log Loss went down with our optimal vectors<h1>