<h1>Using Renthop data to find whether a rental listing is of High, Medium or low interest</h1>

In [3]:
#begin with standard imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

#import modeling algorithms
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.cross_validation import train_test_split , StratifiedKFold
from sklearn.feature_selection import RFECV
from sklearn.metrics import log_loss
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

In [4]:
train_df = pd.read_json('renthop/train.json/train.json')
test_df = pd.read_json('renthop/test.json/test.json')
full_df = train_df.append(test_df, ignore_index = True)
df = full_df[:49352]
print('The test dataset:', test_df.shape, 'The train dataset', train_df.shape, 'The whole dataset: ', full_df.shape)

('The test dataset:', (74659, 14), 'The train dataset', (49352, 15), 'The whole dataset: ', (124011, 15))


<h4>The test dataset has one less column because the train dataset includes the interest level</h4>

In [4]:
df.shape

(49352, 15)

<h3>Convert the 'created' column to date time for easier analysis</h3>

In [5]:
full_df['created']=pd.to_datetime(df['created'])
full_df['year']=full_df['created'].dt.year
full_df['month'] = full_df['created'].dt.month
full_df['day'] = full_df['created'].dt.day

<h3>Since our dataframe is full of categorical variables the easiest way to fit those into a model algorithm is to simply take the amount of those variables i.e. the more 'features' the better</h3> 

In [6]:
full_df['len_photos'] = full_df['photos'].apply(len)
full_df['len_features'] = full_df['features'].apply(len)
full_df['len_desc'] = df['description'].apply(lambda x: len(x.split(" ")))

In [7]:
cols_to_keep = ['bathrooms', 'bedrooms', 'latitude', 
                'longitude', 'price', 'len_photos', 
                'len_features', 'len_desc', 'year', 
                'month', 'day']
df = full_df[:49352]

In [8]:
X = df[cols_to_keep]
y = df['interest_level']
X_train, X_val, y_train, y_val = train_test_split(X,y, test_size=0.33)

In [9]:
model = RandomForestClassifier(n_estimators=1000)
model.fit(X_train, y_train)
y_val_pred = model.predict_proba(X_val)
log_loss(y_val, y_val_pred)

0.64024243797863556

In [None]:
model = GradientBoostingClassifier()
clf.model(X_train, y_train)
y_val_pred = clf.predict_proba(X_val)
log_loss(y_val, y_val_pred)

In [None]:
model = GaussianNB()
model.fit(X_train, y_train)
y_val_pred = model.predict_proba(X_val)
log_loss(y_val, y_val_pred)

In [None]:
model = LogisticRegression()
model.fit(X_train, y_train)
y_val_pred = model.predict_proba(X_val)
log_loss(y_val, y_val_pred)

In [35]:
model = KNeighborsClassifier(n_neighbors = 3)
model.fit(X_train, y_train)
y_val_pred = model.predict_proba(X_val)
log_loss(y_val, y_val_pred)

6.0017451126330039

<h1>Now that we have a baseline Logloss let's do some feature engineering</h1>

In [None]:
for each in full_df['features']:
    for item in each:
        item.encode('utf-8')
        full_df['features']== item
return item

In [10]:
import unicodedata

full_df['features'] = full_df['features'].map (lambda each_row: [unicodedata.normalize('NFKD', x).encode('ascii','ignore') for x in each_row])

In [11]:
full_df[['features1']] = full_df[["features"]].apply(
    lambda _: [list(map(str.strip, map(str.lower, x))) for x in _])

In [12]:
full_df['features1'].head()

0                                                   []
1    [doorman, elevator, fitness center, cats allow...
2    [laundry in building, dishwasher, hardwood flo...
3                            [hardwood floors, no fee]
4                                            [pre-war]
Name: features1, dtype: object

In [13]:
def clean(s):
    if s is None:
        return ""
    else:
        for a in s:
            a = [x.replace('-',' ') for x in a]
            a = [x.replace('_',' ') for x in a]
            a = [x.replace('&','and') for x in a]
            a = [x.replace('24/7','24') for x in a]
            a = [x.replace('24hr','24') for x in a]
            a = [x.replace('24hour','24') for x in a]
            a = [x.replace('24 hour','24') for x in a]
            a = [x.replace('a/c','aircon') for x in a]
            a = [x.replace('air conditioner','aircon') for x in a]
            a = [x.replace('bicycle','bike') for x in a]
            a = [x.replace('concierge service','doorman') for x in a]
            a = [x.replace('concierge','doorman') for x in a]
            a = [x.replace('counter tops', 'counters') for x in a]
            a = [x.replace('countertops', 'counters') for x in a]
            a = [x.replace('granite kitchen', 'granite counters') for x in a]
            a = [x.replace('dish washer', 'dishwasher') for x in a]
            a = [x.replace('full time', 'ft') for x in a]
            a = [x.replace('indoor swimming pool', 'indoor pool') for x in a]
            a = [x.replace('laundry on every floor', 'laundry on floor') for x in a]
            a = [x.replace('media screening room', 'media room') for x in a]
            a = [x.replace('no fee', '') for x in a]
            a = [x.replace('one month free rent', 'one month free') for x in a]
            a = [x.replace('prewar', 'pre war') for x in a]
            a = [x.replace('roofdeck', 'roof deck') for x in a]
            a = [x.replace('ss appliance', 'stainless') for x in a]
            a = [x.replace('storage facilities', 'storage') for x in a]
            a = [x.replace('twenty four hour', '24') for x in a]
            a = [x.replace('washer and dryer', 'washer/dryer') for x in a]
            a = [x.replace('wi fi', 'wifi') for x in a]
            return a


In [14]:
for x in full_df['features1']:
    full_df['features1'] = clean(x)

In [15]:
full_df.head()

Unnamed: 0,bathrooms,bedrooms,building_id,created,description,display_address,features,interest_level,latitude,listing_id,...,photos,price,street_address,year,month,day,len_photos,len_features,len_desc,features1
0,1.5,3,53a5b119ba8f7b61d4e010512e0dfc85,2016-06-24 07:54:24,A Brand New 3 Bedroom 1.5 bath ApartmentEnjoy ...,Metropolitan Avenue,[],medium,40.7145,7211212,...,[https://photos.renthop.com/2/7211212_1ed4542e...,3000,792 Metropolitan Avenue,2016.0,6.0,24.0,5,0,95.0,
1,1.0,2,c5c8a357cba207596b04d1afd1e4f130,2016-06-12 12:19:27,,Columbus Avenue,"[Doorman, Elevator, Fitness Center, Cats Allow...",low,40.7947,7150865,...,[https://photos.renthop.com/2/7150865_be3306c5...,5465,808 Columbus Avenue,2016.0,6.0,12.0,11,5,9.0,
2,1.0,1,c3ba40552e2120b0acfc3cb5730bb2aa,2016-04-17 03:26:41,"Top Top West Village location, beautiful Pre-w...",W 13 Street,"[Laundry In Building, Dishwasher, Hardwood Flo...",high,40.7388,6887163,...,[https://photos.renthop.com/2/6887163_de85c427...,2850,241 W 13 Street,2016.0,4.0,17.0,8,4,94.0,
3,1.0,1,28d9ad350afeaab8027513a3e52ac8d5,2016-04-18 02:22:02,Building Amenities - Garage - Garden - fitness...,East 49th Street,"[Hardwood Floors, No Fee]",low,40.7539,6888711,...,[https://photos.renthop.com/2/6888711_6e660cee...,3275,333 East 49th Street,2016.0,4.0,18.0,3,2,80.0,
4,1.0,4,0,2016-04-28 01:32:41,Beautifully renovated 3 bedroom flex 4 bedroom...,West 143rd Street,[Pre-War],low,40.8241,6934781,...,[https://photos.renthop.com/2/6934781_1fa4b41a...,3350,500 West 143rd Street,2016.0,4.0,28.0,3,1,68.0,


In [25]:
CV = CountVectorizer(stop_words='english', max_features=200)
full_df['features_vectorized']=cv.fit_transorm(train_df['features1'])

Unnamed: 0,bathrooms,bedrooms,building_id,created,description,display_address,features,interest_level,latitude,listing_id,longitude,manager_id,photos,price,street_address,year,month,day,features1
0,1.5,3,53a5b119ba8f7b61d4e010512e0dfc85,2016-06-24 07:54:24,A Brand New 3 Bedroom 1.5 bath ApartmentEnjoy ...,Metropolitan Avenue,[],medium,40.7145,7211212,-73.9425,5ba989232d0489da1b5f2c45f6688adc,[https://photos.renthop.com/2/7211212_1ed4542e...,3000,792 Metropolitan Avenue,2016.0,6.0,24.0,f
1,1.0,2,c5c8a357cba207596b04d1afd1e4f130,2016-06-12 12:19:27,,Columbus Avenue,"[Doorman, Elevator, Fitness Center, Cats Allow...",low,40.7947,7150865,-73.9667,7533621a882f71e25173b27e3139d83d,[https://photos.renthop.com/2/7150865_be3306c5...,5465,808 Columbus Avenue,2016.0,6.0,12.0,f
2,1.0,1,c3ba40552e2120b0acfc3cb5730bb2aa,2016-04-17 03:26:41,"Top Top West Village location, beautiful Pre-w...",W 13 Street,"[Laundry In Building, Dishwasher, Hardwood Flo...",high,40.7388,6887163,-74.0018,d9039c43983f6e564b1482b273bd7b01,[https://photos.renthop.com/2/6887163_de85c427...,2850,241 W 13 Street,2016.0,4.0,17.0,f
3,1.0,1,28d9ad350afeaab8027513a3e52ac8d5,2016-04-18 02:22:02,Building Amenities - Garage - Garden - fitness...,East 49th Street,"[Hardwood Floors, No Fee]",low,40.7539,6888711,-73.9677,1067e078446a7897d2da493d2f741316,[https://photos.renthop.com/2/6888711_6e660cee...,3275,333 East 49th Street,2016.0,4.0,18.0,f
4,1.0,4,0,2016-04-28 01:32:41,Beautifully renovated 3 bedroom flex 4 bedroom...,West 143rd Street,[Pre-War],low,40.8241,6934781,-73.9493,98e13ad4b495b9613cef886d79a6291f,[https://photos.renthop.com/2/6934781_1fa4b41a...,3350,500 West 143rd Street,2016.0,4.0,28.0,f
