In [1]:
import pandas as pd 
import numpy as np 
import os 
import matplotlib as plt
import seaborn as sns
%matplotlib inline

In [3]:
from subprocess import check_output
#print(check_output(["ls", "."]).decode("utf8"))

In [5]:
train = pd.read_json(r'train.json')
test = pd.read_json(r'test.json')

In [6]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 49352 entries, 10 to 99994
Data columns (total 15 columns):
bathrooms          49352 non-null float64
bedrooms           49352 non-null int64
building_id        49352 non-null object
created            49352 non-null object
description        49352 non-null object
display_address    49352 non-null object
features           49352 non-null object
interest_level     49352 non-null object
latitude           49352 non-null float64
listing_id         49352 non-null int64
longitude          49352 non-null float64
manager_id         49352 non-null object
photos             49352 non-null object
price              49352 non-null int64
street_address     49352 non-null object
dtypes: float64(3), int64(3), object(9)
memory usage: 6.0+ MB


### Specific Categories to Impliment 

- the number of photos on listing 
- the number of features 
- the length of the description 
- convert the created into datetime object and extract day / month (year doesn't matter, hours doesn't matter) 

In [7]:
def quantify_features(data):
    data['num_photos'] = data['photos'].apply(len)
    data['num_features'] = data['features'].apply(len)
    data['description_length'] = data.description.apply(lambda x: len(x.split()))
    data['created'] = pd.to_datetime(data['created'])
    data['created_month'] = data['created'].dt.month
    data['created_day'] = data['created'].dt.day
    print data.info()

In [8]:
quantify_features(train)
quantify_features(test)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 49352 entries, 10 to 99994
Data columns (total 20 columns):
bathrooms             49352 non-null float64
bedrooms              49352 non-null int64
building_id           49352 non-null object
created               49352 non-null datetime64[ns]
description           49352 non-null object
display_address       49352 non-null object
features              49352 non-null object
interest_level        49352 non-null object
latitude              49352 non-null float64
listing_id            49352 non-null int64
longitude             49352 non-null float64
manager_id            49352 non-null object
photos                49352 non-null object
price                 49352 non-null int64
street_address        49352 non-null object
num_photos            49352 non-null int64
num_features          49352 non-null int64
description_length    49352 non-null int64
created_month         49352 non-null int64
created_day           49352 non-null int64
dtypes:

In [9]:
# for now, just use the old method, later, we can transform the categorical information using scikit learn
# use cat.codes 

def cat_codes(X):
    X["street_address"] = X["street_address"].astype('category').cat.codes
    X["building_id"] = X["building_id"].astype('category').cat.codes
    X["description"] = X["description"].astype('category').cat.codes
    X["display_address"] = X["display_address"].astype('category').cat.codes
    X["manager_id"] = X["manager_id"].astype('category').cat.codes


In [10]:
cat_codes(train)
cat_codes(test)

In [25]:
# featureset 
features = [ 'description', 'display_address',  'created_day', 
           'created_month', 'num_photos', 'num_features', 'description_length', 'longitude', 'latitude', 'price',
           'bathrooms', 'bedrooms']

In [26]:
X = train.drop('interest_level', 1)
X = X[features]
X.info()
Y = train['interest_level'].astype('category')
Y.dtype

<class 'pandas.core.frame.DataFrame'>
Int64Index: 49352 entries, 10 to 99994
Data columns (total 12 columns):
description           49352 non-null int32
display_address       49352 non-null int16
created_day           49352 non-null int64
created_month         49352 non-null int64
num_photos            49352 non-null int64
num_features          49352 non-null int64
description_length    49352 non-null int64
longitude             49352 non-null float64
latitude              49352 non-null float64
price                 49352 non-null int64
bathrooms             49352 non-null float64
bedrooms              49352 non-null int64
dtypes: float64(3), int16(1), int32(1), int64(7)
memory usage: 4.4 MB


category

### Model Building 

- We are using KNN
- later, we can use different things 

In [13]:
# Train Test Split (not cross validation or K fold validation just yet)

In [31]:
from sklearn.cross_validation import train_test_split
X_train, X_test,Y_train, Y_test = train_test_split(X, Y, test_size = 0.25)
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics
from sklearn.metrics import accuracy_score, log_loss

In [34]:
X_train.info()
# scikit learn won't allow for datetime objects 

<class 'pandas.core.frame.DataFrame'>
Int64Index: 37014 entries, 5852 to 38821
Data columns (total 12 columns):
description           37014 non-null int32
display_address       37014 non-null int16
created_day           37014 non-null int64
created_month         37014 non-null int64
num_photos            37014 non-null int64
num_features          37014 non-null int64
description_length    37014 non-null int64
longitude             37014 non-null float64
latitude              37014 non-null float64
price                 37014 non-null int64
bathrooms             37014 non-null float64
bedrooms              37014 non-null int64
dtypes: float64(3), int16(1), int32(1), int64(7)
memory usage: 3.3 MB


In [37]:
knn = KNeighborsClassifier(n_neighbors = 400, leaf_size = 200)
knn.fit(X_train, Y_train)
Y_pred = knn.predict(X_test)

In [38]:
print metrics.accuracy_score(Y_test, Y_pred)
knn_prediction = knn.predict_proba(X_test)
ll = log_loss(Y_test, knn_prediction)
print ll

0.692251580483
0.743897365207


In [39]:
# apply this to the test 
index = test["listing_id"]
test = test[features]


In [40]:
test_predictions = knn.predict_proba(tester)

In [41]:
submission = pd.DataFrame({
        "listing_id": index,
        "high": test_predictions[:,0],
        "medium":test_predictions[:,2],
        "low":test_predictions[:,1]
    })
    
columnsTitles=["listing_id","high","medium","low"]
submission=submission.reindex(columns=columnsTitles)
submission.to_csv('submission3.csv', index=False)

In [42]:
# it seems like we won't be able to get any further with KNN, lets try logistic regression 

In [41]:
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression()
logreg.fit(X_train, Y_train)
Y_pred = logreg.predict(X_test)

In [43]:
print metrics.accuracy_score(Y_test, Y_pred)
log_prediction = logreg.predict_proba(X_test)
ll = log_loss(Y_test, log_prediction)
print ll

0.691360025936
0.730751797363


In [48]:
test_predictions = logreg.predict_proba(tester)

In [49]:
submission = pd.DataFrame({
        "listing_id": index,
        "high": test_predictions[:,0],
        "medium":test_predictions[:,2],
        "low":test_predictions[:,1]
    })
    
columnsTitles=["listing_id","high","medium","low"]
submission=submission.reindex(columns=columnsTitles)
submission.to_csv('submission4.csv', index=False)

In [44]:
# modelling using Gradient Boosting (gradient boosting model)

from sklearn.ensemble import GradientBoostingClassifier

In [51]:
# two most important parameters are number of trees and learning rate 
# but first, lets just make one simple classifer and see the result 

In [50]:
common_args = {'max_depth': 5, 'n_estimators': 300, 'subsample': 0.9, 'random_state': 2}

gbm = GradientBoostingClassifier(learning_rate=0.10, **common_args)

gbm.fit(X_train, Y_train)
Y_pred = gbm.predict(X_test)

In [51]:
print metrics.accuracy_score(Y_test, Y_pred)
gbm_prediction = gbm.predict_proba(X_test)
ll = log_loss(Y_test, gbm_prediction)
print ll

0.736099854109
0.601553665807


In [52]:
test_predictions = gbm.predict_proba(tester)

In [53]:
submission = pd.DataFrame({
        "listing_id": index,
        "high": test_predictions[:,0],
        "medium":test_predictions[:,2],
        "low":test_predictions[:,1]
    })
    
columnsTitles=["listing_id","high","medium","low"]
submission=submission.reindex(columns=columnsTitles)
submission.to_csv('submission7.csv', index=False)

In [59]:
# use the large computer for this next time, it can do it faster 

In [54]:
# next ones


In [55]:
common_args = {'max_depth': 5, 'n_estimators': 500, 'subsample': 0.9, 'random_state': 2}

gbm = GradientBoostingClassifier(learning_rate=0.08, **common_args)

gbm.fit(X_train, Y_train)
Y_pred = gbm.predict(X_test)

print metrics.accuracy_score(Y_test, Y_pred)
gbm_prediction = gbm.predict_proba(X_test)
ll = log_loss(Y_test, gbm_prediction)
print ll

0.735937753283
0.602415747081


In [56]:
test_predictions = gbm.predict_proba(tester)

submission = pd.DataFrame({
        "listing_id": index,
        "high": test_predictions[:,0],
        "medium":test_predictions[:,2],
        "low":test_predictions[:,1]
    })
    
columnsTitles=["listing_id","high","medium","low"]
submission=submission.reindex(columns=columnsTitles)
submission.to_csv('submission8.csv', index=False)

In [57]:
# change to longer gbm, higher number of estimators 

common_args = {'max_depth': 5, 'n_estimators': 700, 'subsample': 0.9, 'random_state': 2}

gbm = GradientBoostingClassifier(learning_rate=0.05, **common_args)

gbm.fit(X_train, Y_train)
Y_pred = gbm.predict(X_test)

print metrics.accuracy_score(Y_test, Y_pred)
gbm_prediction = gbm.predict_proba(X_test)
ll = log_loss(Y_test, gbm_prediction)
print ll

0.73707245907
0.600177252394


In [58]:
test_predictions = gbm.predict_proba(tester)

submission = pd.DataFrame({
        "listing_id": index,
        "high": test_predictions[:,0],
        "medium":test_predictions[:,2],
        "low":test_predictions[:,1]
    })
    
columnsTitles=["listing_id","high","medium","low"]
submission=submission.reindex(columns=columnsTitles)
submission.to_csv('submission9.csv', index=False)

In [59]:
from sklearn.ensemble import RandomForestClassifier
# do random forest 

In [60]:
clf = RandomForestClassifier(n_estimators=1000)
clf.fit(X_train, Y_train)
clf_prediction = clf.predict_proba(X_test)
print log_loss(Y_test, clf_prediction)

0.623556980046


In [61]:
test_predictions = clf.predict_proba(tester)

submission = pd.DataFrame({
        "listing_id": index,
        "high": test_predictions[:,0],
        "medium":test_predictions[:,2],
        "low":test_predictions[:,1]
    })
    
columnsTitles=["listing_id","high","medium","low"]
submission=submission.reindex(columns=columnsTitles)
submission.to_csv('submission10.csv', index=False)

In [62]:
# use GBM again, however, this time, use the parameters that the other guy set 
# learning rate 0.01, tree size, log loss as the parameter 

# newly tuned GBM 
common_args = {'max_depth': 7, 'n_estimators': 2000, 'subsample': 0.7, 'random_state': 2}

gbm = GradientBoostingClassifier(learning_rate=0.01, **common_args)

gbm.fit(X_train, Y_train)
Y_pred = gbm.predict(X_test)

print metrics.accuracy_score(Y_test, Y_pred)
gbm_prediction = gbm.predict_proba(X_test)
ll = log_loss(Y_test, gbm_prediction)
print ll

0.73926082023
0.597806294449


In [63]:
test_predictions = gbm.predict_proba(tester)

submission = pd.DataFrame({
        "listing_id": index,
        "high": test_predictions[:,0],
        "medium":test_predictions[:,2],
        "low":test_predictions[:,1]
    })
    
columnsTitles=["listing_id","high","medium","low"]
submission=submission.reindex(columns=columnsTitles)
submission.to_csv('submission11.csv', index=False)

In [64]:
# hmmm, the random forest was better still. Interesting 

In [1]:
# it seems like GBM has gotten to the best it can be 
# we would have to go to another model, maybe with XGBoost 
# now with GBM being at it's limit, maybe another model is better 

In [None]:
# how about linear gradient boosting, we have done random forest, and everything 