In [1]:
import pylab
import calendar
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats
from datetime import datetime
import matplotlib.pyplot as plt
import random

In [7]:
processed_data=pd.read_json("../data/processed_train.json")
img=pd.read_csv("../data/image_stats-fixed.csv",index_col=0)
#processed_data=processed_data.merge(img,how="left",on="listing_id")

In [8]:
processed_data=processed_data.fillna(0)

In [9]:
random.seed(0)
train_data=processed_data.sample(n=processed_data.shape[0]*7/10)
test_data=processed_data.drop(train_data.index)

In [10]:
processed_data.columns

Index([u'By Owner', u'Cats Allowed', u'Common Outdoor Space', u'Dogs Allowed',
       u'Doorman', u'Elevator', u'Exclusive', u'Fitness Center', u'Furnished',
       u'Laundry In Building', u'Laundry In Unit', u'No Fee', u'Parking Space',
       u'Private Outdoor Space', u'Reduced Fee', u'Short Term Allowed',
       u'Storage Facility', u'Sublet / Lease-Break', u'avg_brightness',
       u'avg_imagesize', u'avg_luminance', u'bathrooms', u'bedrooms',
       u'building_id', u'created', u'description', u'description_sentiment',
       u'display_address', u'dist_count', u'features', u'img_quantity',
       u'interest_level', u'latitude', u'listing_id', u'longitude',
       u'manager count', u'manager_id', u'photos', u'price', u'street_address',
       u'unique_count', u'word_count'],
      dtype='object')

In [17]:
##########################first try with all numeric features in datasets##########################################
train=train_data.drop(['building_id','created','description','display_address','longitude','latitude','manager count','manager_id','listing_id','photos','street_address','features'],axis=1)
test=test_data.drop(['building_id','created','description','display_address','manager count','longitude','latitude','manager_id','listing_id','photos','street_address','features'],axis=1)

In [9]:
#base model 1: multinomial logistic regression
#base model 2: bagged decision trees
#base model 3: Random Forest trees
#base model 4: SVM
#base model 5: bayes classifier
#base model 6: Ada Boosting
y_train=train.loc[:,'interest_level']
x_train=train.drop('interest_level',axis=1)
y_test=test.loc[:,'interest_level']
x_test=test.drop('interest_level',axis=1)

In [10]:
#base model 1: multinomial logistic regression
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
def mlog(x_train,y_train,x_test):
    lr = LogisticRegression().fit(x_train, y_train)
    return lr.predict(x_test)

In [140]:
#base model 2: bagged decision trees
from sklearn import model_selection
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
def bagDT(x_train,y_train,x_test,y_test):
    kfold = model_selection.KFold(n_splits=10)
    cart = DecisionTreeClassifier()
    num_trees = 100
    model = BaggingClassifier(base_estimator=cart, n_estimators=num_trees)
    model.fit(x_train,y_train)
    predict = model_selection.cross_val_predict(model, x_test, y_test, cv=kfold)
    return predict

In [141]:
#base model 3: Random Forest trees
from sklearn.ensemble import RandomForestClassifier
seed = 7
def rfClassifier(x_train,y_train,x_test,y_test):
    num_trees = 100
    max_features = 3
    kfold = model_selection.KFold(n_splits=10, random_state=seed)
    model = RandomForestClassifier(n_estimators=num_trees, max_features=max_features)
    model.fit(x_train,y_train)
    predicted = model_selection.cross_val_predict(model, x_test, y_test, cv=kfold)
    return predicted

In [142]:
#base model 4: SVM
#use cross validation to select cost, cost(default)=1
from sklearn import svm
def svmm(x_train,y_train,x_test):
    clf = svm.SVC(decision_function_shape='ovr')
    clf.fit(x_train, y_train) 
    return clf.predict(x_test)

In [143]:
#accuracy_score(svmm(x_train,y_train,x_test),test_data.loc[:,'interest_level'])

In [144]:
#base model 5: Naive Bayes Classifier
from sklearn.naive_bayes import GaussianNB
def mnb(x_train,y_train,x_test):
    gnb=GaussianNB()
    y_pred = gnb.fit(x_train, y_train).predict(x_test)
    return y_pred

In [145]:
#base model 6: Ada boosting classifier
from sklearn.model_selection import cross_val_score
from sklearn.datasets import load_iris
from sklearn.ensemble import AdaBoostClassifier
def adaBC(x_train,y_train,x_test):
    clf = AdaBoostClassifier(n_estimators=100)
    clf.fit(x_train,y_train)
    return clf.predict(x_test)

In [146]:
###############################now we ensemble them ###########################################################
#1st: partition the train set into 6 test sets
k=x_train.shape[0]/5
x_sp=[[],[],[],[],[]]
for i in range(4):
    sample=random.sample(x_train.index,k)
    x_sp[i]=x_train.ix[sample]
    x_train=x_train.drop(sample)
x_sp[4]=x_train

In [147]:
#2nd: create train_meta and test_meta
train_meta=pd.DataFrame()
train_meta

In [148]:
#3rd: for each fold in 1st, use other 5 folds as training set to predict the result for that fold.
#and save them in train_meta
x_train=train.drop('interest_level',axis=1)
for i in range(5):
    x_sub_test=x_sp[i]
    x_sub_train=x_train.drop(x_sub_test.index)
    y_sub_test=y_train[x_sub_test.index]
    y_sub_train=y_train[x_sub_train.index]
    M1=pd.Series(mlog(x_sub_train,y_sub_train,x_sub_test),index=x_sub_test.index)
    M2=pd.Series(bagDT(x_sub_train,y_sub_train,x_sub_test,y_sub_test),index=x_sub_test.index)
    M3=pd.Series(rfClassifier(x_sub_train,y_sub_train,x_sub_test,y_sub_test),index=x_sub_test.index)
    M4=pd.Series(svmm(x_sub_train,y_sub_train,x_sub_test),index=x_sub_test.index)
    M5=pd.Series(mnb(x_sub_train,y_sub_train,x_sub_test),index=x_sub_test.index)
    M6=pd.Series(adaBC(x_sub_train,y_sub_train,x_sub_test),index=x_sub_test.index)
    app={'M1':M1,'M2':M2,'M3':M3, 'M4':M4, 'M5':M5,'M6':M6}
    train_meta=train_meta.append(pd.DataFrame(app))

In [149]:
#4th:Fit each base model to the full training dataset 
#and make predictions on the test dataset. Store these predictions inside test_meta
M1=pd.Series(mlog(x_train,y_train,x_test),index=x_test.index)
M2=pd.Series(bagDT(x_train,y_train,x_test,y_test),index=x_test.index)
M3=pd.Series(rfClassifier(x_train,y_train,x_test,y_test),index=x_test.index)
M4=pd.Series(svmm(x_train,y_train,x_test),index=x_test.index)
M5=pd.Series(mnb(x_train,y_train,x_test),index=x_test.index)
M6=pd.Series(adaBC(x_train,y_train,x_test),index=x_test.index)
test['M1']=M1
test['M2']=M2
test['M3']=M3
test['M4']=M4
test['M5']=M5
test['M6']=M6

In [150]:
#5th: Fit a new model, S (i.e the stacking model) to train_meta, using M1 and M2 as features.
#Optionally, include other features from the original training dataset or engineered features
##==> transfer to dummy variables
train_meta_dummy=pd.get_dummies(train_meta)
test_meta=test.loc[:,['M1','M2','M3','M4','M5','M6']]
test_meta_dummy=pd.get_dummies(test_meta)

In [151]:
#random forest with meta only
res=rfClassifier(train_meta_dummy,y_train,test_meta_dummy,y_test)
print accuracy_score(res,y_test)
pd.Series(res).value_counts()

0.707078211536


low       13024
medium     1234
high        548
dtype: int64

In [128]:
####concate meta to original dataset and train it again########

In [152]:
x_last_train=pd.concat([train_meta_dummy,x_train],axis=1)
x_last_test=pd.concat([test_meta_dummy,x_test],axis=1)

In [153]:
#random forest with combined
res=rfClassifier(x_last_train,y_train,x_last_test,y_test)
print accuracy_score(res,y_test)
pd.Series(res).value_counts()

0.718357422666


low       11999
medium     2027
high        780
dtype: int64

In [162]:
train.to_json("train_data.json")

In [164]:
test.to_json("test_data.json")

In [165]:
x_last_train.to_json('train_meta_data.json')
x_last_test.to_json('test_meta_data.json')