In [18]:
from __future__ import division

import math
import os
import time
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.graph_objs as go
import plotly.offline as ply
import plotly.tools as tls
import seaborn as sns
import xgboost
from imblearn.over_sampling import SMOTE
from scipy.stats import pearsonr
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import r2_score, explained_variance_score, mean_squared_error
from sklearn.model_selection import train_test_split

In [19]:
# Load the housing sales CSV (the path is relative to the working directory)
# and preview its first five rows.
# NOTE(review): the cells visible below operate on `attrition`, not `housing`.
housing = pd.read_csv(filepath_or_buffer='../datasets/housing_sales.csv')
housing.head(n=5)

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,7129300520,20141013T000000,221900.0,3,1.0,1180,5650,1.0,0,0,...,7,1180,0,1955,0,98178,47.5112,-122.257,1340,5650
1,6414100192,20141209T000000,538000.0,3,2.25,2570,7242,2.0,0,0,...,7,2170,400,1951,1991,98125,47.721,-122.319,1690,7639
2,5631500400,20150225T000000,180000.0,2,1.0,770,10000,1.0,0,0,...,6,770,0,1933,0,98028,47.7379,-122.233,2720,8062
3,2487200875,20141209T000000,604000.0,4,3.0,1960,5000,1.0,0,0,...,7,1050,910,1965,0,98136,47.5208,-122.393,1360,5000
4,1954400510,20150218T000000,510000.0,3,2.0,1680,8080,1.0,0,0,...,8,1680,0,1987,0,98074,47.6168,-122.045,1800,7503


In [4]:
# Show the dtype of every column in the attrition frame.
# NOTE(review): `attrition` is not assigned in any cell visible above (the only
# frame loaded so far is `housing`), and this cell's execution count (4) comes
# before the import/load cells (18, 19). Unless the attrition dataset is loaded
# somewhere not shown here, "Restart Kernel -> Run All" raises NameError at this
# line — TODO add the load cell ahead of this one.
attrition.dtypes

Age                          int64
Attrition                   object
BusinessTravel              object
DailyRate                    int64
Department                  object
DistanceFromHome             int64
Education                    int64
EducationField              object
EmployeeCount                int64
EmployeeNumber               int64
EnvironmentSatisfaction      int64
Gender                      object
HourlyRate                   int64
JobInvolvement               int64
JobLevel                     int64
JobRole                     object
JobSatisfaction              int64
MaritalStatus               object
MonthlyIncome                int64
MonthlyRate                  int64
NumCompaniesWorked           int64
Over18                      object
OverTime                    object
PercentSalaryHike            int64
PerformanceRating            int64
RelationshipSatisfaction     int64
StandardHours                int64
StockOptionLevel             int64
TotalWorkingYears   

In [5]:
# Split the column names by dtype: columns stored as 'object' are treated as
# categorical features, every remaining column as numerical.
categorical = [name for name, column in attrition.items() if column.dtypes == 'object']

# Index.difference yields the columns that are not in `categorical`.
numerical = attrition.columns.difference(categorical)

In [6]:
# One-hot encode the categorical features. 'Attrition' is removed first so it
# does not end up in the encoded feature matrix.
attrition_categorical = pd.get_dummies(
    attrition[categorical].drop(columns='Attrition')
)

In [7]:
# Keep only the numerical columns, in the order given by `numerical`.
attrition_numerical = attrition.loc[:, numerical]

In [8]:
# Assemble the feature matrix: encoded categorical columns first, numerical
# columns after them, aligned on the row index.
attrition_final = pd.concat(
    objs=[attrition_categorical, attrition_numerical],
    axis='columns',
)

In [9]:
# Encode the 'Attrition' label as a binary target: 'Yes' -> 1, 'No' -> 0.
target_map = {'Yes': 1, 'No': 0}

# Series.map performs the dictionary lookup in one vectorised call instead of
# invoking a Python lambda per row.
target = attrition['Attrition'].map(target_map)

# map() turns labels missing from target_map into NaN rather than raising, so
# check explicitly and fail with KeyError, as the per-row lookup used to do.
if target.isna().any():
    raise KeyError(
        "Unexpected 'Attrition' labels: {}".format(
            attrition.loc[target.isna(), 'Attrition'].unique().tolist()
        )
    )

In [10]:
# Use 75% of the rows for training and the rest for evaluation; the fixed seed
# makes the split repeatable.
# NOTE(review): no `stratify` argument is passed — consider stratify=target if
# the two classes turn out to be imbalanced.
X_train, X_test, y_train, y_test = train_test_split(
    attrition_final,
    target,
    train_size=0.75,
    random_state=0,
)

In [11]:
# Build the gradient boosting classifier, overriding only the random seed,
# then display the hyper-parameters it was constructed with.
gb_classifier = GradientBoostingClassifier(random_state=100)
gb_classifier.get_params(deep=True)

{'ccp_alpha': 0.0,
 'criterion': 'friedman_mse',
 'init': None,
 'learning_rate': 0.1,
 'loss': 'log_loss',
 'max_depth': 3,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_iter_no_change': None,
 'random_state': 100,
 'subsample': 1.0,
 'tol': 0.0001,
 'validation_fraction': 0.1,
 'verbose': 0,
 'warm_start': False}

In [12]:
# Train on the training split, then score the held-out rows twice: once as
# class labels and once as per-class probabilities.
gb_classifier.fit(X=X_train, y=y_train)
gb_prediction, gb_prediction_prob = (
    gb_classifier.predict(X_test),
    gb_classifier.predict_proba(X_test),
)

In [13]:
# Fraction of held-out rows whose predicted label matches the true label.
accuracy_score(y_true=y_test, y_pred=gb_prediction)

0.8858695652173914

In [14]:
# Display the fitted classifier's importance score for each feature.
# NOTE(review): the array is shown without labels; presumably the entries follow
# the column order of X_train, so pairing them with X_train.columns would make
# the output readable — TODO confirm.
gb_classifier.feature_importances_

array([0.00119296, 0.02512782, 0.        , 0.        , 0.00334557,
       0.00150605, 0.0042737 , 0.0005249 , 0.00822779, 0.00420823,
       0.0016531 , 0.00431142, 0.00143818, 0.00200648, 0.00121754,
       0.00014525, 0.01096076, 0.        , 0.00334464, 0.        ,
       0.00532146, 0.01111343, 0.01076333, 0.00311429, 0.00155062,
       0.01744231, 0.        , 0.03836406, 0.06523565, 0.06658664,
       0.05641415, 0.03057373, 0.00399534, 0.        , 0.03934346,
       0.0343927 , 0.01652097, 0.03338705, 0.03211924, 0.02440121,
       0.10493303, 0.02377928, 0.03068504, 0.02042341, 0.        ,
       0.01282365, 0.        , 0.04186334, 0.04691095, 0.00821812,
       0.02921662, 0.02401328, 0.01001332, 0.02497884, 0.05801712])