Here we are using dataset that contains the information about individuals from various countries. Our target is to predict whether a person makes <=50k or >50k annually on basis of the other information available. 

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder,OneHotEncoder
from sklearn.model_selection import train_test_split

In [2]:
df = pd.read_csv('adult.csv')
print(df.shape)
print(df.columns)

(32560, 15)
Index(['39', ' State-gov', ' 77516', ' Bachelors', ' 13', ' Never-married',
       ' Adm-clerical', ' Not-in-family', ' White', ' Male', ' 2174', ' 0',
       ' 40', ' United-States', ' <=50K'],
      dtype='object')


In [3]:
print(df.head(5))

   39          State-gov   77516   Bachelors   13        Never-married  \
0  50   Self-emp-not-inc   83311   Bachelors   13   Married-civ-spouse   
1  38            Private  215646     HS-grad    9             Divorced   
2  53            Private  234721        11th    7   Married-civ-spouse   
3  28            Private  338409   Bachelors   13   Married-civ-spouse   
4  37            Private  284582     Masters   14   Married-civ-spouse   

         Adm-clerical   Not-in-family   White     Male   2174   0   40  \
0     Exec-managerial         Husband   White     Male      0   0   13   
1   Handlers-cleaners   Not-in-family   White     Male      0   0   40   
2   Handlers-cleaners         Husband   Black     Male      0   0   40   
3      Prof-specialty            Wife   Black   Female      0   0   40   
4     Exec-managerial            Wife   White   Female      0   0   40   

    United-States   <=50K  
0   United-States   <=50K  
1   United-States   <=50K  
2   United-States   <=50K 

In [4]:
df.columns = ['age','workclass','fnlwgt','education','education-num','marital_Status','occupation','relationship','race',
              'sex','capital_gain','capital_loss','hours_per_week','native_country','Income'] 
#print(df.head(5))

In [5]:
print(df.isnull().sum())

age               0
workclass         0
fnlwgt            0
education         0
education-num     0
marital_Status    0
occupation        0
relationship      0
race              0
sex               0
capital_gain      0
capital_loss      0
hours_per_week    0
native_country    0
Income            0
dtype: int64


In [6]:
cols = ['age','workclass','fnlwgt','education','education-num','marital_Status','occupation','relationship','race',
              'sex','capital_gain','capital_loss','hours_per_week','native_country','Income']
cols_categorical = ['age','workclass','education','education-num','marital_Status','occupation','relationship','race',
              'sex','capital_gain','capital_loss','hours_per_week','native_country']
cols_drop = ['Income']

In [7]:
target_series = df['Income']
df.drop(cols_drop,1,inplace=True)
print(type(df))
print(type(target_series))

<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.series.Series'>


In [8]:
# print(pd.value_counts(target_series))
le = LabelEncoder()
le.fit(target_series)
target_series = le.transform(target_series)
print(pd.value_counts(target_series))

0    24719
1     7841
dtype: int64


In [9]:
#One Hot Encoding of the Categorical features 
one_hot_workclass=pd.get_dummies(df['workclass']) 
one_hot_education=pd.get_dummies(df['education']) 
one_hot_marital_Status=pd.get_dummies(df['marital_Status']) 
one_hot_occupation=pd.get_dummies(df['occupation'])
one_hot_relationship=pd.get_dummies(df['relationship']) 
one_hot_race=pd.get_dummies(df['race']) 
one_hot_sex=pd.get_dummies(df['sex']) 
one_hot_native_country=pd.get_dummies(df['native_country']) 

In [10]:
#removing categorical features 
df.drop(['workclass','education','marital_Status','occupation','relationship','race','sex','native_country']
          ,axis=1,inplace=True) 

In [11]:
#Merging one hot encoded features with our dataset 'data' 
df=pd.concat([df,one_hot_workclass,one_hot_education,one_hot_marital_Status,
                one_hot_occupation,one_hot_relationship,one_hot_race,one_hot_sex,one_hot_native_country],axis=1) 

In [12]:
print(df.head())

   age  fnlwgt  education-num  capital_gain  capital_loss  hours_per_week   ?  \
0   50   83311             13             0             0              13   0   
1   38  215646              9             0             0              40   0   
2   53  234721              7             0             0              40   0   
3   28  338409             13             0             0              40   0   
4   37  284582             14             0             0              40   0   

    Federal-gov   Local-gov   Never-worked     ...        Portugal  \
0             0           0              0     ...               0   
1             0           0              0     ...               0   
2             0           0              0     ...               0   
3             0           0              0     ...               0   
4             0           0              0     ...               0   

    Puerto-Rico   Scotland   South   Taiwan   Thailand   Trinadad&Tobago  \
0             0 

In [13]:
print(df.columns)
print(df.columns.unique())

Index(['age', 'fnlwgt', 'education-num', 'capital_gain', 'capital_loss',
       'hours_per_week', ' ?', ' Federal-gov', ' Local-gov', ' Never-worked',
       ...
       ' Portugal', ' Puerto-Rico', ' Scotland', ' South', ' Taiwan',
       ' Thailand', ' Trinadad&Tobago', ' United-States', ' Vietnam',
       ' Yugoslavia'],
      dtype='object', length=108)
Index(['age', 'fnlwgt', 'education-num', 'capital_gain', 'capital_loss',
       'hours_per_week', ' ?', ' Federal-gov', ' Local-gov', ' Never-worked',
       ...
       ' Portugal', ' Puerto-Rico', ' Scotland', ' South', ' Taiwan',
       ' Thailand', ' Trinadad&Tobago', ' United-States', ' Vietnam',
       ' Yugoslavia'],
      dtype='object', length=106)


In [14]:
#removing dulpicate columns 
_, i = np.unique(df.columns, return_index=True) 
df = df.iloc[:, i] 

In [15]:
X = df
y = target_series

In [16]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=.3)

# Applying xgboost 

In [17]:
import xgboost as xgb



In [19]:
#The data is stored in a DMatrix object 
#label is used to define our outcome variable
dtrain=xgb.DMatrix(X_train,label=y_train)
dtest=xgb.DMatrix(X_test)

In [21]:
#setting parameters for xgboost
parameters={'max_depth':7, 
            'eta':1, 
            'silent':1,
            'objective':'binary:logistic',
            'eval_metric':'auc',
            'learning_rate':.05
           }

In [23]:
#training our model 
num_round=50
from datetime import datetime 
start = datetime.now() 
xg=xgb.train(parameters,dtrain,num_round) 
stop = datetime.now()

In [25]:
#Execution time of the model 
execution_time_xgb = stop-start 
execution_time_xgb

datetime.timedelta(0, 2, 705658)

In [27]:
#datetime.timedelta( , , ) representation => (days , seconds , microseconds) 
#now predicting our model on test set 
ypred=xg.predict(dtest) 
ypred

array([ 0.0429053 ,  0.04668319,  0.09762883, ...,  0.95759577,
        0.18473722,  0.75826865], dtype=float32)

In [33]:
#Converting probabilities into 1 or 0  
for i in range(0,9768): 
    if ypred[i]>=0.5:       # setting threshold to .5 
       ypred[i]=1 
    else: 
       ypred[i]=0  

In [36]:
#calculating accuracy of our model 
from sklearn.metrics import accuracy_score 
accuracy_xgb = accuracy_score(y_test,ypred) 
accuracy_xgb

0.86128173628173632

# Applying lightgbm 

In [38]:
import lightgbm as lgb

In [40]:
train_data = lgb.Dataset(X_train,label=y_train)

In [41]:
#setting parameters for lightgbm
param = {'num_leaves':150,
         'objective':'binary',
         'max_depth':7,
         'learning_rate':.05,
         'max_bin':200
        }
param['metric'] = ['auc', 'binary_logloss']

In [44]:
#Here we have set max_depth in xgb and LightGBM to 7 to have a fair comparison between the two.

In [46]:
#training our model using light gbm
num_round=50
start=datetime.now()
lgbm=lgb.train(param,train_data,num_round)
stop=datetime.now()

In [48]:
#Execution time of the model
execution_time_lgbm = stop-start
execution_time_lgbm

datetime.timedelta(0, 0, 428140)

In [51]:
#predicting on test set
ypred2=lgbm.predict(X_test)
ypred2[0:5]  # showing first 5 predictions

array([ 0.04152566,  0.04773206,  0.09749417,  0.570988  ,  0.04152566])

In [54]:
#converting probabilities into 0 or 1
for i in range(0,9768):
    if ypred2[i]>=.5:       # setting threshold to .5
       ypred2[i]=1
    else:  
       ypred2[i]=0

In [66]:
#calculating accuracy
accuracy_lgbm = accuracy_score(ypred2,y_test)
accuracy_lgbm

0.86066748566748563

In [58]:
from sklearn.metrics import roc_auc_score

In [60]:
#calculating roc_auc_score for xgboost
auc_xgb =  roc_auc_score(y_test,ypred)
auc_xgb

0.76309181363764977

In [70]:
#calculating roc_auc_score for light gbm. 
auc_lgbm = roc_auc_score(y_test,ypred2)
auc_lgbm 

0.76212011839900173

In [71]:
comparison_dict = {
                    'accuracy score':(accuracy_lgbm,accuracy_xgb),
                    'auc score':(auc_lgbm,auc_xgb),
                    'execution time':(execution_time_lgbm,execution_time_xgb)
                  }

In [72]:
#Creating a dataframe ‘comparison_df’ for comparing the performance of Lightgbm and xgb. 
comparison_df = pd.DataFrame(comparison_dict) 
comparison_df.index= ['LightGBM','xgboost'] 
comparison_df

Unnamed: 0,accuracy score,auc score,execution time
LightGBM,0.860667,0.76212,00:00:00.428140
xgboost,0.861282,0.763092,00:00:02.705658
