In [None]:
!python -m pip install xgboost --user --upgrade pip
!pip install lightgbm

Collecting xgboost
[?25l  Downloading https://files.pythonhosted.org/packages/05/45/978cd7e30860a5e2852500d55b7598d8f0806927ff30c86d01d01b28c43b/xgboost-1.4.0-py3-none-manylinux2010_x86_64.whl (166.7MB)
[K     |████████████████████████████████| 166.7MB 78kB/s 
[?25hCollecting pip
[?25l  Downloading https://files.pythonhosted.org/packages/fe/ef/60d7ba03b5c442309ef42e7d69959f73aacccd0d86008362a681c4698e83/pip-21.0.1-py3-none-any.whl (1.5MB)
[K     |████████████████████████████████| 1.5MB 17.1MB/s 
Installing collected packages: xgboost, pip
Successfully installed pip-21.0.1 xgboost-1.4.0
Please see https://github.com/pypa/pip/issues/5599 for advice on fixing the underlying issue.
To avoid this problem you can invoke Python with '-m pip' instead of running pip directly.


In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.mixture import GaussianMixture
from sklearn.model_selection import train_test_split, RandomizedSearchCV, StratifiedKFold, GridSearchCV, cross_val_score
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
import time
from itertools import combinations
import xgboost
import lightgbm as lgb
from imblearn.over_sampling import SMOTE, ADASYN, BorderlineSMOTE, SVMSMOTE
from imblearn import combine

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
train_data = pd.read_csv("/content/drive/MyDrive/univ.ai/Training Data.csv")
test_data = pd.read_csv("/content/drive/MyDrive/univ.ai/Test Data.csv")
train_data.head()

for column in ["profession","city","state"]:
    test_data[column] = test_data[column].apply(lambda x : "_".join(x.strip().split(" ")))
test_data.head()

woe_state = pd.DataFrame(train_data.groupby('state')['risk_flag'].mean())
woe_state['woe_state'] = np.log(woe_state['risk_flag']/(1-woe_state['risk_flag']))
train_data['woe_state'] = train_data['state'].map(woe_state['woe_state'])
test_data['woe_state'] = test_data['state'].map(woe_state['woe_state'])

woe_profession = pd.DataFrame(train_data.groupby('profession')['risk_flag'].mean())
woe_profession['woe_profession'] = np.log(woe_profession['risk_flag']/(1-woe_profession['risk_flag']))
train_data['woe_profession'] = train_data['profession'].map(woe_profession['woe_profession'])
test_data['woe_profession'] = test_data['profession'].map(woe_profession['woe_profession'])

woe_city = pd.DataFrame(train_data.groupby('city')['risk_flag'].mean())
woe_city['woe_city'] = np.log(woe_city['risk_flag']/(1-woe_city['risk_flag']))
train_data['woe_city'] = train_data['city'].map(woe_city['woe_city'])
test_data['woe_city'] = test_data['city'].map(woe_city['woe_city'])
"""
woe_car = pd.DataFrame(train_data.groupby('car_ownership')['risk_flag'].mean())
woe_car['woe_car'] = np.log(woe_car['risk_flag']/(1-woe_car['risk_flag']))
train_data['woe_car'] = train_data['car_ownership'].map(woe_car['woe_car'])
test_data['woe_car'] = test_data['car_ownership'].map(woe_car['woe_car'])

woe_married = pd.DataFrame(train_data.groupby('married')['risk_flag'].mean())
woe_married['woe_married'] = np.log(woe_married['risk_flag']/(1-woe_married['risk_flag']))
train_data['woe_married'] = train_data['married'].map(woe_married['woe_married'])
test_data['woe_married'] = test_data['married'].map(woe_married['woe_married'])

woe_house = pd.DataFrame(train_data.groupby('house_ownership')['risk_flag'].mean())
woe_house['woe_house'] = np.log(woe_house['risk_flag']/(1-woe_house['risk_flag']))
train_data['woe_house'] = train_data['house_ownership'].map(woe_house['woe_house'])
test_data['woe_house'] = test_data['house_ownership'].map(woe_house['woe_house'])
"""
train_data['car_ownership'] = train_data['car_ownership'].apply(lambda x : 0 if x=='yes' else 1)
test_data['car_ownership'] = test_data['car_ownership'].apply(lambda x : 0 if x=='yes' else 1)

train_data['married'] = train_data['married'].apply(lambda x : 0 if x=='married' else 1)
test_data['married'] = test_data['married'].apply(lambda x : 0 if x=='married' else 1)

def house(x):
    if x == 'owned' : return 0 
    elif x == "norent_noown" : return 1
    else : return 2

train_data['house_ownership'] = train_data['house_ownership'].apply(lambda x : house(x))
test_data['house_ownership'] = test_data['house_ownership'].apply(lambda x : house(x))

train_data['income_by_city'] = train_data['income']/train_data['woe_city']
test_data['income_by_city'] = test_data['income']/test_data['woe_city']

train_data['job_stability'] = train_data['experience']/train_data['current_job_years']
test_data['job_stability'] = test_data['experience']/test_data['current_job_years']
"""
#train_data.drop(['experience','current_job_years'],axis=1,inplace=True)
#test_data.drop(['experience','current_job_years'],axis=1,inplace=True)
"""
train_data['woe_location'] = train_data['woe_state']+train_data['woe_city']
test_data['woe_location'] = test_data['woe_state']+test_data['woe_city']
#train_data.drop(['woe_state','woe_city'],axis=1,inplace=True)
#test_data.drop(['woe_state','woe_city'],axis=1,inplace=True)

train_data.drop(['profession','city','state'],axis=1,inplace=True)
test_data.drop(['profession','city','state'],axis=1,inplace=True)

train_data.set_index('Id',inplace=True)
test_data.set_index('id',inplace=True)

xtrain=train_data.drop("risk_flag",axis=1)
ytrain=train_data["risk_flag"]
public_test = test_data.copy()

# train_data.head()

scaler = StandardScaler()
for column in xtrain.columns:
    xtrain[column] = scaler.fit_transform(xtrain[column].to_numpy().reshape(-1,1))
    public_test[column] = scaler.transform(public_test[column].to_numpy().reshape(-1,1))

X_trainval, X_holdouttest, y_trainval, y_holdouttest = train_test_split(xtrain, ytrain, stratify=ytrain.values, test_size=0.05)

xtrain.head()

Unnamed: 0_level_0,income,age,experience,married,house_ownership,car_ownership,current_job_years,current_house_years,woe_state,woe_profession,woe_city,income_by_city,job_stability,woe_location
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
1,-1.283145,-1.579603,-1.180232,0.3372,0.281684,0.657129,-0.914131,0.716356,1.645552,-0.664572,0.138228,1.154686,-0.652134,0.612707
2,0.895457,-0.583343,-0.014067,0.3372,0.281684,0.657129,0.731036,0.716356,-0.500791,1.475106,0.156994,-0.751646,-0.554145,-0.017533
3,-0.349269,0.940347,-1.013637,-2.965599,0.281684,0.657129,-0.639936,-1.427981,2.212779,0.707728,0.317321,0.288362,-0.652134,0.935461
4,0.437526,-0.52474,-1.346827,0.3372,0.281684,-1.521772,-1.188325,0.001577,1.083532,1.475106,2.807817,-3.608672,-0.652134,2.70905
5,0.268128,-0.173119,0.152528,0.3372,0.281684,0.657129,-0.914131,1.431135,-1.129591,-0.390227,0.129897,-0.191985,1.699587,-0.229831


In [10]:
num_round = 300
lgb_params = {"objective":"binary", "metric":"auc", "scale_pos_weight":(len(y_trainval)-(sum(y_trainval)))/sum(y_trainval), "max_depth":25}
train_data = lgb.Dataset(X_trainval, label=y_trainval)
lgb_model = lgb.train(lgb_params, train_set = train_data, num_boost_round = num_round) # , valid_sets=[validation_data]
for threshold in range(1,10):
  y_trainval_preds = (lgb_model.predict(X_trainval) > threshold/10)*1
  y_holdouttest_preds = (lgb_model.predict(X_holdouttest) > threshold/10)*1
  print(threshold/10,roc_auc_score(y_trainval,y_trainval_preds), roc_auc_score(y_holdouttest,y_holdouttest_preds))

0.1 0.5069634300846853 0.50569551890235
0.2 0.5533997923354639 0.5470398481973435
0.3 0.6388075581546271 0.6250109473069625
0.4 0.7405672100560341 0.7186498321412933
0.5 0.8246544005621318 0.790637863085681
0.6 0.8613944790955916 0.8249788352065391
0.7 0.8462228779440882 0.8232666763976062
0.8 0.7535150047269971 0.7273507517150781
0.9 0.5834393901005104 0.5719500802802511


In [None]:
xtrain.corr()

Unnamed: 0,income,age,experience,married,house_ownership,car_ownership,current_job_years,current_house_years,woe_state,woe_profession,woe_city
income,1.0,-0.000652,0.006422,0.002682,0.006412,-0.004068,0.007045,-0.002397,0.01191,-0.015738,0.013843
age,-0.000652,1.0,-0.001118,-0.005324,-0.017551,-0.009395,0.002154,-0.020134,0.00766,-0.007255,-0.002466
experience,0.006422,-0.001118,1.0,-0.001752,-0.013346,-0.007519,0.646098,0.019309,-0.004069,0.00704,-0.020091
married,0.002682,-0.005324,-0.001752,1.0,-0.026208,-0.001206,0.004251,-0.007539,0.00243,0.00511,0.001871
house_ownership,0.006412,-0.017551,-0.013346,-0.026208,1.0,0.002167,-0.00939,-0.013786,0.015055,-0.009107,0.01039
car_ownership,-0.004068,-0.009395,-0.007519,-0.001206,0.002167,1.0,-0.011099,-0.000251,0.000477,0.014963,0.010435
current_job_years,0.007045,0.002154,0.646098,0.004251,-0.00939,-0.011099,1.0,0.005372,-0.003203,0.001004,-0.006963
current_house_years,-0.002397,-0.020134,0.019309,-0.007539,-0.013786,-0.000251,0.005372,1.0,0.007839,-0.01332,0.01945
woe_state,0.01191,0.00766,-0.004069,0.00243,0.015055,0.000477,-0.003203,0.007839,1.0,0.00656,0.370199
woe_profession,-0.015738,-0.007255,0.00704,0.00511,-0.009107,0.014963,0.001004,-0.01332,0.00656,1.0,0.026245


In [None]:
c = xtrain.corr().abs()
s = c.unstack()
so = s.sort_values(kind="quicksort")
pd.set_option('display.max_rows', 500)
print(so[5890:-78])

woe_city*income_by_city                woe_city*income_by_city                  False
married*woe_city                       married*woe_city                         False
house_ownership*woe_city               house_ownership*woe_city                 False
house_ownership*woe_profession         house_ownership*woe_profession           False
house_ownership*woe_state              house_ownership*woe_state                False
                                                                                ...  
age                                    woe_state*woe_city                        True
current_house_years*woe_profession     income*current_job_years                  True
income*current_job_years               current_house_years*woe_profession        True
income*woe_profession                  current_job_years*current_house_years     True
current_job_years*current_house_years  income*woe_profession                     True
Length: 6084, dtype: bool

In [None]:
"""
poly = PolynomialFeatures(degree=2, interaction_only=True)
xtrain_poly = poly.fit_transform(xtrain)
X_trainval_poly, X_holdouttest_poly, y_trainval_poly, y_holdouttest_poly = train_test_split(xtrain_poly,ytrain,test_size=0.05)
public_test_poly = poly.transform(public_test)"""
X_trainval_pt1, X_trainval_pt2, y_trainval_pt1, y_trainval_pt2 = train_test_split(X_trainval, y_trainval, stratify = y_trainval, test_size = 0.5)
X_trainval_pt1.shape, y_trainval_pt1.shape

((119700, 11), (119700,))

In [None]:
#sm = SMOTE(random_state=69)
#X_new, y_new = sm.fit_sample(X_trainval,y_trainval)

#smotomek = combine.SMOTETomek(random_state=69,ratio)
#X_new, y_new= smotomek.fit_resample(X_trainval, y_trainval)

#X_new, y_new = ADASYN().fit_resample(X_trainval,y_trainval)
#X_new, y_new = BorderlineSMOTE().fit_resample(X_trainval,y_trainval) 0.9220853139259075 0.7420274412494525
X_new, y_new = SVMSMOTE().fit_resample(X_trainval_pt1,y_trainval_pt1)

In [None]:
#bal_x.shape, X_trainval.shape, X_new.shape
sum(y_new)/len(y_new), X_new.shape

In [11]:
n_estimators=150
gamma = 5 #0.5,1,3,5,10,20,100]:
max_depth = 30
min_child_weight = 5
subsample = 0.85 # [0.8,0.85,0.9,0.95,1]:
maj_to_min = 3*(len(y_trainval)-(sum(y_trainval)))/sum(y_trainval) # 1.0 # /sum(bal_y)/len(bal_y) #
colsample_bytree = 1.0
model = xgboost.sklearn.XGBClassifier(objective="binary:logistic", verbosity = 1, num_boost_rounds = 3000, booster='dart', reg_alpha=0.5, reg_lambda=0.5, learning_rate=0.01 , seed = 10, colsample_bytree = colsample_bytree, subsample=subsample, n_estimators=n_estimators, scale_pos_weight=maj_to_min, max_depth=max_depth, gamma=gamma, min_child_weight=min_child_weight)
model.fit(X_trainval, y_trainval)
for threshold in range(1,10):
  print(threshold/10, roc_auc_score(y_trainval,((model.predict_proba(X_trainval)[:,1])>threshold/10)*1 ), roc_auc_score(y_holdouttest,((model.predict_proba(X_holdouttest)[:,1])>threshold/10)*1 ))
#model.fit(X_new, y_new)
#print(roc_auc_score(y_trainval,model.predict(X_trainval)), roc_auc_score(y_holdouttest,model.predict(X_holdouttest)))
#print(roc_auc_score(y_new,model.predict(X_new)), roc_auc_score(y_holdouttest,model.predict(X_holdouttest.values)))
#0.9369266601255513 0.8595037220843673 public test 0.8680416597704652

0.1 0.5 0.5
0.2 0.892064452213342 0.8911020289008904
0.3 0.9057650723491812 0.8890249598598744
0.4 0.9280675767072788 0.8722186542110641
0.5 0.9462453680329976 0.8547044227120129
0.6 0.9544400201948998 0.8417311341410013
0.7 0.9568321619352707 0.8374719019121297
0.8 0.9005014312877329 0.8341716537731718
0.9 0.5 0.5


In [13]:
for threshold in range(12,30,2):
  print(threshold/100, roc_auc_score(y_trainval,((model.predict_proba(X_trainval)[:,1])>threshold/100)*1 ), roc_auc_score(y_holdouttest,((model.predict_proba(X_holdouttest)[:,1])>threshold/100)*1 ))

0.12 0.6048301056421883 0.5864705882352941
0.14 0.8512769463787306 0.8443891402714931
0.16 0.8814168817931547 0.88148737410597
0.18 0.8881516903702715 0.8880878703838855
0.2 0.892064452213342 0.8911020289008904
0.22 0.8939315278584833 0.8924025689680339
0.24 0.8962677538889471 0.8932958692161728
0.26 0.8990516970383988 0.8910991096190337
0.28 0.902135705916534 0.8912844840169318


In [14]:
for threshold in range(22,26,1):
  print(threshold/100, roc_auc_score(y_trainval,((model.predict_proba(X_trainval)[:,1])>threshold/100)*1 ), roc_auc_score(y_holdouttest,((model.predict_proba(X_holdouttest)[:,1])>threshold/100)*1 ))

0.22 0.8939315278584833 0.8924025689680339
0.23 0.8950317688636559 0.8933075463435995
0.24 0.8962677538889471 0.8932958692161728
0.25 0.8975442239728703 0.8918975332068313


In [15]:
for threshold in range(220,240,2):
  print(threshold/1000, roc_auc_score(y_trainval,((model.predict_proba(X_trainval)[:,1])>threshold/1000)*1 ), roc_auc_score(y_holdouttest,((model.predict_proba(X_holdouttest)[:,1])>threshold/1000)*1 ))

0.22 0.8939315278584833 0.8924025689680339
0.222 0.8941553864179772 0.8926740621807036
0.224 0.8942958933861703 0.8927645599182601
0.226 0.894557855530259 0.8930360531309298
0.228 0.8948293435704964 0.8930813019997081
0.23 0.8950317688636559 0.8933075463435995
0.232 0.895200853520295 0.8933527952123778
0.234 0.8954866304047554 0.893533790687491
0.236 0.8956533335873572 0.8936242884250475
0.238 0.8960319879592672 0.8938052839001607


In [16]:
for threshold in range(2380,2400,2):
  print(threshold/10000, roc_auc_score(y_trainval,((model.predict_proba(X_trainval)[:,1])>threshold/10000)*1 ), roc_auc_score(y_holdouttest,((model.predict_proba(X_holdouttest)[:,1])>threshold/10000)*1 ))

0.238 0.8960319879592672 0.8938052839001607
0.2382 0.8960510397515646 0.8938052839001607
0.2384 0.896070091543862 0.8938052839001607
0.2386 0.8960891433361593 0.8938052839001607
0.2388 0.8961105766024939 0.8938052839001607
0.239 0.896141535764977 0.8934827032549992
0.2392 0.8961653505053488 0.8934827032549992
0.2394 0.8961867837716833 0.8935279521237776
0.2396 0.8962082170380178 0.8936184498613341
0.2398 0.8962439391485755 0.8932958692161728


In [4]:
n_estimators=200
gamma = 5 #0.5,1,3,5,10,20,100]:
max_depth = 30
min_child_weight = 5
subsample = 0.85 # [0.8,0.85,0.9,0.95,1]:
maj_to_min = 3*(len(y_trainval)-(sum(y_trainval)))/sum(y_trainval) # 1.0 # /sum(bal_y)/len(bal_y) #
colsample_bytree = 1.0
model_3 = xgboost.sklearn.XGBClassifier(objective="binary:logistic", verbosity = 1, num_boost_rounds = 3000, booster='dart', reg_alpha=0.5, reg_lambda=0.5, learning_rate=0.01 , seed = 69, colsample_bytree = colsample_bytree, subsample=subsample, n_estimators=n_estimators, scale_pos_weight=maj_to_min, max_depth=max_depth, gamma=gamma, min_child_weight=min_child_weight)
model_3.fit(X_trainval, y_trainval, xgb_model="/content/drive/MyDrive/univ.ai/model_best")
for threshold in range(1,10):
  print(threshold/10, roc_auc_score(y_trainval,((model_3.predict_proba(X_trainval)[:,1])>threshold/10)*1 ), roc_auc_score(y_holdouttest,((model_3.predict_proba(X_holdouttest)[:,1])>threshold/10)*1 ))

0.1 0.8947340846090096 0.8916742081447964
0.2 0.9106518570734542 0.9042417165377318
0.3 0.9313611553006849 0.9201401255291198
0.4 0.9453999447498023 0.9341950080280251
0.5 0.9528872991226649 0.9427689388410452
0.6 0.9559141526239081 0.9409093562983507
0.7 0.9569622132570762 0.9124361407093855
0.8 0.9532849729364813 0.860251058239673
0.9 0.9061600743442008 0.8376689534374544


In [7]:
for threshold in range(40,60,2):
  print(threshold/100, roc_auc_score(y_trainval,((model_3.predict_proba(X_trainval)[:,1])>threshold/100)*1 ), roc_auc_score(y_holdouttest,((model_3.predict_proba(X_holdouttest)[:,1])>threshold/100)*1 ))

0.4 0.9454571001266945 0.934014012552912
0.42 0.9473360831420216 0.9368646912859436
0.44 0.949119807195862 0.9385841482995183
0.46 0.9505486916181639 0.9399810246679318
0.48 0.9518680282347562 0.9414625602101883
0.5 0.9528825361745906 0.9425368559334403
0.52 0.9537112891395259 0.9432944095752445
0.54 0.9544066795583794 0.9437293825718873
0.56 0.9550306257561181 0.9430214567216465
0.58 0.9555331167779608 0.9428623558604584


In [5]:
n_estimators=200
gamma = 5 #0.5,1,3,5,10,20,100]:
max_depth = 30
min_child_weight = 5
subsample = 0.85 # [0.8,0.85,0.9,0.95,1]:
maj_to_min = 3*(len(y_trainval)-(sum(y_trainval)))/sum(y_trainval) # 1.0 # /sum(bal_y)/len(bal_y) #
colsample_bytree = 1.0
model_3 = xgboost.sklearn.XGBClassifier(objective="binary:logistic", verbosity = 1, num_boost_rounds = 3000, booster='dart', reg_alpha=0.5, reg_lambda=0.5, learning_rate=0.01 , seed = 69, colsample_bytree = colsample_bytree, subsample=subsample, n_estimators=n_estimators, scale_pos_weight=maj_to_min, max_depth=max_depth, gamma=gamma, min_child_weight=min_child_weight)
model_3.load_model("/content/drive/MyDrive/univ.ai/model_3")

In [9]:
3/maj_to_min

0.14024976899701838

In [10]:
roc_auc_score(y_trainval,((model_3.predict_proba(X_trainval)[:,1])>3/maj_to_min)), roc_auc_score(y_holdouttest,((model_3.predict_proba(X_holdouttest)[:,1])>3/maj_to_min ))

(0.8993493812930451, 0.9017647058823529)

In [11]:
y_pred_public = ((model_3.predict_proba(public_test)[:,1])>3/maj_to_min)*1
pd.DataFrame({"risk_flag" : y_pred_public}, index=public_test.index).to_csv("prediction_v19.csv")

In [6]:
model_3.save_model("/content/drive/MyDrive/univ.ai/model_3")

In [12]:
model.save_model("/content/drive/MyDrive/univ.ai/model_1")

In [14]:
n_estimators=100
gamma = 7.5 #0.5,1,3,5,10,20,100]:
max_depth = 25
min_child_weight = 5
subsample = 0.85 # [0.8,0.85,0.9,0.95,1]:
maj_to_min = (len(y_trainval)-(sum(y_trainval)))/sum(y_trainval) # 1.0 # /sum(bal_y)/len(bal_y) #
colsample_bytree = 1.0
model_4 = xgboost.sklearn.XGBClassifier(objective="binary:logistic", verbosity = 1, num_boost_rounds = 2000, booster='dart', reg_alpha=0.5, reg_lambda=0.5, learning_rate=0.01 , seed = 69, colsample_bytree = colsample_bytree, subsample=subsample, n_estimators=n_estimators, scale_pos_weight=maj_to_min, max_depth=max_depth, gamma=gamma, min_child_weight=min_child_weight)
model_4.fit(X_trainval, y_trainval)#, xgb_model="/content/drive/MyDrive/univ.ai/model_best")
for threshold in range(1,10):
  print(threshold/10, roc_auc_score(y_trainval,((model_4.predict_proba(X_trainval)[:,1])>threshold/10)*1 ), roc_auc_score(y_holdouttest,((model_4.predict_proba(X_holdouttest)[:,1])>threshold/10)*1 ))
model_4.save_model("/content/drive/MyDrive/univ.ai/model_4")

0.1 0.5 0.5
0.2 0.6145346123436563 0.6053846153846154
0.3 0.8981967478590549 0.8930229163625748
0.4 0.9274933315475744 0.8703284192088746
0.5 0.9509573917392657 0.8453846153846154
0.6 0.9319626422394369 0.8398890672894468
0.7 0.8319650600106128 0.8200554663552767
0.8 0.5 0.5
0.9 0.5 0.5


In [16]:
for threshold in range(20,40,1):
  print(threshold/100, roc_auc_score(y_trainval,((model_4.predict_proba(X_trainval)[:,1])>threshold/100)*1 ), roc_auc_score(y_holdouttest,((model_4.predict_proba(X_holdouttest)[:,1])>threshold/100)*1 ))

0.2 0.6145346123436563 0.6053846153846154
0.21 0.8005682197052688 0.7894511750109474
0.22 0.8620674052411481 0.8573697270471465
0.23 0.8799117902016632 0.8791285943657859
0.24 0.8861393448088629 0.8858254269449715
0.25 0.890018766015413 0.8902598160852431
0.26 0.8923740438381741 0.8933250620347395
0.27 0.8936790916105433 0.8940432053714786
0.28 0.8950603465521019 0.8929674500072982
0.29 0.8966392638387457 0.8924011093271055
0.3 0.8981967478590549 0.8930229163625748
0.31 0.9001328862512741 0.8912961611443585
0.32 0.9023405126837307 0.8895183184936506
0.33 0.9047981938900902 0.8868632316450151
0.34 0.9076511997866199 0.8853568822069771
0.35 0.9105947016965621 0.8841161874178952
0.36 0.9138763729197824 0.8797401839147571
0.37 0.917246158682378 0.8764224200846593
0.38 0.9208064623679473 0.8720916654502991
0.39 0.9241854641099453 0.8714333673916217


In [18]:
y_pred_public = ((model_4.predict_proba(public_test)[:,1])>0.25)*1
pd.DataFrame({"risk_flag" : y_pred_public}, index=public_test.index).to_csv("prediction_v22.csv")

In [15]:
y_pred_public = ((model_4.predict_proba(public_test)[:,1])>0.3)*1
pd.DataFrame({"risk_flag" : y_pred_public}, index=public_test.index).to_csv("prediction_v20.csv")

In [None]:
# model.save_model("model_1")
model_2 = xgboost.sklearn.XGBClassifier(objective="binary:logistic", verbosity = 1, num_boost_rounds = 3000, booster='gbtree', reg_alpha=0.5, reg_lambda=0.5, learning_rate=0.01 , seed = 69, colsample_bytree = colsample_bytree, subsample=subsample, n_estimators=n_estimators, scale_pos_weight=maj_to_min, max_depth=max_depth, gamma=gamma, min_child_weight=min_child_weight)
model_2.fit(X_trainval, y_trainval,xgb_model="/content/model_1")
for threshold in range(1,10):
  print(threshold/10, roc_auc_score(y_trainval,((model_2.predict_proba(X_trainval)[:,1])>threshold/10)*1 ), roc_auc_score(y_holdouttest,((model_2.predict_proba(X_holdouttest)[:,1])>threshold/10)*1 ))

0.1 0.8940005906055613 0.8879462852138374
0.2 0.906979624108138 0.8823792147131806


KeyboardInterrupt: ignored

In [None]:
for threshold in range(2,20,2):
  print(threshold/100, roc_auc_score(y_trainval,((model_2.predict_proba(X_trainval)[:,1])>threshold/100)*1 ), roc_auc_score(y_holdouttest,((model_2.predict_proba(X_holdouttest)[:,1])>threshold/100)*1 ))

0.02 0.5 0.5
0.04 0.5721753336445126 0.5615384615384615
0.06 0.8680663383407794 0.8588235294117648
0.08 0.8923740438381741 0.8871828930083199
0.1 0.8940005906055613 0.8879462852138374
0.12 0.8954604341903465 0.8882513501678587
0.14 0.8976490088305058 0.8875492628813312
0.16 0.900285300589653 0.8863202452196759
0.18 0.9034526610590892 0.8833367391621663


In [None]:
y_pred_public = (lgb_model.predict(public_test) > 0.2)*1
pd.DataFrame({"risk_flag" : y_pred_public}, index=public_test.index).to_csv("prediction_v13.csv")

with woe_location ->  
  With job_stability -> 0.9301708598649377 0.8427134724857686  
  without job_stability -> 0.9324822597200537 0.8534476718727194  
without woe_location ->
0.9351826717335341 0.8515457597431032  


1000 -> (0.9554997218736595, 0.9137673332360239)

In [None]:
roc_auc_score(y_holdouttest,model.predict(X_holdouttest))

0.5751525324770107

In [None]:
y_pred_public = model.predict(public_test)
pd.DataFrame({"risk_flag" : y_pred_public}, index=public_test.index).to_csv("prediction_v12.csv")

In [None]:
ags_0 = np.random.choice(np.argwhere(y_trainval.values==0)[:,0],size=len((np.argwhere(y_trainval.values==1))))
ags_1 = np.argwhere(y_trainval.values==1)[:,0]
ags = np.concatenate((ags_0,ags_1))
print(ags.shape)
X_balanced = xtrain.iloc[ags]
y_balanced = ytrain.iloc[ags]

In [None]:
!nvidia-smi

Mon Apr 12 13:45:36 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.67       Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla K80           Off  | 00000000:00:04.0 Off |                    0 |
| N/A   35C    P8    28W / 149W |      0MiB / 11441MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
import time 
maj_to_min = 1/(sum(y_trainval.values)/len(y_trainval.values))
#xg_train = xgb.DMatrix(X_trainval, y_trainval)
xgb = xgboost.sklearn.XGBClassifier(tree_method='gpu_hist', objective="binary:logistic", learning_rate=0.02, seed=69, n_estimators=75, scale_pos_weight=maj_to_min, max_depth=15, gamma=2, colsample_bytree=0.8, subsample=0.8, min_child_weight=5, n_gpus=1)
start_time=time.time()
xgb.fit(X_trainval, y_trainval)
print(time.time()-start_time)

n_gpus: 
	Deprecated. Single process multi-GPU training is no longer supported.
	Please switch to distributed training with one process per GPU.
	This can be done using Dask or Spark.  See documentation for details.
35.03085017204285


In [None]:
#DTClassifier= DecisionTreeClassifier(criterion='entropy', random_state=0)
#GBC = GradientBoostingClassifier(n_estimators=100)
#xgb = xgboost.sklearn.XGBClassifier(objective="binary:logistic", learning_rate=0.05, seed=9616, max_depth=20, gamma=10, n_estimators=500)
#rfc = RandomForestClassifier(criterion='entropy')
#gmm = GaussianMixture(n_components=11)
#model = xgb
import csv
xgb = xgboost.sklearn.XGBClassifier(objective="binary:logistic", tree_method= 'hist', learning_rate=0.03, seed=691)

params = {
        'min_child_weight': [1, 5, 10],
        'subsample': [0.6, 0.8, 1.0],
        'colsample_bytree': [0.6, 0.8, 1.0],
        'max_depth': [13, 15, 17],
        'alpha' : [0.01,0.03,0.1]
        }

folds = 3
param_comb = 3
skf = StratifiedKFold(n_splits=folds, shuffle = True, random_state = 1001)
random_search = GridSearchCV(xgb, param_grid=params, scoring='roc_auc', n_jobs=8, cv=skf.split(X_trainval,y_trainval), verbose=4) # n_iter=param_comb, random_state=69, param_distributions=params
random_search.fit(X_trainval, y_trainval)
pd.DataFrame(random_search.cv_results_).to_csv('/content/drive/MyDrive/univ.ai/results5.csv')
print('ROC AUC for {0} : {1}'.format(random_search.best_params_,roc_auc_score(y_holdouttest, random_search.predict(X_holdouttest))))

[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done   9 tasks      | elapsed:  3.6min
[Parallel(n_jobs=8)]: Done  82 tasks      | elapsed: 25.5min
[Parallel(n_jobs=8)]: Done 205 tasks      | elapsed: 62.4min
[Parallel(n_jobs=8)]: Done 376 tasks      | elapsed: 115.8min
[Parallel(n_jobs=8)]: Done 597 tasks      | elapsed: 183.6min
[Parallel(n_jobs=8)]: Done 729 out of 729 | elapsed: 223.9min finished


In [None]:
#3.6min finished 269.2968921661377

In [None]:
'ROC AUC for {0} : {1}'.format(random_search.best_params_,roc_auc_score(y_holdouttest, random_search.predict(X_holdouttest)))

"ROC AUC for {'alpha': 0.03, 'colsample_bytree': 0.6, 'max_depth': 17, 'min_child_weight': 1, 'subsample': 1.0} : 0.7120428786228336"

In [None]:
new_model = xgboost.sklearn.XGBClassifier(objective="binary:logistic", n_estimators=300, scale_pos_weight = 1/(sum(y_trainval.values)/len(y_trainval.values)), learning_rate=0.03, alpha=0.01, max_depth=17, min_child_weight=2, subsample=0.8, gamma=5, colsample_bytree=0.6)
new_model.fit(X_balanced, y_balanced)
roc_auc_score(y_holdouttest,new_model.predict(X_holdouttest))

0.8609713605510564

In [None]:
0.8430542133383581

Unnamed: 0_level_0,income,age,experience,current_job_years,current_house_years,woe_state,woe_profession,woe_city
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
7146,0.14468,0.178502,1.318694,1.55362,0.001577,-0.237796,0.750491,0.253327
226637,0.815763,-0.934964,1.485289,1.005231,0.001577,0.342734,0.80073,0.102336
68293,0.159483,-1.110775,0.652314,0.731036,1.431135,-1.129591,-0.825666,0.514398
138827,-0.71594,1.116158,0.152528,-0.091547,0.716356,-0.238556,1.737019,0.524524
247776,0.533503,1.057554,1.318694,-0.639936,-0.713202,-2.034242,-0.150803,-0.100535


In [None]:
y_pred_public = new_model.predict(public_test)
pd.DataFrame({"risk_flag" : y_pred_public}, index=public_test.index).to_csv("prediction_v8.csv")

In [None]:
print(time.time())

In [None]:
from sklearn.model_selection import RepeatedStratifiedKFold
model = xgboost.sklearn.XGBClassifier(objective="binary:logistic", learning_rate=0.02, seed=69, n_estimators=300)
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
scores = cross_val_score(model, X_trainval, y_trainval, scoring='roc_auc', cv=cv, n_jobs=-1)
print('Mean ROC AUC: %.5f' % np.mean(scores))

KeyboardInterrupt: ignored

In [None]:
maj_to_min = 1/(sum(y_trainval.values)/len(y_trainval.values))
from sklearn.model_selection import RepeatedStratifiedKFold
model1 = xgboost.sklearn.XGBClassifier(objective="binary:logistic", learning_rate=0.02, seed=69, n_estimators=300, scale_pos_weight=maj_to_min)
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
scores = cross_val_score(model1, X_trainval, y_trainval, scoring='roc_auc', cv=cv, n_jobs=-1)
print('Mean ROC AUC: %.5f' % np.mean(scores))

Mean ROC AUC: 0.78946


In [None]:
maj_to_min = 1/(sum(y_trainval.values)/len(y_trainval.values))
from sklearn.model_selection import RepeatedStratifiedKFold
model1 = xgboost.sklearn.XGBClassifier(objective="binary:logistic", learning_rate=0.02, seed=69, n_estimators=500, scale_pos_weight=maj_to_min, max_depth=20, gamma=0.5)
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
scores = cross_val_score(model1, X_trainval, y_trainval, scoring='roc_auc', cv=cv, n_jobs=-1)
print('Mean ROC AUC: %.5f' % np.mean(scores))

Mean ROC AUC: 0.93542


In [None]:
n_estimators=75
for max_depth in range(11,20):
    model2 = xgboost.sklearn.XGBClassifier(objective="binary:logistic", learning_rate=0.01, seed=69, n_estimators=n_estimators, scale_pos_weight=maj_to_min, max_depth=max_depth, gamma=0.5)
    model2.fit(X_trainval, y_trainval)
    y_pred_holdouttest = model2.predict(X_holdouttest)
    print(max_depth,roc_auc_score(y_trainval,model2.predict(X_trainval)), roc_auc_score(y_holdouttest,y_pred_holdouttest))

11 0.8283840547611856 0.8005836740985928
12 0.8646812264785575 0.8274044208377906
13 0.885759227827598 0.8454247163182983
14 0.8988689101733024 0.8541859893570098
15 0.9242237325797087 0.857075760911948
16 0.9271647728252328 0.8537464841325884
17 0.9333928570327243 0.8594835938588083
18 0.939764315755346 0.8586703260906716
19 0.9434864155849543 0.8625469353912788


In [None]:
n_estimators=75
gamma = 5
max_depth = 13
min_child_weight = 5
for subsample in [0.8,0.85,0.9,0.95,1]:
    model2 = xgboost.sklearn.XGBClassifier(objective="binary:logistic", learning_rate=0.01, seed=69, subsample=subsample, n_estimators=n_estimators, scale_pos_weight=maj_to_min, max_depth=max_depth, gamma=gamma, min_child_weight=min_child_weight)
    model2.fit(X_trainval, y_trainval)
    y_pred_holdouttest = model2.predict(X_holdouttest)
    print(subsample,roc_auc_score(y_trainval,model2.predict(X_trainval)), roc_auc_score(y_holdouttest,y_pred_holdouttest))

0.8 0.904166161335265 0.8536559208288391
0.85 0.9031696565367839 0.8527129115369744
0.9 0.9041145389186432 0.8518072784994811
0.95 0.8967336183037982 0.8530557500257507
1 0.8822371630624875 0.8437158716433159


In [None]:
n_estimators=500
gamma = 5 #0.5,1,3,5,10,20,100]:
max_depth = 12
min_child_weight = 5
subsample=0.8 # [0.8,0.85,0.9,0.95,1]:
for nrounds in [1000]:
    
    model2 = xgboost.sklearn.XGBClassifier(objective="binary:logistic",verbosity=1,early_stopping_rounds=15, learning_rate=0.05, seed=69, subsample=subsample, n_estimators=n_estimators, scale_pos_weight=maj_to_min, max_depth=max_depth, gamma=gamma, min_child_weight=min_child_weight)
    model2.fit(X_trainval, y_trainval)
    y_pred_holdouttest = model2.predict(X_holdouttest)
    print(nrounds,roc_auc_score(y_trainval,model2.predict(X_trainval)), roc_auc_score(y_holdouttest,y_pred_holdouttest))

Parameters: { early_stopping_rounds } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


1000 0.9543444709866139 0.8474600275289463


In [None]:
print(random_search.best_estimator_)
pd.DataFrame(random_search.cv_results_)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.8, gamma=2, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.05, max_delta_step=0, max_depth=15,
              min_child_weight=5, missing=nan, monotone_constraints='()',
              n_estimators=500, n_jobs=8, num_parallel_tree=1,
              random_state=6969, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
              seed=6969, subsample=0.8, tree_method='exact',
              validate_parameters=1, verbosity=None)


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_subsample,param_min_child_weight,param_max_depth,param_gamma,param_colsample_bytree,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
0,12.372567,0.020605,0.088335,0.019154,0.8,5,4,5,1.0,"{'subsample': 0.8, 'min_child_weight': 5, 'max...",0.795587,0.79191,0.791942,0.793147,0.001726,4
1,55.21257,7.915631,0.113335,0.00613,1.0,5,15,5,1.0,"{'subsample': 1.0, 'min_child_weight': 5, 'max...",0.906683,0.904897,0.904768,0.905449,0.000874,3
2,49.670498,2.538399,0.240399,0.054548,0.6,1,10,2,0.6,"{'subsample': 0.6, 'min_child_weight': 1, 'max...",0.919388,0.917016,0.916346,0.917583,0.001305,2
3,15.199532,0.233192,0.103666,0.01389,0.8,10,4,10,0.6,"{'subsample': 0.8, 'min_child_weight': 10, 'ma...",0.770463,0.767511,0.767282,0.768418,0.001448,5
4,54.379435,2.40465,0.324507,0.062538,0.8,5,15,2,0.8,"{'subsample': 0.8, 'min_child_weight': 5, 'max...",0.925749,0.923222,0.923464,0.924145,0.001138,1


In [None]:
print(random_search.best_estimator_)
pd.DataFrame(random_search.cv_results_).to_csv("results1.csv")

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.8, gamma=0.5, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.05, max_delta_step=0, max_depth=20,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=500, n_jobs=8, num_parallel_tree=1,
              random_state=6969, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
              seed=6969, subsample=0.8, tree_method='exact',
              validate_parameters=1, verbosity=None)


In [None]:
y_pred_public = model1.predict(public_test)
pd.DataFrame({"risk_flag" : y_pred_public}, index=public_test.index).to_csv("prediction_v6.csv")

NameError: ignored

In [None]:
12.661563873291016