In [102]:
import pandas as pd
import numpy as np
from sklearn.feature_selection import RFE
from sklearn.linear_model import Lasso
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

### Segédfüggvények

In [103]:
def limitalt_dummy_valtozo(indf, oszlop, limit, drop_last=True):
    uj_valtozok = []
    df = indf.copy()
    egyedi_elemszam=df[oszlop].nunique()
    if egyedi_elemszam <= limit and drop_last is True:
        egyedi_elemszam = egyedi_elemszam - 1
    tmp = df[oszlop].value_counts()[:limit]
    for value in list(tmp.index):
        def f(x):
            if x == value:
                return 1
            else:
                return 0
        df[oszlop+'='+str(value)] = df[oszlop].apply(f)
        uj_valtozok.append(oszlop + '=' + str(value))
    return df, uj_valtozok

In [104]:
def myxval(model,train_df,bemeno_valtozok,fold_num=3):
    train_df['xval']=train_df['ID']%fold_num
    auc_list=[]
    for i in range(fold_num):
        mini_train=train_df[train_df['xval']!=i].copy()
        mini_test=train_df[train_df['xval']==i].copy()
        model.fit(mini_train[bemeno_valtozok],mini_train['TARGET'])
        mini_test['p1']=model.predict_proba(mini_test[bemeno_valtozok])[:,1]
        auc=roc_auc_score(mini_test['TARGET'],mini_test['p1'])
        auc_list.append(auc)
    return np.mean(auc_list)

In [105]:
from sklearn.metrics import roc_auc_score

def myxval_multi(models,train_df,bemeno_valtozok,fold_num=3):
    train_df['xval']=train_df['ID']%fold_num
    auc_list=[]
    for i in range(fold_num):
        mini_train=train_df[train_df['xval']!=i].copy()
        mini_test=train_df[train_df['xval']==i].copy()
        predict_list = None
        for model in models:
            model.fit(mini_train[bemeno_valtozok],mini_train['TARGET'])
            predictions=model.predict_proba(mini_test[bemeno_valtozok])[:,1]
            predictions = predictions.reshape((1, len(predictions)))
            if predict_list is None:
                predict_list = predictions
            else:
                predict_list = np.concatenate([predict_list, predictions], axis=0)
        mini_test['p1'] = np.mean(predict_list, axis=0)
        auc=roc_auc_score(mini_test['TARGET'],mini_test['p1'])
        auc_list.append(auc)
    return np.mean(auc_list)

### --------------------------------------------------------------------- ###

In [106]:
train_data = pd.read_csv('./data/public_train.csv')
test_data = pd.read_csv('./data/public_test.csv')

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [107]:
# Train or test?
train_data['train_or_test'] = 'train'
test_data['train_or_test'] = 'test'

In [108]:
df = pd.concat([train_data, test_data], sort=True)

# TO_OBJECT

In [109]:
object_valtozok = ['NAT', 'M_STATUS', 'MATE_PROF_CODE', 'VISA_FLAG', 'MCARD_FLAG', 'S_FLAG',
       'AME_FLAG', 'OCARDS_FLAG', 'CARS_FLAG', 'PROF_CODE', 'OCC_TYPE', 'C_PRODUCT']

In [110]:
for att in object_valtozok:
    df[att] = df[att].astype('O')

In [111]:
df

Unnamed: 0,AGE,AME_FLAG,APP_SUB_TYPE,BILL_AMOUNT_MONTH_1,BILL_AMOUNT_MONTH_2,BILL_AMOUNT_MONTH_3,BILL_AMOUNT_MONTH_4,BILL_AMOUNT_MONTH_5,BILL_AMOUNT_MONTH_6,BIRTH_CITY,...,RES_CITY,RES_PHONE_AREA_CODE,RES_STATE,RES_TYPE,RES_ZIP,SEX,S_FLAG,TARGET,VISA_FLAG,train_or_test
0,32,0,W,4181,25988,3177,49174,11823,669,City0,...,City2115,105,State0,1.0,595,female,0,1.0,1,train
1,34,0,C,5100,9861,11021,16883,150,4661,City1,...,City1,20,State1,1.0,230,female,0,1.0,0,train
2,61,0,W,97989,101458,103840,105646,86127,67269,City3,...,City3,,State2,,545,female,0,0.0,0,train
3,36,0,W,44970,48101,46993,47940,50903,49794,City9,...,City16,107,State7,1.0,607,female,0,1.0,0,train
4,29,0,W,390,1780,0,0,0,0,City10,...,City10,32,State3,1.0,384,female,0,0.0,0,train
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24995,59,0,W,128671,129901,127658,127362,128726,129081,,...,City8,66,State21,1.0,725,female,0,,0,test
24996,38,0,W,70381,68660,70081,70899,75331,74005,,...,City239,112,State7,1.0,635,female,0,,0,test
24997,37,0,Z,5988,0,0,0,0,0,City32,...,,105,State0,2.0,591,female,0,,0,test
24998,68,0,W,6954,4110,1030,357,357,370,City1442,...,City459,,State5,1.0,882,male,0,,0,test


# Hiányzó adatok

In [112]:
# Hianyzo adatok kitoltese
for att in df.columns:
    if att=="TARGET":
        pass
    #amik nem számok (string értékek stb...)
    elif df[att].dtype=="object": 
        #print(att)
        df[att]=df[att].fillna("nincs_adat")
    else:
        #test és train közös mediánja
        #ha van benne NaN, akkor azzal töltötte volna fel, de ez nem jó, ezért kell a nanmedian függvény
        #de nem a mediánra kéne itt kitölteni az fix!!!
        #df[att]=df[att].fillna(0) ez is lehetne
        df[att]=df[att].fillna(0)

In [113]:
bemeno_valtozok = ['AGE', 'AME_FLAG', 'APP_SUB_TYPE', 'BILL_AMOUNT_MONTH_1',
       'BILL_AMOUNT_MONTH_2', 'BILL_AMOUNT_MONTH_3', 'BILL_AMOUNT_MONTH_4',
       'BILL_AMOUNT_MONTH_5', 'BILL_AMOUNT_MONTH_6', 'CARS_FLAG', 'C_PRODUCT', 'DAY_OF_PAYMENT', 'EDU_LEVEL',
       'EMAIL_FLAG', 'HOME_PHONE_FLAG', 'L_BALANCE', 'MATE_PROF_CODE',
       'MCARD_FLAG', 'MONTHS_IN_RES', 'M_IN_THE_JOB', 'M_STATUS', 'NAT',
       'NUMBERO_OF_BANK_ACCOUNTS', 'NUMBER_OF_DEPENDANTS',
       'NUMBER_OF_SPECIAL_BANK_ACCOUNTS', 'OCARDS_FLAG', 'OCC_TYPE',
       'O_INCOMES', 'PAY_AMOUNT_MONTH_1', 'PAY_AMOUNT_MONTH_2',
       'PAY_AMOUNT_MONTH_3', 'PAY_AMOUNT_MONTH_4', 'PAY_AMOUNT_MONTH_5',
       'PAY_AMOUNT_MONTH_6', 'PAY_MONTH_0', 'PAY_MONTH_2', 'PAY_MONTH_3',
       'PAY_MONTH_4', 'PAY_MONTH_5', 'PAY_MONTH_6', 'P_ADDRESS_TYPE', 'P_ASSETS_VALUE',
       'P_MONTHLY_INCOME', 'RES_TYPE', 'SEX', 'S_FLAG', 
       'VISA_FLAG']

In [114]:
object_valtozok = []
uj_valtozok = []
for att in bemeno_valtozok:
    if df[att].dtype == 'object':
        df, val = limitalt_dummy_valtozo(df, att, 6)
        object_valtozok.append(att)
        uj_valtozok.extend(val)

In [115]:
bemeno_valtozok = bemeno_valtozok + uj_valtozok
bemeno_valtozok = [val for val in bemeno_valtozok if val not in object_valtozok]

In [116]:
bemeno_valtozok

['AGE',
 'AME_FLAG',
 'BILL_AMOUNT_MONTH_1',
 'BILL_AMOUNT_MONTH_2',
 'BILL_AMOUNT_MONTH_3',
 'BILL_AMOUNT_MONTH_4',
 'BILL_AMOUNT_MONTH_5',
 'BILL_AMOUNT_MONTH_6',
 'CARS_FLAG',
 'C_PRODUCT',
 'DAY_OF_PAYMENT',
 'EDU_LEVEL',
 'EMAIL_FLAG',
 'L_BALANCE',
 'MCARD_FLAG',
 'MONTHS_IN_RES',
 'M_IN_THE_JOB',
 'M_STATUS',
 'NAT',
 'NUMBERO_OF_BANK_ACCOUNTS',
 'NUMBER_OF_DEPENDANTS',
 'NUMBER_OF_SPECIAL_BANK_ACCOUNTS',
 'OCARDS_FLAG',
 'O_INCOMES',
 'PAY_AMOUNT_MONTH_1',
 'PAY_AMOUNT_MONTH_2',
 'PAY_AMOUNT_MONTH_3',
 'PAY_AMOUNT_MONTH_4',
 'PAY_AMOUNT_MONTH_5',
 'PAY_AMOUNT_MONTH_6',
 'PAY_MONTH_0',
 'PAY_MONTH_2',
 'PAY_MONTH_3',
 'PAY_MONTH_4',
 'PAY_MONTH_5',
 'PAY_MONTH_6',
 'P_ASSETS_VALUE',
 'P_MONTHLY_INCOME',
 'RES_TYPE',
 'S_FLAG',
 'VISA_FLAG',
 'APP_SUB_TYPE=W',
 'APP_SUB_TYPE=Z',
 'APP_SUB_TYPE=C',
 'HOME_PHONE_FLAG=yes',
 'HOME_PHONE_FLAG=no',
 'MATE_PROF_CODE=nincs_adat',
 'MATE_PROF_CODE=0.0',
 'MATE_PROF_CODE=11.0',
 'MATE_PROF_CODE=9.0',
 'MATE_PROF_CODE=16.0',
 'MATE_PROF_CO

## Modellezés

In [117]:
train_df = df[df['train_or_test'] == 'train'].copy()
test_df = df[df['train_or_test'] == 'test'].copy()

In [118]:
train_df.columns

Index(['AGE', 'AME_FLAG', 'APP_SUB_TYPE', 'BILL_AMOUNT_MONTH_1',
       'BILL_AMOUNT_MONTH_2', 'BILL_AMOUNT_MONTH_3', 'BILL_AMOUNT_MONTH_4',
       'BILL_AMOUNT_MONTH_5', 'BILL_AMOUNT_MONTH_6', 'BIRTH_CITY',
       'BIRTH_STATE', 'CARS_FLAG', 'CONF_CITIZEN_CARD_NUM', 'CONF_TAX_STATUS',
       'C_NAME_GIVEN', 'C_PRODUCT', 'DAY_OF_PAYMENT', 'EDU_LEVEL',
       'EMAIL_FLAG', 'HOME_PHONE_FLAG', 'ID', 'L_BALANCE', 'MATE_PROF_CODE',
       'MCARD_FLAG', 'MONTHS_IN_RES', 'M_IN_THE_JOB', 'M_STATUS', 'NAT',
       'NUMBERO_OF_BANK_ACCOUNTS', 'NUMBER_OF_DEPENDANTS',
       'NUMBER_OF_SPECIAL_BANK_ACCOUNTS', 'OCARDS_FLAG', 'OCC_TYPE',
       'O_INCOMES', 'PAY_AMOUNT_MONTH_1', 'PAY_AMOUNT_MONTH_2',
       'PAY_AMOUNT_MONTH_3', 'PAY_AMOUNT_MONTH_4', 'PAY_AMOUNT_MONTH_5',
       'PAY_AMOUNT_MONTH_6', 'PAY_MONTH_0', 'PAY_MONTH_2', 'PAY_MONTH_3',
       'PAY_MONTH_4', 'PAY_MONTH_5', 'PAY_MONTH_6', 'PROF_BOROUGH',
       'PROF_CITY', 'PROF_CODE', 'PROF_PHONE_AREA_CODE', 'PROF_PHONE_GIVEN',
       'PROF

In [119]:
# model3
from sklearn.ensemble import RandomForestClassifier
model3 = RandomForestClassifier(random_state=42, n_estimators=300, max_depth=15)
model3.fit(train_df[bemeno_valtozok], train_df['TARGET'])
prediction_rfc = model3.predict_proba(test_df[bemeno_valtozok])

In [120]:
prediction_rfc

array([[0.75284677, 0.24715323],
       [0.69588454, 0.30411546],
       [0.49202072, 0.50797928],
       ...,
       [0.61663784, 0.38336216],
       [0.43216475, 0.56783525],
       [0.57744462, 0.42255538]])

In [121]:
# model4
from sklearn.ensemble import GradientBoostingClassifier
model4 = GradientBoostingClassifier(random_state=42, n_estimators=150, max_depth=10)
model4.fit(train_df[bemeno_valtozok], train_df['TARGET'])
prediction_gbc = model4.predict_proba(test_df[bemeno_valtozok])

In [122]:
prediction_gbc

array([[0.84404847, 0.15595153],
       [0.60986941, 0.39013059],
       [0.09964556, 0.90035444],
       ...,
       [0.63343578, 0.36656422],
       [0.2161612 , 0.7838388 ],
       [0.88040804, 0.11959196]])

In [123]:
rfc_p1 = prediction_rfc[:, 1]
gbc_p1 = prediction_gbc[:, 1]

In [124]:
a = np.array([rfc_p1, gbc_p1])

In [125]:
a

array([[0.24715323, 0.30411546, 0.50797928, ..., 0.38336216, 0.56783525,
        0.42255538],
       [0.15595153, 0.39013059, 0.90035444, ..., 0.36656422, 0.7838388 ,
        0.11959196]])

In [126]:
b = np.mean(a, axis=0)

In [127]:
test_df['mean_p1'] = b

In [101]:
submission_df = test_df[['ID', 'mean_p1']]
submission_df.columns = ['Id', 'Predicted']
submission_df.to_csv('proba8.csv', index=False)

In [133]:
myxval_multi([model3, model4], train_df, bemeno_valtozok)

0.7486386563717712