In [40]:
import pandas as pd
import numpy as np
from sklearn.feature_selection import RFE
from sklearn.linear_model import Lasso
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

# Segédfüggvények

In [41]:
def limitalt_dummy_valtozo(indf, oszlop, limit, drop_last=True):
    uj_valtozok = []
    df = indf.copy()
    egyedi_elemszam=df[oszlop].nunique()
    if egyedi_elemszam <= limit and drop_last is True:
        egyedi_elemszam = egyedi_elemszam - 1
    tmp = df[oszlop].value_counts()[:limit]
    for value in list(tmp.index):
        def f(x):
            if x == value:
                return 1
            else:
                return 0
        df[oszlop+'='+str(value)] = df[oszlop].apply(f)
        uj_valtozok.append(oszlop + '=' + str(value))
    return df, uj_valtozok

In [42]:
def myxval(model,train_df,bemeno_valtozok,fold_num=3):
    train_df['xval']=train_df['ID']%fold_num
    auc_list=[]
    for i in range(fold_num):
        mini_train=train_df[train_df['xval']!=i].copy()
        mini_test=train_df[train_df['xval']==i].copy()
        model.fit(mini_train[bemeno_valtozok],mini_train['TARGET'])
        mini_test['p1']=model.predict_proba(mini_test[bemeno_valtozok])[:,1]
        auc=roc_auc_score(mini_test['TARGET'],mini_test['p1'])
        auc_list.append(auc)
    return np.mean(auc_list)

In [60]:
from sklearn.metrics import roc_auc_score

def myxval_multi(models,train_df,bemeno_valtozok,fold_num=3):
    train_df['xval']=train_df['ID']%fold_num
    auc_list=[]
    for i in range(fold_num):
        mini_train=train_df[train_df['xval']!=i].copy()
        mini_test=train_df[train_df['xval']==i].copy()
        predict_list = None
        for model in models:
            model.fit(mini_train[bemeno_valtozok],mini_train['TARGET'])
            predictions=model.predict_proba(mini_test[bemeno_valtozok])[:,1]
            predictions = predictions.reshape((1, len(predictions)))
            if predict_list is None:
                predict_list = predictions
            else:
                predict_list = np.concatenate([predict_list, predictions], axis=0)
        mini_test['p1'] = np.mean(predict_list, axis=0)
        auc=roc_auc_score(mini_test['TARGET'],mini_test['p1'])
        auc_list.append(auc)
    return np.mean(auc_list)

### --------------------------------------------------------------------- ###

In [44]:
train_data = pd.read_csv('./data/public_train.csv')
test_data = pd.read_csv('./data/public_test.csv')

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [45]:
# Train or test?
train_data['train_or_test'] = 'train'
test_data['train_or_test'] = 'test'

In [46]:
df = pd.concat([train_data, test_data], sort=True)

# Hiányzó adatok

In [47]:
# Hianyzo adatok kitoltese
for att in df.columns:
    if att=="TARGET":
        pass
    #amik nem számok (string értékek stb...)
    elif df[att].dtype=="object": 
        #print(att)
        df[att]=df[att].fillna("nincs_adat")
    else:
        #test és train közös mediánja
        #ha van benne NaN, akkor azzal töltötte volna fel, de ez nem jó, ezért kell a nanmedian függvény
        #de nem a mediánra kéne itt kitölteni az fix!!!
        #df[att]=df[att].fillna(0) ez is lehetne
        df[att]=df[att].fillna(0)

# Float64 to int64

In [48]:
# float64 to int64
for att in df.columns:
    if att =="TARGET":
        pass
    elif df[att].dtype == "float64":
        df[att] = df[att].astype('int64')

# Fizetési adatok

In [49]:
df['MEAN_BILL_AMOUNT_MONTH'] = (df['BILL_AMOUNT_MONTH_1'] + df['BILL_AMOUNT_MONTH_2'] 
                                + df['BILL_AMOUNT_MONTH_3'] + df['BILL_AMOUNT_MONTH_4']
                                + df['BILL_AMOUNT_MONTH_5'] + df['BILL_AMOUNT_MONTH_6']) / 6

In [50]:
df['MEAN_PAY_MONTH'] = (df['PAY_MONTH_0'] + df['PAY_MONTH_2'] + df['PAY_MONTH_3'] 
                        + df['PAY_MONTH_4'] + df['PAY_MONTH_5'] + df['PAY_MONTH_6']) / 6

In [51]:
df['MEAN_PAY_AMOUNT_MONTH'] = (df['PAY_AMOUNT_MONTH_1'] + df['PAY_AMOUNT_MONTH_2'] 
                               + df['PAY_AMOUNT_MONTH_3'] + df['PAY_AMOUNT_MONTH_4']
                               + df['PAY_AMOUNT_MONTH_5'] + df['PAY_AMOUNT_MONTH_6']) / 6

# Változók szűrése

In [52]:
bemeno_valtozok = ['AGE', 'AME_FLAG', 'APP_SUB_TYPE', 'CARS_FLAG', 'C_PRODUCT', 'DAY_OF_PAYMENT', 'EDU_LEVEL',
       'EMAIL_FLAG', 'HOME_PHONE_FLAG', 'L_BALANCE', 'MATE_PROF_CODE',
       'MCARD_FLAG', 'MONTHS_IN_RES', 'M_IN_THE_JOB', 'M_STATUS', 'NAT',
       'NUMBERO_OF_BANK_ACCOUNTS', 'NUMBER_OF_DEPENDANTS',
       'NUMBER_OF_SPECIAL_BANK_ACCOUNTS', 'OCARDS_FLAG', 'OCC_TYPE',
       'O_INCOMES', 'PROF_BOROUGH', 'P_ADDRESS_TYPE', 'P_ASSETS_VALUE',
       'P_MONTHLY_INCOME', 'RES_BOROUGH', 'RES_TYPE', 'SEX', 'S_FLAG',
       'VISA_FLAG', 'MEAN_PAY_AMOUNT_MONTH', 'MEAN_PAY_MONTH', 'MEAN_BILL_AMOUNT_MONTH']

# Dummy változók

In [53]:
object_valtozok = []
uj_valtozok = []
for att in bemeno_valtozok:
    if df[att].dtype == 'object':
        df, val = limitalt_dummy_valtozo(df, att, 6)
        object_valtozok.append(att)
        uj_valtozok.extend(val)

In [54]:
bemeno_valtozok = bemeno_valtozok + uj_valtozok
bemeno_valtozok = [val for val in bemeno_valtozok if val not in object_valtozok]

In [55]:
bemeno_valtozok

['AGE',
 'AME_FLAG',
 'CARS_FLAG',
 'C_PRODUCT',
 'DAY_OF_PAYMENT',
 'EDU_LEVEL',
 'EMAIL_FLAG',
 'L_BALANCE',
 'MATE_PROF_CODE',
 'MCARD_FLAG',
 'MONTHS_IN_RES',
 'M_IN_THE_JOB',
 'M_STATUS',
 'NAT',
 'NUMBERO_OF_BANK_ACCOUNTS',
 'NUMBER_OF_DEPENDANTS',
 'NUMBER_OF_SPECIAL_BANK_ACCOUNTS',
 'OCARDS_FLAG',
 'OCC_TYPE',
 'O_INCOMES',
 'P_ASSETS_VALUE',
 'P_MONTHLY_INCOME',
 'RES_TYPE',
 'S_FLAG',
 'VISA_FLAG',
 'MEAN_PAY_AMOUNT_MONTH',
 'MEAN_PAY_MONTH',
 'MEAN_BILL_AMOUNT_MONTH',
 'APP_SUB_TYPE=W',
 'APP_SUB_TYPE=Z',
 'APP_SUB_TYPE=C',
 'HOME_PHONE_FLAG=yes',
 'HOME_PHONE_FLAG=no',
 'PROF_BOROUGH=nincs_adat',
 'PROF_BOROUGH=Borough0',
 'PROF_BOROUGH=Borough732',
 'PROF_BOROUGH=Borough48',
 'PROF_BOROUGH=Borough634',
 'PROF_BOROUGH=Borough315',
 'P_ADDRESS_TYPE=A',
 'P_ADDRESS_TYPE=B',
 'RES_BOROUGH=nincs_adat',
 'RES_BOROUGH=Borough0',
 'RES_BOROUGH=Borough48',
 'RES_BOROUGH=Borough116',
 'RES_BOROUGH=Borough101',
 'RES_BOROUGH=Borough152',
 'SEX=female',
 'SEX=male',
 'SEX=nincs_adat']

# Modellezés

In [56]:
train_df = df[df['train_or_test'] == 'train'].copy()
test_df = df[df['train_or_test'] == 'test'].copy()

In [57]:
from sklearn.ensemble import RandomForestClassifier
model_rfc = RandomForestClassifier(random_state=42, n_estimators=300, max_depth=15)

In [58]:
from sklearn.ensemble import GradientBoostingClassifier
model_gbc = GradientBoostingClassifier(random_state=42, n_estimators=150, max_depth=10)

In [61]:
myxval_multi([model_rfc, model_gbc], train_df, bemeno_valtozok)

0.7361476122023035

In [62]:
model_rfc.fit(train_df[bemeno_valtozok], train_df['TARGET'])
model_gbc.fit(train_df[bemeno_valtozok], train_df['TARGET'])

GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='deviance', max_depth=10,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=150,
                           n_iter_no_change=None, presort='deprecated',
                           random_state=42, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)

In [63]:
prediction_rfc = model_rfc.predict_proba(test_df[bemeno_valtozok])[:,1]
prediction_gbc = model_gbc.predict_proba(test_df[bemeno_valtozok])[:,1]

In [64]:
a = np.array([prediction_rfc, prediction_gbc])

In [65]:
test_df['p1'] = np.mean(a, axis=0)

In [67]:
submission_df = test_df[['ID', 'p1']]
submission_df.columns = ['Id', 'Predicted']
submission_df.to_csv('proba7.csv', index=False)