In [5]:
import pandas as pd
import numpy as np
from sklearn.feature_selection import RFE
from sklearn.linear_model import Lasso
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

### Segédfüggvények

In [6]:
def limitalt_dummy_valtozo(indf, oszlop, limit, drop_last=True):
    uj_valtozok = []
    df = indf.copy()
    egyedi_elemszam=df[oszlop].nunique()
    if egyedi_elemszam <= limit and drop_last is True:
        egyedi_elemszam = egyedi_elemszam - 1
    tmp = df[oszlop].value_counts()[:limit]
    for value in list(tmp.index):
        def f(x):
            if x == value:
                return 1
            else:
                return 0
        df[oszlop+'='+str(value)] = df[oszlop].apply(f)
        uj_valtozok.append(oszlop + '=' + str(value))
    return df, uj_valtozok

In [7]:
def myxval(model,train_df,bemeno_valtozok,fold_num=3):
    train_df['xval']=train_df['ID']%fold_num
    auc_list=[]
    for i in range(fold_num):
        mini_train=train_df[train_df['xval']!=i].copy()
        mini_test=train_df[train_df['xval']==i].copy()
        model.fit(mini_train[bemeno_valtozok],mini_train['TARGET'])
        mini_test['p1']=model.predict_proba(mini_test[bemeno_valtozok])[:,1]
        auc=roc_auc_score(mini_test['TARGET'],mini_test['p1'])
        auc_list.append(auc)
    return np.mean(auc_list)

In [8]:
from sklearn.metrics import roc_auc_score

def myxval_multi(models,train_df,bemeno_valtozok,fold_num=3):
    train_df['xval']=train_df['ID']%fold_num
    auc_list=[]
    for i in range(fold_num):
        mini_train=train_df[train_df['xval']!=i].copy()
        mini_test=train_df[train_df['xval']==i].copy()
        predict_list = None
        for model in models:
            model.fit(mini_train[bemeno_valtozok],mini_train['TARGET'])
            predictions=model.predict_proba(mini_test[bemeno_valtozok])[:,1]
            predictions = predictions.reshape((1, len(predictions)))
            if predict_list is None:
                predict_list = predictions
            else:
                predict_list = np.concatenate([predict_list, predictions], axis=0)
        mini_test['p1'] = np.mean(predict_list, axis=0)
        auc=roc_auc_score(mini_test['TARGET'],mini_test['p1'])
        auc_list.append(auc)
    return np.mean(auc_list)

### --------------------------------------------------------------------- ###

In [9]:
train_data = pd.read_csv('./data/public_train.csv')
test_data = pd.read_csv('./data/public_test.csv')

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [10]:
# Train or test?
train_data['train_or_test'] = 'train'
test_data['train_or_test'] = 'test'

In [11]:
df = pd.concat([train_data, test_data], sort=True)

# Városi adatok

In [12]:
nonan = train_data.copy()
nonan['RES_CITY'] = nonan['RES_CITY'].fillna('nincs_adat')

In [13]:
seged = nonan.groupby(['RES_CITY'], as_index=False).agg({'TARGET':'mean', 'ID':'count'})
seged.columns = ['RES_CITY', 'MEAN_RES_CITY', 'COUNT']

In [14]:
mean = seged['MEAN_RES_CITY'].mean()
seged['MEAN_RES_CITY'] = seged['MEAN_RES_CITY'].fillna(mean)
for index, row in seged.iterrows():
    if row['COUNT'] < 6:
        seged.loc[index, 'MEAN_RES_CITY'] = mean

In [15]:
df_merged = df.merge(seged[['RES_CITY', 'MEAN_RES_CITY']], on='RES_CITY', how='left')
df_merged['MEAN_RES_CITY'] = df_merged['MEAN_RES_CITY'].fillna(mean)

In [16]:
df_merged
df_merged.describe()

Unnamed: 0,AGE,AME_FLAG,BILL_AMOUNT_MONTH_1,BILL_AMOUNT_MONTH_2,BILL_AMOUNT_MONTH_3,BILL_AMOUNT_MONTH_4,BILL_AMOUNT_MONTH_5,BILL_AMOUNT_MONTH_6,CARS_FLAG,CONF_CITIZEN_CARD_NUM,...,PAY_MONTH_5,PAY_MONTH_6,PROF_CODE,P_ASSETS_VALUE,P_MONTHLY_INCOME,RES_TYPE,S_FLAG,TARGET,VISA_FLAG,MEAN_RES_CITY
count,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,...,50000.0,50000.0,42244.0,50000.0,49472.0,48651.0,50000.0,25000.0,50000.0,50000.0
mean,43.24852,0.00174,60641.67008,57355.32426,53292.2359,48868.05748,45637.25712,44628.47876,0.33614,0.0,...,-0.2153,-0.24816,8.061784,445149.1,180633.12369,1.252225,0.00132,0.42432,0.11144,0.421255
std,14.98905,0.041677,72452.016459,69798.265537,66190.142456,63214.005733,61091.066392,60077.818698,0.472392,0.0,...,1.045413,1.069317,3.220104,2281880.0,134589.611245,0.867833,0.036308,0.494249,0.314679,0.088352
min,6.0,0.0,-165580.0,-33350.0,-61506.0,-65167.0,-37594.0,-51443.0,0.0,0.0,...,-2.0,-2.0,0.0,0.0,10000.0,0.0,0.0,0.0,0.0,0.0
25%,31.0,0.0,6136.0,4989.0,4144.0,3517.5,2475.0,2160.75,0.0,0.0,...,-1.0,-1.0,9.0,0.0,94000.0,1.0,0.0,0.0,0.0,0.389671
50%,41.0,0.0,43990.0,40679.0,34859.0,28698.0,26152.0,25833.0,0.0,0.0,...,0.0,0.0,9.0,0.0,133200.0,1.0,0.0,0.0,0.0,0.405204
75%,53.0,0.0,86422.0,82789.0,78366.0,70899.0,66895.0,65239.0,1.0,0.0,...,0.0,0.0,9.0,0.0,215576.0,1.0,0.0,1.0,0.0,0.455285
max,106.0,1.0,964511.0,983931.0,693131.0,891586.0,927171.0,961664.0,1.0,0.0,...,7.0,7.0,18.0,16836110.0,999976.4,5.0,1.0,1.0,1.0,0.916667


In [17]:
nonan = train_data.copy()
nonan['PROF_CITY'] = nonan['PROF_CITY'].fillna('nincs_adat')

In [18]:
seged = nonan.groupby(['PROF_CITY'], as_index=False).agg({'TARGET':'mean', 'ID':'count'})
seged.columns = ['PROF_CITY', 'MEAN_PROF_CITY', 'COUNT']

In [19]:
mean = seged['MEAN_PROF_CITY'].mean()
seged['MEAN_PROF_CITY'] = seged['MEAN_PROF_CITY'].fillna(mean)
for index, row in seged.iterrows():
    if row['COUNT'] < 6:
        seged.loc[index, 'MEAN_PROF_CITY'] = mean

In [20]:
df_merged2 = df_merged.merge(seged[['PROF_CITY', 'MEAN_PROF_CITY']], on='PROF_CITY', how='left')
df_merged2['MEAN_PROF_CITY'] = df_merged2['MEAN_PROF_CITY'].fillna(mean)

In [21]:
df_merged2

Unnamed: 0,AGE,AME_FLAG,APP_SUB_TYPE,BILL_AMOUNT_MONTH_1,BILL_AMOUNT_MONTH_2,BILL_AMOUNT_MONTH_3,BILL_AMOUNT_MONTH_4,BILL_AMOUNT_MONTH_5,BILL_AMOUNT_MONTH_6,BIRTH_CITY,...,RES_STATE,RES_TYPE,RES_ZIP,SEX,S_FLAG,TARGET,VISA_FLAG,train_or_test,MEAN_RES_CITY,MEAN_PROF_CITY
0,32,0,W,4181,25988,3177,49174,11823,669,City0,...,State0,1.0,595,female,0,1.0,1,train,0.405204,0.398107
1,34,0,C,5100,9861,11021,16883,150,4661,City1,...,State1,1.0,230,female,0,1.0,0,train,0.371105,0.398107
2,61,0,W,97989,101458,103840,105646,86127,67269,City3,...,State2,,545,female,0,0.0,0,train,0.520000,0.398107
3,36,0,W,44970,48101,46993,47940,50903,49794,City9,...,State7,1.0,607,female,0,1.0,0,train,0.455285,0.398107
4,29,0,W,390,1780,0,0,0,0,City10,...,State3,1.0,384,female,0,0.0,0,train,0.350877,0.333333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49995,59,0,W,128671,129901,127658,127362,128726,129081,,...,State21,1.0,725,female,0,,0,test,0.555556,0.398107
49996,38,0,W,70381,68660,70081,70899,75331,74005,,...,State7,1.0,635,female,0,,0,test,0.368421,0.398107
49997,37,0,Z,5988,0,0,0,0,0,City32,...,State0,2.0,591,female,0,,0,test,0.405204,0.398107
49998,68,0,W,6954,4110,1030,357,357,370,City1442,...,State5,1.0,882,male,0,,0,test,0.423077,0.398107


# Hiányzó adatok

In [22]:
# Hianyzo adatok kitoltese
for att in df.columns:
    if att=="TARGET":
        pass
    #amik nem számok (string értékek stb...)
    elif df_merged2[att].dtype=="object": 
        #print(att)
        df_merged2[att]=df_merged2[att].fillna("nincs_adat")
    else:
        #test és train közös mediánja
        #ha van benne NaN, akkor azzal töltötte volna fel, de ez nem jó, ezért kell a nanmedian függvény
        #de nem a mediánra kéne itt kitölteni az fix!!!
        #df[att]=df[att].fillna(0) ez is lehetne
        df_merged2[att]=df_merged2[att].fillna(0)

In [23]:
bemeno_valtozok = ['AGE', 'AME_FLAG', 'APP_SUB_TYPE', 'BILL_AMOUNT_MONTH_1',
       'BILL_AMOUNT_MONTH_2', 'BILL_AMOUNT_MONTH_3', 'BILL_AMOUNT_MONTH_4',
       'BILL_AMOUNT_MONTH_5', 'BILL_AMOUNT_MONTH_6', 'CARS_FLAG', 'C_PRODUCT', 'DAY_OF_PAYMENT', 'EDU_LEVEL',
       'EMAIL_FLAG', 'HOME_PHONE_FLAG', 'L_BALANCE', 'MATE_PROF_CODE',
       'MCARD_FLAG', 'MONTHS_IN_RES', 'M_IN_THE_JOB', 'M_STATUS', 'NAT',
       'NUMBERO_OF_BANK_ACCOUNTS', 'NUMBER_OF_DEPENDANTS',
       'NUMBER_OF_SPECIAL_BANK_ACCOUNTS', 'OCARDS_FLAG', 'OCC_TYPE',
       'O_INCOMES', 'PAY_AMOUNT_MONTH_1', 'PAY_AMOUNT_MONTH_2',
       'PAY_AMOUNT_MONTH_3', 'PAY_AMOUNT_MONTH_4', 'PAY_AMOUNT_MONTH_5',
       'PAY_AMOUNT_MONTH_6', 'PAY_MONTH_0', 'PAY_MONTH_2', 'PAY_MONTH_3',
       'PAY_MONTH_4', 'PAY_MONTH_5', 'PAY_MONTH_6', 'P_ADDRESS_TYPE', 'P_ASSETS_VALUE',
       'P_MONTHLY_INCOME', 'RES_TYPE', 'SEX', 'S_FLAG', 
       'VISA_FLAG', 'MEAN_RES_CITY', 'MEAN_PROF_CITY']

In [24]:
df_merged2['MEAN_RES_CITY']

0        0.405204
1        0.371105
2        0.520000
3        0.455285
4        0.350877
           ...   
49995    0.555556
49996    0.368421
49997    0.405204
49998    0.423077
49999    0.405204
Name: MEAN_RES_CITY, Length: 50000, dtype: float64

In [25]:
object_valtozok = []
uj_valtozok = []
for att in bemeno_valtozok:
    if df_merged2[att].dtype == 'object':
        df_merged2, val = limitalt_dummy_valtozo(df_merged2, att, 6)
        object_valtozok.append(att)
        uj_valtozok.extend(val)

In [26]:
bemeno_valtozok = bemeno_valtozok + uj_valtozok
bemeno_valtozok = [val for val in bemeno_valtozok if val not in object_valtozok]

In [27]:
bemeno_valtozok

['AGE',
 'AME_FLAG',
 'BILL_AMOUNT_MONTH_1',
 'BILL_AMOUNT_MONTH_2',
 'BILL_AMOUNT_MONTH_3',
 'BILL_AMOUNT_MONTH_4',
 'BILL_AMOUNT_MONTH_5',
 'BILL_AMOUNT_MONTH_6',
 'CARS_FLAG',
 'C_PRODUCT',
 'DAY_OF_PAYMENT',
 'EDU_LEVEL',
 'EMAIL_FLAG',
 'L_BALANCE',
 'MATE_PROF_CODE',
 'MCARD_FLAG',
 'MONTHS_IN_RES',
 'M_IN_THE_JOB',
 'M_STATUS',
 'NAT',
 'NUMBERO_OF_BANK_ACCOUNTS',
 'NUMBER_OF_DEPENDANTS',
 'NUMBER_OF_SPECIAL_BANK_ACCOUNTS',
 'OCARDS_FLAG',
 'OCC_TYPE',
 'O_INCOMES',
 'PAY_AMOUNT_MONTH_1',
 'PAY_AMOUNT_MONTH_2',
 'PAY_AMOUNT_MONTH_3',
 'PAY_AMOUNT_MONTH_4',
 'PAY_AMOUNT_MONTH_5',
 'PAY_AMOUNT_MONTH_6',
 'PAY_MONTH_0',
 'PAY_MONTH_2',
 'PAY_MONTH_3',
 'PAY_MONTH_4',
 'PAY_MONTH_5',
 'PAY_MONTH_6',
 'P_ASSETS_VALUE',
 'P_MONTHLY_INCOME',
 'RES_TYPE',
 'S_FLAG',
 'VISA_FLAG',
 'MEAN_RES_CITY',
 'MEAN_PROF_CITY',
 'APP_SUB_TYPE=W',
 'APP_SUB_TYPE=Z',
 'APP_SUB_TYPE=C',
 'HOME_PHONE_FLAG=yes',
 'HOME_PHONE_FLAG=no',
 'P_ADDRESS_TYPE=A',
 'P_ADDRESS_TYPE=B',
 'SEX=female',
 'SEX=male',

# Anomália detekció

In [28]:
train_df = df_merged2[df_merged2['train_or_test'] == 'train'].copy()
test_df = df_merged2[df_merged2['train_or_test'] == 'test'].copy()

In [29]:
from sklearn.ensemble import IsolationForest

In [30]:
model = IsolationForest()

In [31]:
model.fit(train_df[bemeno_valtozok])
train_df['anomaly'] = model.predict(train_df[bemeno_valtozok])

In [32]:
train_df = train_df[train_df['anomaly'] == 1].copy()

## Modellezés

In [33]:
train_df.columns

Index(['AGE', 'AME_FLAG', 'APP_SUB_TYPE', 'BILL_AMOUNT_MONTH_1',
       'BILL_AMOUNT_MONTH_2', 'BILL_AMOUNT_MONTH_3', 'BILL_AMOUNT_MONTH_4',
       'BILL_AMOUNT_MONTH_5', 'BILL_AMOUNT_MONTH_6', 'BIRTH_CITY',
       'BIRTH_STATE', 'CARS_FLAG', 'CONF_CITIZEN_CARD_NUM', 'CONF_TAX_STATUS',
       'C_NAME_GIVEN', 'C_PRODUCT', 'DAY_OF_PAYMENT', 'EDU_LEVEL',
       'EMAIL_FLAG', 'HOME_PHONE_FLAG', 'ID', 'L_BALANCE', 'MATE_PROF_CODE',
       'MCARD_FLAG', 'MONTHS_IN_RES', 'M_IN_THE_JOB', 'M_STATUS', 'NAT',
       'NUMBERO_OF_BANK_ACCOUNTS', 'NUMBER_OF_DEPENDANTS',
       'NUMBER_OF_SPECIAL_BANK_ACCOUNTS', 'OCARDS_FLAG', 'OCC_TYPE',
       'O_INCOMES', 'PAY_AMOUNT_MONTH_1', 'PAY_AMOUNT_MONTH_2',
       'PAY_AMOUNT_MONTH_3', 'PAY_AMOUNT_MONTH_4', 'PAY_AMOUNT_MONTH_5',
       'PAY_AMOUNT_MONTH_6', 'PAY_MONTH_0', 'PAY_MONTH_2', 'PAY_MONTH_3',
       'PAY_MONTH_4', 'PAY_MONTH_5', 'PAY_MONTH_6', 'PROF_BOROUGH',
       'PROF_CITY', 'PROF_CODE', 'PROF_PHONE_AREA_CODE', 'PROF_PHONE_GIVEN',
       'PROF

In [34]:
# model3
from sklearn.ensemble import RandomForestClassifier
model3 = RandomForestClassifier(random_state=42, n_estimators=300, max_depth=15)
model3.fit(train_df[bemeno_valtozok], train_df['TARGET'])
prediction_rfc = model3.predict_proba(test_df[bemeno_valtozok])

In [35]:
prediction_rfc

array([[0.78314538, 0.21685462],
       [0.64275969, 0.35724031],
       [0.6243021 , 0.3756979 ],
       ...,
       [0.59993913, 0.40006087],
       [0.3973744 , 0.6026256 ],
       [0.5657942 , 0.4342058 ]])

In [36]:
# model4
from sklearn.ensemble import GradientBoostingClassifier
model4 = GradientBoostingClassifier(random_state=42, n_estimators=150, max_depth=10)
model4.fit(train_df[bemeno_valtozok], train_df['TARGET'])
prediction_gbc = model4.predict_proba(test_df[bemeno_valtozok])

In [37]:
prediction_gbc

array([[0.89952025, 0.10047975],
       [0.57184528, 0.42815472],
       [0.11727491, 0.88272509],
       ...,
       [0.73852827, 0.26147173],
       [0.22905331, 0.77094669],
       [0.45706088, 0.54293912]])

In [38]:
rfc_p1 = prediction_rfc[:, 1]
gbc_p1 = prediction_gbc[:, 1]

In [39]:
a = np.array([rfc_p1, gbc_p1])

In [40]:
a

array([[0.21685462, 0.35724031, 0.3756979 , ..., 0.40006087, 0.6026256 ,
        0.4342058 ],
       [0.10047975, 0.42815472, 0.88272509, ..., 0.26147173, 0.77094669,
        0.54293912]])

In [41]:
b = np.mean(a, axis=0)

In [42]:
test_df['mean_p1'] = b

In [43]:
submission_df = test_df[['ID', 'mean_p1']]
submission_df.columns = ['Id', 'Predicted']
submission_df.to_csv('proba9.csv', index=False)

In [133]:
myxval_multi([model3, model4], train_df, bemeno_valtozok)

0.7486386563717712