In [2]:
#numpy and pandas for data manipulation
import numpy as np
import pandas as pd

from sklearn.model_selection import KFold

from sklearn.model_selection import StratifiedKFold

from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import cross_val_predict

from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

from sklearn.model_selection import train_test_split
from sklearn import metrics

#sklearn preprocessing for dealing with categorical variables
from sklearn.preprocessing import LabelEncoder

from itertools import product


# Load the preprocessed sample table from disk.
samples = pd.read_csv('preprocessed-75.csv')

# One encoder instance, re-fitted for each qualifying column.
le = LabelEncoder()
le_count = 0

# Encode boolean columns as integer labels; every other column is left untouched.
for col in samples.columns:
    column = samples[col]
    # Skip anything that is not boolean or that has more than two distinct values.
    if column.dtype != 'bool' or len(column.unique()) > 2:
        continue
    samples[col] = le.fit_transform(column)
    # Count the columns that were encoded.
    le_count += 1

print('%d columns were label encoded.' % le_count)

# Keep a reference to the target column and show it.
train_label = samples['TARGET']
print(train_label)

# One-hot encode the remaining categorical variables.
samples = pd.get_dummies(samples)

# Rank every column by the absolute value of its correlation with TARGET, strongest first.
correlations = samples.corr()['TARGET'].abs().sort_values(ascending=False)

# Keep the 250 top-ranked columns and drop everything ranked after them.
# TARGET is part of the ranking (its self-correlation is 1), so it counts towards the 250.
weak_columns = correlations.iloc[250:].index.tolist()
samples = samples.drop(columns=weak_columns)

8 columns were label encoded.
0         1
1         0
2         0
3         0
4         0
         ..
307502    0
307503    0
307504    0
307505    1
307506    0
Name: TARGET, Length: 307507, dtype: int64


In [3]:
# Display the column labels of `samples`.
samples.columns

Index(['TARGET', 'NAME_CONTRACT_TYPE', 'CODE_GENDER', 'FLAG_OWN_CAR',
       'AMT_CREDIT', 'REGION_POPULATION_RELATIVE', 'DAYS_BIRTH',
       'DAYS_EMPLOYED', 'DAYS_REGISTRATION', 'DAYS_ID_PUBLISH',
       ...
       'OCCUPATION_TYPE_Drivers', 'OCCUPATION_TYPE_Laborers',
       'OCCUPATION_TYPE_Low-skill Laborers',
       'ORGANIZATION_TYPE_Business Entity Type 3',
       'ORGANIZATION_TYPE_Self-employed', 'ORGANIZATION_TYPE_XNA',
       'FONDKAPREMONT_MODE_reg oper account', 'HOUSETYPE_MODE_block of flats',
       'WALLSMATERIAL_MODE_Panel', 'EMERGENCYSTATE_MODE_No'],
      dtype='object', length=250)

In [4]:
# Number of columns in `samples`.
samples.shape[1]

250

In [5]:
# Display the TARGET column of `samples`.
samples['TARGET']

0         1
1         0
2         0
3         0
4         0
         ..
307502    0
307503    0
307504    0
307505    1
307506    0
Name: TARGET, Length: 307507, dtype: int64

In [6]:
# Write `samples` to a CSV file, leaving out the row index.
samples.to_csv('features-250.csv',index=False)

In [16]:
def remove_missing_columns(train, threshold = 90):
    """Return ``train`` without the columns whose share of missing values is too high.

    Parameters
    ----------
    train : pandas.DataFrame
        Table to filter. It is not modified; a new frame is returned.
    threshold : number, default 90
        A column is dropped when its percentage of null entries is strictly
        greater than this value.

    Returns
    -------
    pandas.DataFrame
        ``train`` minus the dropped columns.

    Side effect: prints how many columns exceeded the threshold.
    """
    # Percentage of null entries in each column.
    percent_missing = 100 * train.isnull().sum() / len(train)

    # Distinct labels of the columns whose missing share is above the threshold.
    over_threshold = (percent_missing > threshold).to_numpy()
    missing_columns = percent_missing.index[over_threshold].unique().tolist()

    # Report the count before dropping.
    print('There are %d columns with greater than %d%% missing values.' % (len(missing_columns), threshold))

    return train.drop(columns = missing_columns)
# Drop every column of `samples` that is more than 70% missing.
new_samples = remove_missing_columns(samples, threshold=70)

There are 57 columns with greater than 70% missing values.


In [19]:
# Number of columns left in `new_samples`.
new_samples.shape[1]

193

In [21]:
# Correlation of each column of `new_samples` with TARGET, taken as the
# TARGET column of the full pairwise correlation matrix.
new_corr = new_samples.corr()['TARGET']

In [24]:
# Display the correlations sorted from most negative to most positive.
new_corr.sort_values()

EXT_SOURCE_3                                  -0.178926
EXT_SOURCE_2                                  -0.160471
EXT_SOURCE_1                                  -0.155317
previous_NAME_CONTRACT_STATUS_Approved_mean   -0.063526
client_installments_AMT_PAYMENT_min_sum       -0.058399
                                                 ...   
bureau_CREDIT_ACTIVE_Active_mean               0.077356
previous_NAME_CONTRACT_STATUS_Refused_mean     0.077681
DAYS_BIRTH                                     0.078242
bureau_DAYS_CREDIT_mean                        0.089731
TARGET                                         1.000000
Name: TARGET, Length: 193, dtype: float64

In [25]:
# Write `new_samples` to a CSV file, leaving out the row index.
new_samples.to_csv('features-193.csv',index=False)