In [1]:
#numpy and pandas for data manipulation
import numpy as np
import pandas as pd
import gc

from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier

from sklearn.model_selection import cross_val_predict

In [2]:
samples = pd.read_csv('training.csv')

In [5]:
samples.shape

(307507, 1331)

In [3]:
def convert_types(df, print_info = False):
    
    original_memory = df.memory_usage().sum()
    
    # Iterate through each column
    for c in df:
        
        # Convert ids and booleans to integers
        if ('SK_ID' in c):
            df[c] = df[c].fillna(0).astype(np.int32)
            
        # Convert objects to category
        elif (df[c].dtype == 'object') and (df[c].nunique() < df.shape[0]):
            df[c] = df[c].astype('category')
        
        # Booleans mapped to integers
        elif list(df[c].unique()) == [1, 0]:
            df[c] = df[c].astype(bool)
        
        # Float64 to float32
        elif df[c].dtype == float:
            df[c] = df[c].astype(np.float32)
            
        # Int64 to int32
        elif df[c].dtype == int:
            df[c] = df[c].astype(np.int32)
        
    new_memory = df.memory_usage().sum()
    
    if print_info:
        print(f'Original Memory Usage: {round(original_memory / 1e9, 2)} gb.')
        print(f'New Memory Usage: {round(new_memory / 1e9, 2)} gb.')
    return df
        
        
samples = convert_types(samples)

In [7]:
round(samples.memory_usage().sum() / 1e9, 2)

1.62

In [4]:
EMPTY_THRESHOLD = 75
CORR_THRESHOLD = 0.9

In [5]:
def remove_missing_columns(train, threshold = 90):
    # Calculate missing stats for train
    train_miss = pd.DataFrame(train.isnull().sum())
    train_miss['percent'] = 100 * train_miss[0] / len(train)
    
    # list of missing columns for train and test
    missing_train_columns = list(train_miss.index[train_miss['percent'] > threshold])
    
    # Combine the two lists together
    missing_columns = list(set(missing_train_columns))
    
    # Print information
    print('There are %d columns with greater than %d%% missing values.' % (len(missing_columns), threshold))
    
    # Drop the missing columns and return
    return train.drop(columns = missing_columns)

samples = remove_missing_columns(samples, EMPTY_THRESHOLD)


There are 69 columns with greater than 75% missing values.


In [10]:
samples.shape

(307507, 1262)

In [13]:
samples['TARGET']

0          True
1         False
2         False
3         False
4         False
          ...  
307502    False
307503    False
307504    False
307505     True
307506    False
Name: TARGET, Length: 307507, dtype: bool

In [None]:
correlations = samples.drop(columns = ['TARGET']).corr().abs()


In [None]:
for c in samples.columns:
    if 'TARGET' in c:
        print(c)