In [1]:
import csv
import pandas as pd
import numpy as np
import sys

from fancyimpute import MICE, BiScaler, KNN, NuclearNormMinimization, SoftImpute

In [2]:
# Load the background feature table. low_memory=False makes pandas infer each
# column's dtype from the whole file instead of chunk by chunk.
df = pd.read_csv('background.csv', low_memory=False)

# Replace the `cf4fint` date column with the (integer) number of days elapsed
# since 1960-01-01, so that it becomes an ordinary numeric feature.
# NOTE(review): 1960-01-01 looks like the Stata date epoch -- confirm.
cf4fint_dates = pd.to_datetime(df['cf4fint'])
epoch = pd.to_datetime('1960-01-01')
one_day = np.timedelta64(1, 'D')
df['cf4fint'] = ((cf4fint_dates - epoch) / one_day).astype(int)

In [7]:
def drop_low_var_quality_features(df, missing_values_threshold, std_threshold):
    """
    Drop sparse and near-constant columns from ``df`` IN PLACE.

    Two filters are applied, in this order:
      1. columns with more than ``missing_values_threshold`` missing values
         are removed;
      2. of the remaining numeric columns, those whose standard deviation is
         below ``std_threshold`` are removed.

    Non-numeric columns are never removed by the second filter, and neither
    are columns whose standard deviation is NaN (``NaN < x`` is False).

    Params:
    df: DataFrame to prune. It is mutated; the function returns None.
    missing_values_threshold: Maximum number of missing values for a column to be retained
    std_threshold: Minimum standard deviation for a column to be retained.
    """
    rows_count = df.shape[0]

    # `thresh` is the minimum number of NON-missing values a column needs in
    # order to survive, hence "rows - allowed missing".
    df.dropna(axis=1, thresh=rows_count - missing_values_threshold, inplace=True)

    # Compute the per-column std exactly once. numeric_only=True is explicit
    # on purpose: pandas >= 2.0 raises TypeError on non-numeric columns instead
    # of silently skipping them as older versions did.
    column_std = df.std(numeric_only=True)
    low_variance_columns = column_std.index[column_std < std_threshold]
    df.drop(low_variance_columns, axis=1, inplace=True)

In [None]:
# Missing-value overview of the table. `rows_count` / `cols_count` are kept
# as notebook-level names because a later cell uses `rows_count`.
rows_count, cols_count = df.shape

# Total number of missing (NaN) cells across the whole table
print(df.isnull().sum().sum())
# Total number of populated (non-NaN) cells
print(df.count().sum())
# Total number of cells (missing + populated)
print(df.size)
# Shape, printed as a (rows, columns) tuple
print((rows_count, cols_count))

In [None]:
# Build `fuller`: a copy of `df` restricted to columns with at most 200
# missing values. dropna returns a new frame here, so `df` is left untouched.
# `rows_count` comes from the overview cell above.
# Debug aid, run on `df` or `fuller` to see the ten worst columns:
#   sorted(df.isnull().sum().tolist(), reverse=True)[:10]
min_non_null = rows_count - 200
fuller = df.dropna(axis=1, thresh=min_non_null)

# Total number of missing (NaN) cells that remain after the filter
print(fuller.isnull().sum().sum())
print(fuller.shape)

In [None]:
# Minimum standard deviation a numeric column needs in order to be kept.
threshold = 0.2

# Compute the per-column std once (the original evaluated it twice).
# numeric_only=True is explicit because pandas >= 2.0 raises TypeError on
# non-numeric columns instead of silently skipping them; non-numeric columns
# are therefore never dropped by this filter.
fuller_std = fuller.std(numeric_only=True)
high_dev = fuller.drop(fuller_std.index[fuller_std < threshold], axis=1)
high_dev.shape

In [None]:
# A bare `high_dev.shape` that is not the last expression of a cell is never
# displayed, so print it explicitly.
print(high_dev.shape)
# Total number of missing (NaN) cells left in `high_dev`
print(high_dev.isnull().sum().sum())

In [5]:
# Ten largest per-column missing-value counts.
print(df.isnull().sum().nlargest(10).tolist())

# 300 smallest per-column standard deviations.
# numeric_only=True: pandas >= 2.0 raises on non-numeric columns otherwise.
# sort_values() is used instead of sorted() because it places NaN last,
# whereas sorted() gives an unreliable order when NaN is present.
print(df.std(numeric_only=True).sort_values().head(300).tolist())

print(df.shape)

[4242, 4242, 4242, 4242, 4242, 4242, 4242, 4242, 4242, 4242]
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0

In [8]:
# Prune `df` in place: keep columns with at most 200 missing values and a
# standard deviation of at least 0.2.
drop_low_var_quality_features(df, 200, 0.2)

# Same diagnostics as before the filter, for comparison.
# Ten largest per-column missing-value counts.
print(df.isnull().sum().nlargest(10).tolist())

# 300 smallest per-column standard deviations.
# numeric_only=True: pandas >= 2.0 raises on non-numeric columns otherwise.
# sort_values() places NaN last; sorted() gives an unreliable order with NaN.
print(df.std(numeric_only=True).sort_values().head(300).tolist())

print(df.shape)

[109, 109, 109, 109, 109, 109, 109, 109, 109, 109]
[0.20646126040539606, 0.22672438303586487, 0.23732405529262637, 0.28188827806328803, 0.29183226129759943, 0.3094809223841305, 0.31956327177241473, 0.33146906498108625, 0.33498349930822158, 0.33653339600207366, 0.34310115763896304, 0.3595688982512068, 0.37360428090397396, 0.37905104699471293, 0.38612713315902258, 0.38852297909439037, 0.40129862466140814, 0.40140284284664379, 0.40647016972637673, 0.41111127871870828, 0.41254408007960774, 0.41453877252573379, 0.41473422944819527, 0.43434899193154725, 0.43475192360299614, 0.43932251474690198, 0.44091612147632681, 0.4416197132718005, 0.44387878980275919, 0.44673901740559829, 0.45168791614592074, 0.45689880630135898, 0.45824982944447651, 0.4595229784919071, 0.46147185967241877, 0.46205977520978525, 0.46268538894936456, 0.46357312755446145, 0.46520314883803338, 0.46589183653252964, 0.47075028712513095, 0.47726366358034794, 0.47749040454597119, 0.47773474826978723, 0.4839457971145551, 0.487577

In [9]:
# Load the training table; later cells read its `challengeID` column to pick
# the matching rows of `df`.
Y_train = pd.read_csv(
    'train.csv',
    low_memory=False,
)

In [14]:
# Sanity check: the join key must still be a column of `df` after filtering.
'challengeID' in df.columns

True

In [17]:
# challengeID values that appear in the training table, as a plain list.
training_ids = Y_train.loc[:, 'challengeID'].tolist()

In [23]:
# Keep only the rows of `df` whose challengeID appears in the training table.
in_training_set = df['challengeID'].isin(training_ids)
X_train = df.loc[in_training_set]

In [30]:
# Put features and labels in the same row order by sorting both on the shared
# key. The *_sorted names are kept as notebook-level variables.
Y_train_sorted = Y_train.sort_values(by='challengeID')
X_train_sorted = X_train.sort_values(by='challengeID')
X_train, Y_train = X_train_sorted, Y_train_sorted

# Guard: both frames must now list exactly the same challengeIDs, in the same
# order, so that row i of X_train corresponds to row i of Y_train.
assert X_train['challengeID'].tolist() == Y_train['challengeID'].tolist()