In [None]:
import csv
import pandas as pd
import numpy as np
import sys

from fancyimpute import MICE, BiScaler, KNN, NuclearNormMinimization, SoftImpute

In [None]:
# Read the background data set.
df = pd.read_csv('background.csv', low_memory=False)

# Re-express the cf4fint dates as a whole number of days since 1960-01-01.
days_since_1960 = (pd.to_datetime(df['cf4fint']) - pd.to_datetime('1960-01-01')) / np.timedelta64(1, 'D')
df['cf4fint'] = days_since_1960.astype(int)

In [None]:
def drop_low_var_quality_features(df, missing_values_threshold, std_threshold):
    """
    Drop, in place, the columns of `df` that have too many missing values or
    too little spread.

    Params:
    df: DataFrame to prune. It is modified in place; nothing is returned.
    missing_values_threshold: Maximum number of missing values for a column to be retained
    std_threshold: Minimum standard deviation for a column to be retained.
    """
    rows_count = df.shape[0]
    # dropna's `thresh` is the minimum number of NON-missing values a column
    # needs, hence the conversion from a "maximum missing" count.
    df.dropna(axis=1, thresh=rows_count - missing_values_threshold, inplace=True)

    # Compute the standard deviations once. numeric_only=True keeps the call
    # working when the frame holds non-numeric columns (pandas >= 2.0 raises
    # without it); such columns are never candidates for the variance filter.
    col_std = df.std(numeric_only=True)
    low_spread_cols = col_std[col_std < std_threshold].index
    df.drop(low_spread_cols, axis=1, inplace=True)

In [None]:
# Total number of missing (NaN) cells in the frame
print(df.isnull().sum().sum())
# Total number of non-missing cells
print(df.count().sum())
# Total number of cells
print(df.size)
# Shape (rows, columns)
rows_count, cols_count = df.shape
print((rows_count, cols_count))

In [None]:
# Keep only the columns with at most 200 missing values: `thresh` is the
# minimum number of non-missing values a column needs in order to be kept.
# print sorted(df.isnull().sum().tolist(), reverse=True)[:10]
min_non_missing = rows_count - 200
fuller = df.dropna(axis=1, thresh=min_non_missing)
# print sorted(fuller.isnull().sum().tolist(), reverse=True)[:10]
# Total number of missing cells left in the reduced frame
print(fuller.isnull().sum().sum())
print(fuller.shape)

In [None]:
# Minimum standard deviation a column must reach to be kept
threshold = 0.2

# Drop every column of `fuller` whose standard deviation is below the threshold
fuller_std = fuller.std()
low_std_cols = fuller_std[fuller_std < threshold].index.values
high_dev = fuller.drop(low_std_cols, axis=1)
high_dev.shape

In [None]:
# A bare expression is only displayed when it is the last statement of a cell,
# so the shape must be printed explicitly to appear next to the NaN count.
print(high_dev.shape)
# Total number of missing cells remaining in high_dev
print(sum(high_dev.isnull().sum().tolist()))

In [None]:
# Report on df: the ten largest per-column missing-value counts, the first 300
# entries of the sorted column standard deviations, and the shape.
missing_per_col = df.isnull().sum().tolist()
print(sorted(missing_per_col, reverse=True)[:10])
col_std_sorted = sorted(df.std())
print(col_std_sorted[:300])
print(df.shape)

In [None]:
# Prune df in place: at most 200 missing values and a standard deviation of at
# least 0.2 per retained column. Then print the same report as before.
drop_low_var_quality_features(df, missing_values_threshold=200, std_threshold=0.2)
missing_per_col = df.isnull().sum().tolist()
print(sorted(missing_per_col, reverse=True)[:10])
col_std_sorted = sorted(df.std())
print(col_std_sorted[:300])
print(df.shape)

In [None]:
# Read the training outcomes.
Y_train = pd.read_csv('train.csv', low_memory=False)

In [None]:
# Check that the challengeID column is still present in df
'challengeID' in df.columns

In [None]:
# challengeID values that appear in the training outcomes
training_ids = Y_train.loc[:, 'challengeID'].tolist()

In [None]:
# Keep only the rows of df whose challengeID is one of the training ids
in_training = df['challengeID'].isin(training_ids)
X_train = df[in_training]

In [None]:
# Order both frames by challengeID, then verify that row i of X_train and
# row i of Y_train carry the same challengeID.
X_train_sorted = X_train.sort_values('challengeID')
Y_train_sorted = Y_train.sort_values('challengeID')
X_train, Y_train = X_train_sorted, Y_train_sorted
assert X_train['challengeID'].tolist() == Y_train['challengeID'].tolist()

In [None]:
def get_data_for_characteristic(X_train, Y_train, characteristic):
    """
    Select the training rows for which `characteristic` is observed.

    Rows of X_train and Y_train are matched by position, not by index label,
    so both must already be in the same row order.

    Params:
    X_train: DataFrame of features, one row per row of Y_train.
    Y_train: DataFrame of outcomes containing a `characteristic` column.
    characteristic: Name of the outcome column to extract.

    Returns:
    (X_train_char, char): the rows of X_train and the Series of outcome
    values at the positions where the outcome is not null.

    Raises:
    ValueError: if X_train and Y_train have different numbers of rows, since
    positional matching would then pair up unrelated rows.
    """
    if len(X_train) != len(Y_train):
        raise ValueError(
            "X_train and Y_train must have the same number of rows "
            "(got %d and %d)" % (len(X_train), len(Y_train))
        )

    all_char = Y_train[characteristic]  # This is a Series

    # Positions of the rows where the characteristic is not NA. flatnonzero
    # yields a flat integer array, whereas np.where would wrap it in a 1-tuple
    # that .iloc would have to interpret as a multi-axis indexer.
    char_defined = np.flatnonzero(all_char.notnull().values)
    char = all_char.iloc[char_defined]
    X_train_char = X_train.iloc[char_defined]

    return X_train_char, char

In [None]:
# Remove every non-numeric column from X_train. The drop is done in place, so
# any other name bound to the same frame sees the change as well.
non_numeric_frame = X_train.select_dtypes(exclude=[np.number])
non_numeric_cols = non_numeric_frame.columns.values.tolist()
X_train.drop(labels=non_numeric_cols, axis=1, inplace=True)

# Features and outcome values for the rows where 'grit' is not null
X_grit, y_grit = get_data_for_characteristic(X_train, Y_train, characteristic='grit')

In [None]:
# Shape of X_grit, then shape of the subset of its rows without any missing value
print(X_grit.shape)
X_grit_complete_cases = X_grit.dropna(axis=0)
X_grit_complete_cases.shape

In [None]:
# .values exposes the underlying NumPy array. It replaces as_matrix(), which
# was deprecated in pandas 0.23 and removed in pandas 1.0.
X_grit_mat, y_grit_mat = X_grit.values, y_grit.values

In [None]:
from sklearn.linear_model import RandomizedLasso

In [None]:
# Candidate cut-offs for the stability scores: 1e-4, 1e-3, 1e-2, 1e-1
thresholds = np.logspace(-4, -1, 4)
lasso = RandomizedLasso(alpha='aic', random_state=39, n_resampling=500)
lasso.fit(X_grit_mat, y_grit_mat)
stability_scores = lasso.scores_

# Keep the feature positions whose score exceeds the smallest cut-off
score_cutoff = thresholds[0]
support = np.where(stability_scores > score_cutoff)[0]
# Report the cut-off that was actually applied. The earlier code printed
# `threshold`, the unrelated standard-deviation cut-off, by mistake.
print('%s \t %s' % (score_cutoff, support))
Xf = X_grit.iloc[:, support]