In [4]:
import csv
import pandas as pd
import numpy as np
import sys

from fancyimpute import MICE, BiScaler, KNN, NuclearNormMinimization, SoftImpute

from sklearn.linear_model import RandomizedLasso

def drop_low_var_quality_features(df, missing_values_threshold, std_threshold):
    """
    Drop sparsely populated and near-constant columns from df, in place.

    Params:
    df: DataFrame to prune. It is modified in place and nothing is returned.
    missing_values_threshold: Maximum number of missing values for a column to be retained
    std_threshold: Minimum standard deviation for a column to be retained.
    """
    rows_count = df.shape[0]
    # dropna's `thresh` is a minimum count of non-missing values, so
    # "at most N missing" becomes "at least rows_count - N present".
    df.dropna(axis=1, thresh=rows_count - missing_values_threshold, inplace=True)

    # Compute the per-column standard deviation once (it is expensive on a wide
    # frame) and only over numeric columns: without numeric_only=True recent
    # pandas versions raise on string columns instead of skipping them.
    # Non-numeric columns therefore have no entry here and are always kept, and
    # a column whose std is NaN compares False against the threshold and is
    # kept as well.
    col_std = df.std(numeric_only=True)
    low_std_cols = col_std[col_std < std_threshold].index.values
    df.drop(low_std_cols, axis=1, inplace=True)

In [2]:
# Load the background table; low_memory=False makes pandas read the file in a
# single pass rather than in chunks.
df = pd.read_csv(filepath_or_buffer='background.csv', low_memory=False)
# Parse cf4fint as dates and replace it with the number of whole days elapsed
# since 1960-01-01 (astype(int) truncates the fractional day count).
df['cf4fint'] = (
    (pd.to_datetime(df['cf4fint']) - pd.to_datetime('1960-01-01'))
    / np.timedelta64(1, 'D')
).astype(int)

In [None]:
# Number of NaN cells (missing values summed over all columns)
# print sum(df.isnull().sum().tolist())
# Number of non-NaN cells
# print df.count().sum()
# Total elements
# print df.size
# Shape
# rows_count, cols_count = df.shape
# print (rows_count, cols_count)

In [None]:
# Prune df in place: drop columns with more than 200 missing values, then
# columns whose standard deviation is below 0.2.
drop_low_var_quality_features(df, 200, 0.2)
# Sanity checks on what is left. The prints use the parenthesised
# single-argument form so the cell runs on both Python 2 and Python 3.
# Ten largest per-column missing-value counts:
print(sorted(df.isnull().sum().tolist(), reverse=True)[:10])
# Thirty smallest per-column standard deviations (numeric columns only, so
# string columns cannot make std() raise on recent pandas):
print(sorted(df.std(numeric_only=True))[:30])
print(df.shape)

In [6]:
# Outcome table; later cells look up 'challengeID' and outcome columns such as
# 'grit' in it.
y_train_df = pd.read_csv(filepath_or_buffer="train.csv", low_memory=False)

In [29]:
def get_data_for_characteristic(X_train, Y_train, characteristic, get_only_complete_cases=False):
    """
    Build the aligned numeric feature frame and target series for one outcome.

    Params:
    X_train: Feature DataFrame with a 'challengeID' column. Not modified.
    Y_train: Outcome DataFrame with a 'challengeID' column and a column named
        by `characteristic`. Not modified.
    characteristic: Name of the outcome column in Y_train to use as the target.
    get_only_complete_cases: If truthy, keep only the rows whose numeric
        features contain no missing value. The target is filtered to exactly
        the same rows, so both return values always have the same length.

    Returns:
    A tuple (X_char, y_char): the numeric feature columns and the target
    values, both ordered by 'challengeID'.

    Raises:
    AssertionError if the 'challengeID' values of the selected feature rows do
    not match those of the rows that have a finite target value.
    """
    # Only rows with an observed (finite) target value can be used.
    y_char = Y_train[np.isfinite(Y_train[characteristic])]

    training_ids = y_char['challengeID'].tolist()
    X_char = X_train[X_train['challengeID'].isin(training_ids)]

    # Sort both sides by the shared key so that row i of X_char corresponds to
    # row i of y_char; their index labels come from different frames and are
    # not comparable.
    X_char = X_char.sort_values(by='challengeID')
    y_char = y_char.sort_values(by='challengeID')
    assert y_char['challengeID'].tolist() == X_char['challengeID'].tolist(), \
        "feature and target rows do not cover the same challengeIDs"

    # Keep numeric columns only.
    # NOTE(review): if 'challengeID' is numeric it stays among the returned
    # feature columns - confirm that this is intended.
    non_numeric_cols = X_char.select_dtypes(exclude=[np.number]).columns.values.tolist()
    X_char = X_char.drop(non_numeric_cols, axis=1)

    if get_only_complete_cases:
        # Positional mask: the two frames are row-aligned by the sort above, so
        # the same boolean array keeps matching rows in both. Dropping rows
        # from X_char alone would return a target longer than the features.
        complete_rows = X_char.notnull().all(axis=1).values
        X_char = X_char[complete_rows]
        y_char = y_char[complete_rows]

    return X_char, y_char[characteristic]

In [30]:
# Features and target for the 'grit' outcome; rows with missing feature
# values are kept (get_only_complete_cases is left at its default).
X_grit, y_grit = get_data_for_characteristic(
    X_train=df,
    Y_train=y_train_df,
    characteristic='grit',
)

In [33]:
print(X_grit.shape)

# X_grit_complete_cases is not assigned in any other cell, so build it here
# instead of relying on leftover kernel state. Only the feature frame (element
# 0 of the returned tuple) is needed for the shape comparison.
X_grit_complete_cases = get_data_for_characteristic(
    df, y_train_df, 'grit', get_only_complete_cases=True)[0]
print(X_grit_complete_cases.shape)

(1418, 8412)
(1296, 8412)


In [None]:
# Plain NumPy arrays for scikit-learn. `.values` replaces `.as_matrix()`, which
# recent pandas versions no longer provide; it returns the same arrays.
# NOTE(review): the recorded shapes (1418 rows vs 1296 complete cases) suggest
# X_grit still contains missing values, which scikit-learn estimators reject -
# presumably an imputation step belongs before the fit; confirm.
X_grit_mat, y_grit_mat = X_grit.values, y_grit.values

In [None]:
# Candidate score cut-offs: 1e-4, 1e-3, 1e-2, 1e-1.
thresholds = np.logspace(-4,-1,4)
# Fixed random_state keeps the 500 resampling rounds reproducible.
lasso = RandomizedLasso(alpha='aic', random_state=39, n_resampling=500)
lasso.fit(X_grit_mat, y_grit_mat)
stability_scores = lasso.scores_
# Only the smallest cut-off is applied. Bind it to a name so the report line
# below no longer refers to an undefined `threshold`.
threshold = thresholds[0]
# Positional indices of the features whose score exceeds the cut-off.
support = np.where(stability_scores > threshold)[0]
# Same text the Python 2 print statement produced ("<threshold> \t <support>"),
# written so that it also runs on Python 3.
print('%s \t %s' % (threshold, support))
# Feature frame restricted to the selected columns.
Xf = X_grit.iloc[:,support]