In [None]:
import pandas as pd

def read_data(filename, sample_size=None, random_state=None):
    """Load a CSV file into a DataFrame, optionally keeping a random sample of rows.

    Parameters
    ----------
    filename : str or path-like
        Location of the CSV file, passed straight to ``pandas.read_csv``.
    sample_size : int, optional
        Number of rows to draw (without replacement) via ``DataFrame.sample``.
        ``None`` (the default) returns every row in file order.
    random_state : int or numpy random generator, optional
        Seed forwarded to ``DataFrame.sample`` so that the same rows are drawn
        on every run. ``None`` (the default) keeps the previous unseeded
        behaviour.

    Returns
    -------
    pandas.DataFrame
        The full table, or ``sample_size`` randomly chosen rows of it.

    Raises
    ------
    ValueError
        Propagated from ``DataFrame.sample`` when ``sample_size`` exceeds the
        number of rows in the file.
    """
    df = pd.read_csv(filename)
    if sample_size is None:
        return df
    return df.sample(sample_size, random_state=random_state)

In [None]:
def transform_data(df):
    """Split a DataFrame into per-row feature dicts and an array of targets.

    The function works on the frame it is given (it previously read a
    notebook-level global, which made every call return the training rows
    regardless of the argument).

    Parameters
    ----------
    df : pandas.DataFrame
        Input table. A ``'TARGET'`` column, when present, is treated as the
        label; every other column is treated as a feature.

    Returns
    -------
    data : list of dict
        One ``{column: value}`` dict per row, containing every column except
        ``'TARGET'``, in the frame's row order.
    y : numpy.ndarray
        The ``'TARGET'`` values in row order. When the frame has no
        ``'TARGET'`` column (unlabelled data), an array of NaN with one entry
        per row is returned so that ``len(y) == len(data)`` still holds.
    """
    import numpy as np

    target_col = 'TARGET'
    features = [col for col in df.columns if col != target_col]

    records = df.to_dict('records')
    data = [{key: row[key] for key in features} for row in records]

    if target_col in df.columns:
        y = np.array([row[target_col] for row in records])
    else:
        # Unlabelled frame: there is nothing to collect, so return NaN
        # placeholders instead of failing with a KeyError.
        y = np.full(len(records), np.nan)

    return data, y

In [None]:
def split_data(data, y, train_size=0.8, random_state=None):
    """Split features and targets into stratified training and validation parts.

    Parameters
    ----------
    data : sequence
        Feature rows; anything ``sklearn.model_selection.train_test_split``
        accepts.
    y : array-like
        Targets aligned with ``data``. Also used as the stratification labels,
        so class proportions are kept in both parts.
    train_size : float or int, optional
        Forwarded to ``train_test_split``. Defaults to ``0.8``, the value that
        was previously hard-coded.
    random_state : int or numpy RandomState, optional
        Seed forwarded to ``train_test_split`` to make the split reproducible.
        ``None`` (the default) keeps the previous unseeded behaviour.

    Returns
    -------
    tuple
        ``(data_train, data_val, y_train, y_val)``.

    Notes
    -----
    Prints the size of both parts as a quick sanity check.
    """
    from sklearn.model_selection import train_test_split

    data_train, data_val, y_train, y_val = train_test_split(
        data,
        y,
        train_size=train_size,
        stratify=y,
        random_state=random_state,
    )
    print(f'data_train: {len(data_train)}')
    print(f'data_val: {len(data_val)}')

    return data_train, data_val, y_train, y_val

In [None]:
def process_train_data(data_train):
    """Fit the preprocessing steps on the training rows and transform them.

    Three steps are fitted and applied in order:

    1. ``DictVectorizer`` turns the list of feature dicts into a matrix.
    2. An imputer (default settings) fills in NaN values.
    3. ``MaxAbsScaler`` scales each column so that different features have
       roughly the same magnitude.

    Parameters
    ----------
    data_train : list of dict
        Training rows as ``{feature: value}`` mappings.

    Returns
    -------
    X_train : matrix
        The output of the last step's ``fit_transform``.
    processors : tuple
        ``(vectorizer, imputer, scaler)`` — the fitted steps, in the order they
        must be re-applied to validation/test data.
    """
    from sklearn.feature_extraction import DictVectorizer
    from sklearn.preprocessing import MaxAbsScaler

    # ``sklearn.preprocessing.Imputer`` was removed in scikit-learn 0.22;
    # ``SimpleImputer`` is its replacement. Fall back to the legacy class only
    # on installations too old to ship ``sklearn.impute``.
    try:
        from sklearn.impute import SimpleImputer as Imputer
    except ImportError:
        from sklearn.preprocessing import Imputer

    processors = (DictVectorizer(), Imputer(), MaxAbsScaler())

    X_train = data_train
    for processor in processors:
        X_train = processor.fit_transform(X_train)

    # The fitted steps must be reused on validation/test data.
    return X_train, processors

In [None]:
def process_test_data(data_test, processors):
    """Run already-fitted preprocessing steps over new data, one after another.

    Parameters
    ----------
    data_test : object
        Raw input handed to the first step's ``transform``.
    processors : iterable
        Objects exposing ``transform``; applied in iteration order, each one
        receiving the previous step's output.

    Returns
    -------
    object
        The output of the last step, or ``None`` when ``processors`` is empty.
    """
    transformed = None
    for step in processors:
        # Until a step has produced a (non-None) result, feed the raw input.
        source = data_test if transformed is None else transformed
        transformed = step.transform(source)

    return transformed

In [None]:
def process_data(data_train, data_val, y_train, y_val):
    """Preprocess a training/validation pair with steps fitted on training data only.

    The training rows go through ``process_train_data``, which also returns the
    fitted steps; those same steps are then applied to the validation rows via
    ``process_test_data``. The targets are passed through untouched.

    Returns
    -------
    tuple
        ``(X_train, X_val, y_train, y_val)``.
    """
    X_train, fitted_steps = process_train_data(data_train)
    return (
        X_train,
        process_test_data(data_val, processors=fitted_steps),
        y_train,
        y_val,
    )

In [None]:
def plot_roc_curve(fpr, tpr, roc_auc):
    """Plot a ROC curve together with a dashed diagonal reference line.

    Parameters
    ----------
    fpr : sequence
        False positive rates (x values of the curve).
    tpr : sequence
        True positive rates (y values of the curve).
    roc_auc : float
        Area under the curve; shown in the legend with two decimals.

    Uses matplotlib's pyplot interface and ends with ``plt.show()``.
    """
    import matplotlib.pyplot as plt

    line_width = 2
    curve_label = 'ROC curve (area = %0.2f)' % roc_auc

    plt.plot(fpr, tpr, color='darkorange', lw=line_width, label=curve_label)
    # Dashed diagonal from (0, 0) to (1, 1) as a visual baseline.
    plt.plot([0, 1], [0, 1], color='navy', lw=line_width, linestyle='--')

    # The y range extends slightly past 1 so the top of the curve is not clipped.
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])

    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic example')
    plt.legend(loc="lower right")

    plt.show()

In [None]:
def train_predict(X_train, X_val, y_train, y_val):
    """Fit a class-balanced logistic regression and score the validation rows.

    Parameters
    ----------
    X_train : matrix
        Training features; must expose ``.shape`` (used for the progress
        message).
    X_val : matrix
        Rows to score.
    y_train : array-like
        Training targets. Must contain the label ``1``.
    y_val : array-like
        Not used by this function; kept so existing callers keep working.

    Returns
    -------
    numpy.ndarray
        For every row of ``X_val``, the predicted probability of class ``1``.

    Raises
    ------
    ValueError
        If ``1`` is not among the classes the model was fitted on.

    Notes
    -----
    Prints the number of training samples and the fitting time.
    """
    import time
    from sklearn.linear_model import LogisticRegression

    model = LogisticRegression(class_weight='balanced')

    print(f'Fitting model on {X_train.shape[0]} samples...')
    # perf_counter is a monotonic clock, so the elapsed time cannot be skewed
    # by system clock adjustments.
    start = time.perf_counter()
    model.fit(X_train, y_train)
    elapsed = time.perf_counter() - start
    print('Finished model training in %.3f seconds.' % elapsed)

    # predict_proba has one column per class, ordered like model.classes_;
    # pick the column belonging to the positive class (label 1).
    pos_idx = list(model.classes_).index(1)
    return model.predict_proba(X_val)[:, pos_idx]

In [None]:
# Load a 10,000-row random sample of the training file, then build the
# training/validation matrices from it.
filename = 'data/application_train.csv'
train_df = read_data(filename=filename, sample_size=10000)

# A bare `train_df.head(20)` used to sit here. In the middle of a cell its
# result is discarded (only a cell's last expression is displayed), so it was
# removed; run it in a cell of its own to preview the sample.
data, y = transform_data(train_df)
data_train, data_val, y_train, y_val = split_data(data, y)
X_train, X_val, y_train, y_val = process_data(data_train, data_val, y_train, y_val)

In [None]:
from sklearn.metrics import classification_report, roc_auc_score, roc_curve

# Fit on the training split and score the validation split.
y_score = train_predict(X_train, X_val, y_train, y_val)
# Area under the ROC curve of those scores against the validation targets.
roc_auc = roc_auc_score(y_val, y_score)
# Curve points, treating label 1 as the positive class; the third return
# value of roc_curve is not needed here.
fpr, tpr, _ = roc_curve(y_val, y_score, pos_label=1)

plot_roc_curve(fpr, tpr, roc_auc)
print(f'Area under ROC: {roc_auc}')

In [None]:
# Training side: read a 10,000-row random sample, split it into feature dicts
# and targets, then fit the preprocessing steps on it.
train_filename = 'data/application_train.csv'
train_df = read_data(filename=train_filename, sample_size=10000)
data_train, y_train = transform_data(train_df)
X_train, processors = process_train_data(data_train)

# Test side: read the whole file (no sampling) and push it through the steps
# that were fitted on the training data above.
test_filename = 'data/application_test.csv'
test_df = read_data(filename=test_filename)
data_test, y_test = transform_data(test_df)
X_test = process_test_data(data_test, processors=processors)

In [None]:
# Train on all of the sampled training data (no validation hold-out this time)
# and score the test rows.
# y_score is whatever train_predict returns for X_test.
y_score = train_predict(X_train, X_test, y_train, y_test)

In [None]:
# Pair every score with the SK_ID_CURR of the row at the same position in
# data_test, producing one record per score.
predictions = [
    {'SK_ID_CURR': data_test[idx]['SK_ID_CURR'], 'TARGET': score}
    for idx, score in enumerate(y_score)
]

# Build the frame from the list of records in one go, then display it.
out_df = pd.DataFrame(data=predictions)
out_df