In [19]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from pandas.api.types import is_numeric_dtype

In [20]:
train_data = pd.read_csv('data/train_fact.csv')
ext_df = pd.read_csv('data/external_data.csv')
prev_df = pd.read_csv('data/prev_filtered.csv')

# Data preparation

In [22]:
train_data['income_per_child'] = train_data['AMT_INCOME_TOTAL'] / (train_data['CNT_CHILDREN'] + 1)

In [23]:
prev_agg = prev_df.groupby('SK_ID_CURR').agg({
    'AMT_CREDIT': ['min', 'max', 'mean', 'sum']
})
prev_agg.columns = pd.Index(['PREV_' + e[0] + "_" + e[1].upper() for e in prev_agg.columns.tolist()])

In [24]:
ext_agg = ext_df.groupby('SK_ID_CURR').agg({
    'AMT_CREDIT_SUM': ['min', 'max', 'mean', 'sum']
})
ext_agg.columns = pd.Index(
    ['EXT_' + e[0] + "_" + e[1].upper() for e in ext_agg.columns.tolist()])

In [25]:
train_data = pd.merge(train_data, prev_agg, how='left', on='SK_ID_CURR')
train_data = pd.merge(train_data, ext_agg, how='left', on='SK_ID_CURR')

In [26]:
features =  list(ext_agg.columns) + list(prev_agg.columns) + [col for col in train_data.columns if is_numeric_dtype(train_data[col])]

In [27]:
X = train_data[features]
X = X.drop('TARGET', axis=1)
y = train_data.TARGET
X = X.fillna(0)

In [28]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

# Let's train our model

In [30]:
logreg = LogisticRegression(solver='lbfgs')
logreg.fit(X_train, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False)

In [31]:
y_preds = logreg.predict_proba(X_test)[:,1]

In [33]:
roc_auc_score(y_test, y_preds)

0.5839480452451165

# So what have we done?

    We hold out 25% of the train set and do nothing with it
    We train the logistic regression on the other 75% of the data and create a model
    We make predictions on the 25% of the data based on this model
    We evaluate based on rocauc score wether or not the decision tree made the correct prediction


# Your turn!

- Add features from the previous step
- Test out different models
- Add more features
- Iterate, explore, create features, test your validation again

Good luck!
