In [1]:
import pandas as pd
from datetime import datetime
import numpy as np
import pickle

# Function definition

In [2]:
def manage_current(current):
    """Prepare the current-loan performance table for modelling.

    The frame is modified in place and the very same object is returned:

    * ``good_bad_flag`` labels are recoded (``'Good'`` -> 1, ``'Bad'`` -> 0);
    * the ``systemloanid``, ``approveddate`` and ``creationdate`` columns
      are removed;
    * ``referredby`` becomes a 0/1 indicator: 1 when the cell holds a
      string, 0 for anything else.
    """
    label_codes = {'Good': 1, 'Bad': 0}
    unused_columns = ['systemloanid', 'approveddate', 'creationdate']

    current.replace({'good_bad_flag': label_codes}, inplace=True)
    current.drop(columns=unused_columns, inplace=True)
    # isinstance() is True only for real strings, so NaN (a float) maps to 0.
    current['referredby'] = [int(isinstance(ref, str)) for ref in current['referredby']]
    return current

# Display setting only: a width of None removes the fixed character width
# pandas otherwise uses when printing frames.
pd.set_option('display.width', None)

# Reference date, captured once when this cell runs; manage_demograficos
# reads it to convert birth dates into ages.
today = datetime.today()

def _age_in_years(birthdate, reference_date):
    """Return the number of whole years between ``birthdate`` and ``reference_date``."""
    # The tuple comparison is True (and so subtracts 1) when the birthday has
    # not been reached yet in the reference year.
    birthday_pending = (reference_date.month, reference_date.day) < (birthdate.month, birthdate.day)
    return reference_date.year - birthdate.year - birthday_pending


def manage_demograficos(demograficos, reference_date=None):
    """Prepare the demographic table for modelling.

    The frame is modified in place and the same object is returned.

    Parameters
    ----------
    demograficos : pandas.DataFrame
        Must contain a ``birthdate`` column whose values expose ``.year``,
        ``.month`` and ``.day`` (e.g. parsed timestamps).
    reference_date : datetime-like, optional
        Date the age is computed against. Defaults to the module-level
        ``today`` so existing callers behave as before.

    Returns
    -------
    pandas.DataFrame
        ``demograficos`` with ``birthdate`` replaced by ``edad`` (age in
        whole years) and missing categorical values set to ``"UNKNOWN"``.

    Notes
    -----
    Only the categorical columns are filled with ``"UNKNOWN"``. Filling every
    column would write a string into numeric columns (including ``edad``)
    and turn them into object dtype; their NaN values are now left as NaN.
    """
    if reference_date is None:
        reference_date = today  # module-level date captured when the notebook cell ran

    demograficos['edad'] = demograficos['birthdate'].apply(
        lambda born: _age_in_years(born, reference_date))
    del demograficos['birthdate']

    categorical_cols = ['bank_account_type', 'bank_name_clients', 'bank_branch_clients',
                        'employment_status_clients', 'level_of_education_clients']
    # Tolerate frames that carry only some of the categorical columns.
    present = [col for col in categorical_cols if col in demograficos.columns]
    if present:
        demograficos[present] = demograficos[present].fillna("UNKNOWN")
    return demograficos

def get_values(df):
    """Summarise the previous loans of a single customer.

    Parameters
    ----------
    df : pandas.DataFrame
        Loans belonging to one customer. Must contain an ``is_late`` column
        where 1 marks a late repayment.

    Returns
    -------
    tuple of int
        ``(times_loaned, times_late, times_referred)``.

    Notes
    -----
    ``times_referred`` counts rows with a non-null ``referredby`` value.
    Counting fully populated rows (``len(df.dropna())``) would silently drop
    a referred loan whenever any other field of that row is missing. When
    the frame has no ``referredby`` column the complete-row count is kept as
    a fallback.
    """
    times_loaned = len(df)
    times_late = int((df['is_late'] == 1).sum())
    if 'referredby' in df.columns:
        times_referred = int(df['referredby'].notna().sum())
    else:
        # No explicit referral column: fall back to counting complete rows.
        times_referred = len(df.dropna())
    return times_loaned, times_late, times_referred


def manage_previous(previous):
    """Aggregate the previous-loan history into one row per customer.

    Two helper columns are added to ``previous`` in place:

    * ``minutes_late`` - minutes between ``firstduedate`` and
      ``firstrepaiddate`` (positive when repaid after the due date,
      negative when repaid before it);
    * ``is_late`` - 0 when ``minutes_late`` is strictly negative, 1 otherwise.

    Parameters
    ----------
    previous : pandas.DataFrame
        Needs ``customerid``, ``firstduedate`` and ``firstrepaiddate``
        columns; the two date columns must support subtraction.

    Returns
    -------
    pandas.DataFrame
        Columns ``customerid``, ``times_loaned``, ``times_late`` and
        ``times_referred`` (as computed by ``get_values``), one row per
        customer, ordered by ``customerid``. An empty input yields an empty
        frame with those columns instead of raising.
    """
    previous['minutes_late'] = (
        (previous['firstrepaiddate'] - previous['firstduedate']) / pd.Timedelta(minutes=1)
    )
    # Only a strictly negative delay counts as on time. A delay of exactly 0
    # and a missing delay (NaN < 0 is False) are both flagged as late.
    previous['is_late'] = (~previous['minutes_late'].lt(0)).astype(int)

    summary_cols = ['customerid', 'times_loaned', 'times_late', 'times_referred']
    # Iterating the groups explicitly avoids groupby.apply (whose handling of
    # the grouping column is deprecated) and the temporary column named 0,
    # and it also works when there are no groups at all.
    rows = [
        (customer_id, *get_values(loans))
        for customer_id, loans in previous.groupby("customerid")
    ]
    return pd.DataFrame(rows, columns=summary_cols)

# Read raw data and preprocess it

In [4]:
# Base location of the raw CSV files; every split lives in its own sub-folder
# and its file names are prefixed with the split name.
RAW_DATA_URL = "https://raw.githubusercontent.com/medinaltbx/G6_DP3/master/data/input/raw_data"

encoder = None  # not used in this cell; kept in case code outside this view reads it

# Collect one merged frame per split and concatenate once at the end, rather
# than growing a frame from an empty DataFrame inside the loop (pandas has
# deprecated how empty entries are handled by concat).
merged_splits = []
for split in ["train", "test"]:  # named `split` so the builtin `set` is not shadowed
    prefix = f"{RAW_DATA_URL}/{split}/{split}"

    # The performance file uses "," as decimal separator, the others use ".".
    performance = pd.read_csv(f"{prefix}_performance.csv", decimal=",")
    performance = manage_current(performance)

    demograficos = pd.read_csv(f"{prefix}_datos_demograficos.csv",
                               parse_dates=['birthdate'], decimal=".")
    demograficos = manage_demograficos(demograficos)

    # Keep only customers present in both the demographic and performance tables.
    demo_perf = demograficos.merge(performance, on="customerid", how="inner")

    previous = pd.read_csv(f"{prefix}_previous_loan.csv",
                           parse_dates=['firstduedate', 'firstrepaiddate'], decimal=".")
    previous = manage_previous(previous)

    # Left join: customers without previous loans keep NaN counters, which are
    # turned into 0 by the fillna below.
    merged = demo_perf.merge(previous, on="customerid", how="left")
    merged = merged.drop(columns=["longitude_gps", "latitude_gps"])
    str_cols = ["loanamount", "totaldue"]
    merged[str_cols] = merged[str_cols].apply(pd.to_numeric)

    merged_splits.append(merged)

concat = pd.concat(merged_splits, axis=0)

# One Hot encoding categorical features (done on train+test together so both
# outputs share the same dummy columns)
categorical = ["bank_account_type", "bank_name_clients", "bank_branch_clients",
               "employment_status_clients", "level_of_education_clients"]
concat = pd.get_dummies(concat, columns=categorical)

# Rows that carry a label form the train output; the remaining rows form the
# test output, without the label column.
has_label = concat["good_bad_flag"].notna()
train = concat[has_label]
test = concat[~has_label].drop(columns=["good_bad_flag"])
train, test = train.fillna(0), test.fillna(0)

train.to_csv("merged_train.csv", sep=';', index=False)
test.to_csv("merged_test.csv", sep=';', index=False)