In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats

sns.set_style('darkgrid')
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.model_selection import train_test_split

In [2]:
# Load the raw assignment data; the file is semicolon-delimited.
data = pd.read_csv('data/assignment_data.csv', sep=';')

In [3]:
# Column groups, split by how each group is preprocessed further down.
ids = ['country_id', 'application_id', 'product_id', 'customer_id']
dates = [
    'due_date',
    'first_status_day_date',
    'first_status_time_of_day',
    'paid_date',
    'arrived_date',
    'Variable_42',
    'Variable_43',
    'Variable_44',
]
categoricals = ['Variable_5', 'Variable_6', 'Variable_12', 'Variable_45']
ordinals = ['Variable_13', 'Variable_14']

# Every remaining column that is neither in a group above nor the 'Target' label.
numericals = [
    name
    for name in data.columns
    if name not in ids + dates + categoricals + ordinals + ['Target']
]

# Categorical and Ordinals

In [4]:
def transform_categorical(dataset, columns_to_transform):
    """Encode the given columns as integer category codes.

    For each column a companion ``<column>_INT`` column holding the
    ``pd.Categorical`` codes is written into ``dataset``. The frame passed in
    is modified and is also the frame that is returned.

    Args:
        dataset: DataFrame containing the columns to encode.
        columns_to_transform: iterable of column names to encode.

    Returns:
        tuple: ``(dataset, lookup)`` where ``lookup[column]`` maps every
        category value of that column to its integer code.
    """
    lookup = {}
    for name in columns_to_transform:
        encoded = pd.Categorical(dataset[name])
        dataset.loc[:, '{}{}'.format(name, '_INT')] = encoded.codes
        # Position of a category in `encoded.categories` is its code.
        lookup[name] = {label: code for code, label in enumerate(encoded.categories)}

    return dataset, lookup

In [5]:
# Add *_INT code columns for the categorical and the ordinal columns, keeping
# the value -> code mapping of each group in its own dictionary.
data, categorical_dictionary = transform_categorical(data, categoricals)
data, ordinal_dictionary =  transform_categorical(data, ordinals)

In [6]:
# Dimensions after the *_INT code columns were added.
data.shape

(2572, 61)

# Dates

In [7]:
# Parse each date-like column into pandas datetimes, one column at a time.
for date_column in dates:
    raw_values = data[date_column]
    data[date_column] = pd.to_datetime(raw_values)

In [8]:
def transform_dates_day(dataset, columns_to_transform):
    """Split datetime columns into year, month and day columns.

    For each column, ``<column>_YEAR``, ``<column>_MONTH`` and
    ``<column>_DAY`` are written into ``dataset`` from the column's ``.dt``
    accessor. The frame passed in is modified and returned.

    Args:
        dataset: DataFrame whose listed columns are datetime-typed.
        columns_to_transform: iterable of datetime column names.

    Returns:
        The same ``dataset`` object with the new columns added.
    """
    # (suffix of the new column, attribute read from the .dt accessor)
    components = (('_YEAR', 'year'), ('_MONTH', 'month'), ('_DAY', 'day'))

    for source in columns_to_transform:
        for suffix, attribute in components:
            destination = '{}{}'.format(source, suffix)
            dataset.loc[:, destination] = getattr(dataset.loc[:, source].dt, attribute)

    return dataset

In [9]:
def transform_dates_second(dataset, columns_to_transform):
    """Split datetime columns into hour, minute and second columns.

    For each column, ``<column>_HOUR``, ``<column>_MINUTE`` and
    ``<column>_SECOND`` are written into ``dataset`` from the column's ``.dt``
    accessor. The frame passed in is modified and returned.

    Args:
        dataset: DataFrame whose listed columns are datetime-typed.
        columns_to_transform: iterable of datetime column names.

    Returns:
        The same ``dataset`` object with the new columns added.
    """
    # (suffix of the new column, attribute read from the .dt accessor)
    components = (('_HOUR', 'hour'), ('_MINUTE', 'minute'), ('_SECOND', 'second'))

    for source in columns_to_transform:
        for suffix, attribute in components:
            destination = '{}{}'.format(source, suffix)
            dataset.loc[:, destination] = getattr(dataset.loc[:, source].dt, attribute)

    return dataset

In [10]:
# Calendar parts (year / month / day) for the columns listed here.
data = transform_dates_day(
    data,
    [
        'due_date',
        'first_status_day_date',
        'paid_date',
        'arrived_date',
        'Variable_42',
        'Variable_43',
        'Variable_44',
    ],
)
# Clock parts (hour / minute / second) for the two columns listed here.
data = transform_dates_second(
    data,
    ['first_status_time_of_day', 'arrived_date'],
)

In [11]:
# Dimensions after the date-part columns were added.
data.shape

(2572, 88)

# Drop Transformed Variables

In [12]:
# The original categorical, ordinal and date columns have been replaced by
# their derived columns, so remove all of them in a single drop.
data = data.drop(categoricals + ordinals + dates, axis=1)

# Drop Constant Columns

In [13]:
def drop_constant_column(dataframe):
    """Drop the columns of a DataFrame that hold a single constant value.

    A column is kept only when it contains more than one distinct value.
    Missing values count as a value of their own, so a column made up
    entirely of NaN is treated as constant and dropped (comparing against the
    first row, as done previously, never drops such a column because
    ``NaN != NaN`` evaluates to True).

    Args:
        dataframe: the pandas DataFrame to filter.

    Returns:
        A new DataFrame containing only the non-constant columns. A frame
        without rows is returned as an unchanged copy, since no column can be
        judged constant without data.
    """
    if len(dataframe) == 0:
        # There is no first row to compare against; keep every column.
        return dataframe.copy()

    # dropna=False makes NaN count as a distinct value.
    distinct_counts = dataframe.nunique(dropna=False)
    return dataframe.loc[:, distinct_counts > 1]

In [14]:
# Remove the columns that carry the same value in every row.
data = drop_constant_column(data)

In [15]:
# Dimensions after the transformed source columns and constant columns were dropped.
data.shape

(2572, 69)

In [16]:
# List the identifier columns that are still present in the frame.
for c in filter(lambda name: name in ids, data.columns):
    print(c)

application_id
customer_id


# Drop Empty Columns  

# Imputing NaN

In [17]:
# Per-column count of missing values, largest first.
total = data.isnull().sum().sort_values(ascending=False)
# Per-column share of missing values (missing count / number of rows), largest first.
percent = (data.isnull().sum()/data.isnull().count()).sort_values(ascending=False)
# Put both measures side by side and show the ten columns with the most gaps.
missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Emptyness Percentage'])
missing_data.head(10)

Unnamed: 0,Total,Emptyness Percentage
Variable_42_YEAR,2463,0.957621
Variable_42_MONTH,2463,0.957621
Variable_42_DAY,2463,0.957621
Variable_43_MONTH,2424,0.942457
Variable_43_YEAR,2424,0.942457
Variable_43_DAY,2424,0.942457
Variable_44_DAY,2371,0.921851
Variable_44_MONTH,2371,0.921851
Variable_44_YEAR,2371,0.921851
Variable_38,1990,0.773717


In [18]:
# Fill the gaps of every column except the label with that column's mean.
# NOTE(review): the means are computed on the full frame, i.e. before the rows
# without a Target are set aside and before the train/test split below, so the
# imputed values use information from the hold-out rows - confirm this is intended.
for column in data.columns:
    if column == 'Target':
        continue
    column_mean = data[column].mean()
    data[column] = data[column].fillna(column_mean)

# Grab Ferratum Test set

In [19]:
# Rows without a Target value are set aside as the Ferratum test set.
ferratum_preprocessed_test_set = data[data.Target.isnull()]

In [20]:
# Dimensions of the set-aside rows that have no Target value.
ferratum_preprocessed_test_set.shape

(515, 69)

In [21]:
# Persist the set-aside rows as a semicolon-delimited CSV without the index column.
ferratum_preprocessed_test_set.to_csv('data/ferratum_preprocessed_test_set.csv', sep=';', index=False)

# Split into Train and Test

In [22]:
# Keep only the rows that have a Target value.
data = data[data.Target.notnull()]

In [23]:
# Label series; also passed as `stratify` to the train/test split.
target = data['Target']

In [24]:
# Remove the label column from the feature frame.
# `axis` is passed by keyword because recent pandas releases (2.x) no longer
# accept it positionally, and the result is reassigned instead of using
# inplace=True on a frame that was produced by boolean filtering.
data = data.drop(['Target'], axis=1)

In [25]:
# 75/25 split with a fixed seed, stratified on the target.
X_train, X_test, y_train, y_test = train_test_split(
    data, target, test_size=0.25, random_state=2019, stratify=target
)

In [26]:
# Shapes of the four split parts, one per line.
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape, sep='\n')

(1542, 68)
(1542,)
(515, 68)
(515,)


In [27]:
# Persist the train / test features and targets as semicolon-delimited CSVs
# without the index column.
# `header` is given as a one-element list: a bare string is not a documented
# value for this parameter and only produced the 'Target' header because it
# is truthy and the series happens to be named 'Target'.
X_train.to_csv('data/preprocessed_train_set.csv', sep=';', index=False)
y_train.to_csv('data/train_target.csv', sep=';', index=False, header=['Target'])

X_test.to_csv('data/preprocessed_test_set.csv', sep=';', index=False)
y_test.to_csv('data/test_target.csv', sep=';', index=False, header=['Target'])