In [15]:
import numpy as np
import pandas as pd
import pickle
import seaborn as sns
from scipy import stats
from sklearn import preprocessing
from itertools import combinations
sns.set_style('darkgrid')
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.model_selection import train_test_split

In [16]:
# Load the raw assignment data; the CSV uses ';' as the field separator.
data = pd.read_csv('data/assignment_data.csv', sep=';')

In [17]:
data.shape

(2572, 55)

In [18]:
# Column groups used by the preprocessing steps further down.
ids = ['country_id', 'application_id', 'product_id', 'customer_id']

dates = [
    'due_date', 'first_status_day_date', 'first_status_time_of_day',
    'paid_date', 'arrived_date',
    'Variable_42', 'Variable_43', 'Variable_44',
]

categoricals = ['Variable_5', 'Variable_6', 'Variable_12', 'Variable_45']
ordinals = ['Variable_13', 'Variable_14']

# Every column that is not an id, a date, a categorical, an ordinal or the
# target is treated as numerical.
non_numericals = set(ids + dates + categoricals + ordinals + ['Target'])
numericals = [x for x in data.columns if x not in non_numericals]

In [19]:
dates

['due_date',
 'first_status_day_date',
 'first_status_time_of_day',
 'paid_date',
 'arrived_date',
 'Variable_42',
 'Variable_43',
 'Variable_44']

In [20]:
data[['Variable_13','Variable_14']].head(10)

Unnamed: 0,Variable_13,Variable_14
0,M,RATINGSTUFE M
1,M,RATINGSTUFE M
2,G,RATINGSTUFE G
3,D,RATINGSTUFE D
4,,
5,K,RATINGSTUFE K
6,G,RATINGSTUFE G
7,I,RATINGSTUFE I
8,F,RATINGSTUFE F
9,M,RATINGSTUFE M


In [21]:
def stripper(value):
    """Return the second whitespace-separated token of ``value``.

    ``value`` is converted with ``str`` first, so a missing value (which
    stringifies to ``'nan'``) is mapped back to ``np.nan``.

    Returns:
        The second token when there are at least two tokens, ``np.nan`` when
        the only token is ``'nan'``, and ``None`` (after printing ``'error'``)
        for any other input, including an empty string.
    """
    tokens = str(value).split()
    if len(tokens) > 1:
        return tokens[1]
    # `tokens` is empty for '' / whitespace-only input; guard before indexing.
    if tokens and tokens[0] == 'nan':
        # np.nan instead of pd.np.nan: the pd.np alias was removed in pandas 2.0.
        return np.nan
    print('error')
    return None
# Grade letter parsed out of Variable_14, one value per row.
# The former positional `1` was not an axis: Series.apply has no axis
# parameter, so it was bound to the deprecated `convert_dtype` argument.
test = data['Variable_14'].apply(stripper)

In [22]:
# Count rows where Variable_13 differs from the grade parsed out of
# Variable_14, under three different null-handling filters.
mismatch = data.Variable_13 != test
var13_present = data.Variable_13.notnull()
parsed_present = test.notnull()
row_filters = (
    mismatch & var13_present & parsed_present,
    mismatch & parsed_present,
    mismatch & var13_present,
)
for row_filter in row_filters:
    print(data[row_filter][['Variable_13', 'Variable_14']].shape)

(0, 2)
(0, 2)
(0, 2)


In [23]:
# TODO: remove Variable_14 (it carries the same grade as Variable_13) and move this analysis into notebook one

# Categorical and Ordinals

In [24]:
def transform_categorical(dataset, columns_to_transform):
    """Add an integer-coded ``<column>_INT`` column for each given column.

    The codes come from ``pd.Categorical``. ``dataset`` is modified in place
    and also returned.

    Returns:
        A ``(dataset, lookup)`` tuple where ``lookup`` maps each transformed
        column name to its ``{category: code}`` dictionary.
    """
    lookup_by_column = {}
    for name in columns_to_transform:
        encoded = pd.Categorical(dataset[name])
        coded_name = '{prefix}{suffix}'.format(prefix=name, suffix='_INT')
        dataset.loc[:, coded_name] = encoded.codes
        lookup_by_column[name] = {
            category: code for code, category in enumerate(encoded.categories)
        }

    return dataset, lookup_by_column

def transform_ordinals(dataset, columns_to_transform, mapper):
    """Add an integer-ranked ``<column>_INT`` column for each ordinal column.

    Args:
        dataset: DataFrame holding the ordinal columns; modified in place.
        columns_to_transform: names of the ordinal columns to translate.
        mapper: dictionary keyed by column name whose values are the
            ``{label: rank}`` mappings passed to ``Series.map``.

    Returns:
        The same ``dataset`` object with the new ``*_INT`` columns added.

    Raises:
        KeyError: if a column has no entry in ``mapper``.
    """
    for column in columns_to_transform:
        new_column = '{prefix}{suffix}'.format(prefix=column, suffix='_INT')
        dataset.loc[:, new_column] = dataset[column].map(mapper[column])
    return dataset

# Rating grades in ascending rank order ('J' was added manually).
_rating_grades = 'ABCDEFGHIJKLM'

# Label -> rank lookup for both ordinal columns. Both maps are derived from
# the same grade sequence so they cannot drift apart: previously
# 'RATINGSTUFE J' shared rank 8 with 'RATINGSTUFE I', and K/L/M were one rank
# lower than their Variable_13 counterparts.
ordinal_dictionary = {
    'Variable_13': {
        grade: rank for rank, grade in enumerate(_rating_grades)
    },
    'Variable_14': {
        'RATINGSTUFE {}'.format(grade): rank
        for rank, grade in enumerate(_rating_grades)
    },
}

In [25]:
# Discard outliers: drop the rows whose Variable_45 equals '?'.
# .copy() makes `data` an independent frame, so the column assignments done
# later on it do not operate on a slice of the original (SettingWithCopyWarning).
data = data[data.Variable_45 != '?'].copy()

In [26]:
# Add the *_INT columns for the categorical and the ordinal columns.
data, categorical_dictionary = transform_categorical(
    dataset=data,
    columns_to_transform=categoricals,
)
data = transform_ordinals(
    dataset=data,
    columns_to_transform=ordinals,
    mapper=ordinal_dictionary,
)

In [27]:
data.shape

(2565, 61)

In [28]:
# Persist both lookups. pickle.dump returns None, so its result must not be
# assigned back to the dictionaries: doing so previously caused None to be
# written to ordinal_dictionary.pkl. `with` closes each file handle.
with open('data/categorical_dictionary.pkl', 'wb') as pickle_file:
    pickle.dump(categorical_dictionary, pickle_file)
with open('data/ordinal_dictionary.pkl', 'wb') as pickle_file:
    pickle.dump(ordinal_dictionary, pickle_file)

# Dates

In [29]:
# Parse every date/time column into pandas datetimes.
for date_column in dates:
    parsed = pd.to_datetime(data[date_column])
    data[date_column] = parsed

In [30]:
def transform_dates_day(dataset, columns_to_transform):
    """Split datetime columns into ``_YEAR``, ``_MONTH`` and ``_DAY`` integer columns.

    ``dataset`` is modified in place and returned.
    """
    parts = (('_YEAR', 'year'), ('_MONTH', 'month'), ('_DAY', 'day'))

    for column in columns_to_transform:
        for suffix, attribute in parts:
            part_name = '{prefix}{suffix}'.format(prefix=column, suffix=suffix)
            dataset.loc[:, part_name] = getattr(dataset.loc[:, column].dt, attribute)

    return dataset

In [31]:
def transform_dates_second(dataset, columns_to_transform):
    """Split datetime columns into ``_HOUR``, ``_MINUTE`` and ``_SECOND`` integer columns.

    ``dataset`` is modified in place and returned.
    """
    parts = (('_HOUR', 'hour'), ('_MINUTE', 'minute'), ('_SECOND', 'second'))

    for column in columns_to_transform:
        for suffix, attribute in parts:
            part_name = '{prefix}{suffix}'.format(prefix=column, suffix=suffix)
            dataset.loc[:, part_name] = getattr(dataset.loc[:, column].dt, attribute)

    return dataset

In [32]:
# Columns expanded to year/month/day parts.
day_level_columns = [
    'due_date', 'first_status_day_date', 'paid_date', 'arrived_date',
    'Variable_42', 'Variable_43', 'Variable_44',
]
# Columns expanded to hour/minute/second parts.
time_level_columns = ['first_status_time_of_day', 'arrived_date']

data = transform_dates_day(data, day_level_columns)
data = transform_dates_second(data, time_level_columns)

In [33]:
data.shape

(2565, 88)

### Feature Engineering: Adding distance between dates 

In [34]:
# One DAYS_<a>-<b> column (a minus b, in whole days) per unordered pair of date columns.
date_columns = [
    'due_date', 'first_status_day_date', 'paid_date', 'arrived_date',
    'Variable_42', 'Variable_43', 'Variable_44',
]
for first, second in combinations(date_columns, 2):
    difference = data[first] - data[second]
    data["DAYS_{}-{}".format(first, second)] = difference.dt.days

In [35]:
data.shape

(2565, 109)

# Numerical

### Feature engineering: Adding ratios of couples of numerical

# Adds one RATIO_<a>-<b> column (a divided by b) per unordered pair of numerical columns.
# NOTE(review): the shape printed right after this block is still (2565, 109),
# the same as before it, so this block does not appear to have been run — confirm.
for couple in combinations(numericals,2):
    columnn_name = "RATIO_{}-{}".format(couple[0],couple[1])
    ratio = (data[couple[0]].divide(data[couple[1]]))
    data[columnn_name] = ratio

In [36]:
data.shape

(2565, 109)

# Drop Transformed Variables

In [37]:
# The raw categorical, ordinal and date columns are dropped here, after
# their derived columns have been added.
transformed_columns = categoricals + ordinals + dates
data = data.drop(transformed_columns, axis=1)

# Drop Constant Columns

In [38]:
def drop_constant_column(dataframe):
    """Drops constant value columns of pandas dataframe.

    A column is constant when it holds at most one distinct value, with NaN
    counted as a value, so an all-NaN column is dropped as well. (Comparing
    against the first row missed that case because NaN != NaN.)

    A dataframe without rows is returned unchanged: there is nothing to
    judge constancy on, and indexing its first row would raise IndexError.

    Returns:
        A new dataframe restricted to the non-constant columns.
    """
    if dataframe.empty:
        return dataframe

    varying = dataframe.nunique(dropna=False) > 1
    return dataframe.loc[:, varying]

In [39]:
# Remove the columns that carry a single value.
data = drop_constant_column(dataframe=data)

In [40]:
data.shape

(2565, 90)

In [41]:
# List the id columns that are still present in the frame.
remaining_ids = [name for name in data.columns if name in ids]
for c in remaining_ids:
    print(c)

application_id
customer_id


# Drop Empty Columns  

In [42]:
# Drop every feature column whose share of missing values (in percent)
# reaches the threshold; the Target column is never considered.
empty_threshold_to_be_dropped = 90
empty_columns_to_drop = [
    column
    for column in data.columns
    if column != 'Target'
    and float(data[column].isnull().sum() * 100) / data.shape[0] >= empty_threshold_to_be_dropped
]

data = data.drop(empty_columns_to_drop, axis=1)

In [43]:
print('Dropping {} columns'.format(len(empty_columns_to_drop)))

Dropping 24 columns


In [44]:
data.shape

(2565, 66)

# Save datasets before imputing NaN for XGBoost

In [45]:
# Only rows with a known Target can be used for the train/test split.
temp_data = data[data.Target.notnull()]
target = temp_data['Target']

# Stratified 75/25 split with a fixed seed. `columns=` replaces the positional
# axis argument of DataFrame.drop, which pandas 2.0 no longer accepts.
X_train, X_test, y_train, y_test = train_test_split(
    temp_data.drop(columns=['Target']),
    target,
    test_size=0.25,
    random_state=2019,
    stratify=target)

In [46]:
#data.to_csv('data/preprocessed_entire_set.csv', sep=';', index=False)
#target.to_csv('data/entire_target.csv', sep=';', index=False, header='Target')

# Export the split that still contains NaN values; all files share the same
# CSV options.
nan_csv_options = {'sep': ';', 'index': False}

X_train.to_csv('data/preprocessed_train_set_NAN.csv', **nan_csv_options)
y_train.to_csv('data/train_target_NAN.csv', header='Target', **nan_csv_options)

X_test.to_csv('data/preprocessed_test_set_NAN.csv', **nan_csv_options)
y_test.to_csv('data/test_target_NAN.csv', header='Target', **nan_csv_options)

# Imputing NaN

In [47]:
# Per-column count and fraction of missing values, largest first.
null_mask = data.isnull()
null_counts = null_mask.sum()
total = null_counts.sort_values(ascending=False)
percent = (null_counts / null_mask.count()).sort_values(ascending=False)
missing_data = pd.concat(
    [total, percent],
    axis=1,
    keys=['Total', 'Emptyness Percentage'],
)
missing_data.head(10)

Unnamed: 0,Total,Emptyness Percentage
Variable_21,1985,0.773879
Variable_38,1985,0.773879
Variable_22,1754,0.683821
Variable_39,1754,0.683821
Variable_37,1099,0.42846
Variable_20,1099,0.42846
Target,515,0.20078
Variable_26,184,0.071735
Variable_27,184,0.071735
Variable_28,184,0.071735


In [48]:
# For every feature column: turn +/-inf into NaN, then fill the NaN values
# with the mean of that column. The Target column is left untouched.
feature_columns = [name for name in data.columns if name != 'Target']
for column in feature_columns:
    finite_values = data[column].replace([np.inf, -np.inf], np.nan)
    data[column] = finite_values.fillna(finite_values.mean())

# MinMaxScale

In [49]:
data.shape

(2565, 66)

In [50]:
data.shape

(2565, 66)

# Grab Ferratum Test set

In [51]:
# Rows without a Target value form the Ferratum test set.
target_missing = data['Target'].isnull()
ferratum_preprocessed_test_set = data[target_missing]

In [52]:
ferratum_preprocessed_test_set.shape

(515, 66)

In [53]:
# Export the Ferratum test set as a ';'-separated CSV without the index.
ferratum_preprocessed_test_set.to_csv(
    'data/ferratum_preprocessed_test_set.csv',
    index=False,
    sep=';',
)

# Split in Train Test

In [54]:
# Keep only the rows that have a Target value.
target_present = data['Target'].notnull()
data = data[target_present]

##### Undersampling
# Class sizes before undersampling.
print(data[data.Target == 1].shape)
print(data[data.Target == 0].shape)
# Keep 633 randomly sampled Target == 1 rows plus every Target == 0 row.
# The sample is seeded (same seed as the train/test splits) so re-runs give
# the same rows, and pd.concat replaces DataFrame.append, which pandas 2.0 removed.
new_data = pd.concat([
    data[data.Target == 1].sample(633, random_state=2019),
    data[data.Target == 0],
])
data = new_data.copy()

In [55]:
# Pull the label column out before it is removed from the feature frame.
target = data.Target

In [56]:
# Remove the label from the feature frame. Rebinding instead of inplace=True
# avoids mutating a filtered frame, and `columns=` replaces the positional
# axis argument that pandas 2.0 no longer accepts.
data = data.drop(columns=['Target'])

In [57]:
data.shape

(2050, 65)

In [58]:
##### Oversampling
# Resamples (data, target) with SVMSMOTE and rebuilds both as DataFrames,
# keeping the original feature column names.
# NOTE(review): the output recorded for this cell is
# "ModuleNotFoundError: No module named 'imblearn'", so in that run the
# resampling did not happen — imblearn must be installed for this cell to work.
# NOTE(review): SMOTE and BorderlineSMOTE are imported but not used in this cell.
from imblearn.over_sampling import SMOTE, BorderlineSMOTE, SVMSMOTE 
sm = SVMSMOTE(random_state=2019, k_neighbors=10)
temp_data, temp_target = sm.fit_resample(data, target)
data = pd.DataFrame(temp_data, columns=data.columns)
target = pd.DataFrame(temp_target, columns=['Target'])

ModuleNotFoundError: No module named 'imblearn'

In [59]:
data.shape

(2050, 65)

In [60]:
target.sum()

1419.0

In [85]:
# Stratified 75/25 train/test split of the processed features, fixed seed.
split_options = {'test_size': 0.25, 'random_state': 2019, 'stratify': target}
X_train, X_test, y_train, y_test = train_test_split(data, target, **split_options)

In [86]:
# Shapes of the four split parts, in the order train X/y then test X/y.
for split_part in (X_train, y_train, X_test, y_test):
    print(split_part.shape)

(2136, 65)
(2136, 1)
(712, 65)
(712, 1)


In [87]:
# Export the full processed set and the train/test split; all files share the
# same CSV options.
final_csv_options = {'sep': ';', 'index': False}

data.to_csv('data/preprocessed_entire_set.csv', **final_csv_options)
target.to_csv('data/entire_target.csv', header='Target', **final_csv_options)

X_train.to_csv('data/preprocessed_train_set.csv', **final_csv_options)
y_train.to_csv('data/train_target.csv', header='Target', **final_csv_options)

X_test.to_csv('data/preprocessed_test_set.csv', **final_csv_options)
y_test.to_csv('data/test_target.csv', header='Target', **final_csv_options)