In [3]:
from model import *

In [4]:
%load_ext autoreload
%autoreload 2

In [5]:
!free -m

              total        used        free      shared  buff/cache   available
Mem:          16046         395       14117          20        1532       15275
Swap:             0           0           0


In [2]:
from lender_pipeline import *

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('ggplot')
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
from sklearn.preprocessing import StandardScaler

In [6]:
# Disabled per-value date converter, kept for reference: it strips the space before a
# "-"/"+" sign in the string form of the value, feeds the result to np.datetime64,
# and maps the literal "NA" to NaN.
# conversion = lambda x: np.datetime64(str(x).replace(" -", "-").replace(" +", "+")) if x != "NA" else np.nan

# Load the raw lender table with pandas' default dtype inference.
ls = pd.read_csv('lender_dataset.csv')

In [7]:
def mem_usage(pandas_obj):
    """Return the deep memory footprint of a pandas object as an "N.NN MB" string.

    For a DataFrame the per-column figures from ``memory_usage(deep=True)``
    are summed; any other object is assumed to be a Series, whose
    ``memory_usage(deep=True)`` is already a single byte count.
    """
    footprint_bytes = pandas_obj.memory_usage(deep=True)
    if isinstance(pandas_obj, pd.DataFrame):
        # DataFrame.memory_usage returns one entry per column (plus the index)
        footprint_bytes = footprint_bytes.sum()
    footprint_mb = footprint_bytes / 1024 ** 2  # bytes -> megabytes
    return "{:03.2f} MB".format(footprint_mb)

In [8]:
# Take the integer columns and run each one through pd.to_numeric with an
# 'unsigned' downcast request.
ls_int = ls.select_dtypes(include=['int'])
converted_int = ls_int.apply(
    lambda int_column: pd.to_numeric(int_column, downcast='unsigned')
)

In [9]:
print(mem_usage(ls_int))
print(mem_usage(converted_int))

1505.59 MB
471.90 MB


In [28]:
# Take the float columns and run each one through pd.to_numeric with a
# 'float' downcast request.
ls_float = ls.select_dtypes(include=['float'])
converted_float = ls_float.apply(
    lambda float_column: pd.to_numeric(float_column, downcast='float')
)

In [29]:
print(mem_usage(ls_float))
print(mem_usage(converted_float))

494.37 MB
247.19 MB


In [11]:
# Same column re-typed as a pandas categorical
cat = ls['FIRST_TIME_DEPOSITOR_REPORTING_CATEGORY'].astype('category')

In [14]:
cat.head()

0      core
1      core
2    direct
3      core
4      core
Name: FIRST_TIME_DEPOSITOR_REPORTING_CATEGORY, dtype: category
Categories (4, object): [core, direct, kiva, none]

In [15]:
cat.head().cat.codes

0    0
1    0
2    1
3    0
4    0
dtype: int8

In [19]:
# Object-dtype (string-like) columns only
ls_obj = ls.select_dtypes(include='object')

In [20]:
ls_obj.columns

Index(['VINTAGE_DATE', 'FIRST_TRANSACTION_DATE', 'FIRST_DEPOSIT_DATE',
       'LAST_TRANSACTION_DATE', 'LAST_LOGIN_DATE',
       'FIRST_TIME_DEPOSITOR_REPORTING_CATEGORY', 'FIRST_TRANSACTION_REFERRAL',
       'IS_CORPORATE_CAMPAIGN_USER', 'IS_FREE_TRIAL_USER',
       'FIRST_BASKET_CATEGORY', 'USER_LOCATION_COUNTRY', 'USER_LOCATION_STATE',
       'USER_LOCATION_CITY', 'FIRST_LOAN_COUNTRY', 'FIRST_LOAN_REGION'],
      dtype='object')

In [23]:
# Re-type low-cardinality object columns as 'category'.
# A column is converted when fewer than half of its values are distinct; NaN counts as
# a distinct value, exactly as len(Series.unique()) would count it. Any other column
# is carried over unchanged.
# The result frame is created with ls_obj's index and filled by plain column
# assignment, so each new column keeps the dtype of the Series assigned to it instead
# of going through .loc enlargement of an index-less frame.
converted_obj = pd.DataFrame(index=ls_obj.index)
num_total_values = len(ls_obj)

for col in ls_obj.columns:
    num_unique_values = ls_obj[col].nunique(dropna=False)
    # The truthiness guard skips the ratio test for an empty frame (avoids 0 / 0).
    if num_total_values and num_unique_values / num_total_values < 0.5:
        converted_obj[col] = ls_obj[col].astype('category')
    else:
        converted_obj[col] = ls_obj[col]

In [24]:
print(mem_usage(ls_obj))
print(mem_usage(converted_obj))

2586.42 MB
82.66 MB


In [26]:
# Work on a copy so the original `ls` stays available for the before/after
# memory comparison made later in the notebook.
optimized_ls = ls.copy()

In [27]:
# Overwrite the integer columns of the copy with their downcast counterparts.
# Plain [] assignment is deliberate: it swaps in the new columns together with
# their smaller dtypes.
optimized_ls[converted_int.columns] = converted_int

In [30]:
# Overwrite the float columns of the copy with their downcast counterparts.
# Plain [] assignment is deliberate: it swaps in the new columns together with
# their smaller dtypes.
optimized_ls[converted_float.columns] = converted_float

In [31]:
# Overwrite the object columns of the copy with the (mostly categorical) versions
# built in converted_obj; [] assignment carries the new dtypes over.
optimized_ls[converted_obj.columns] = converted_obj

In [33]:
print(mem_usage(ls))
print(mem_usage(optimized_ls))

4589.20 MB
804.56 MB


In [34]:
optimized_ls.shape

(2945384, 105)

In [38]:
optimized_ls.dtypes

FUND_ACCOUNT_ID                                 uint32
LOGIN_ID                                        uint32
VINTAGE_YEAR                                    uint16
VINTAGE_MONTH                                   uint32
VINTAGE_DATE                                  category
FIRST_TRANSACTION_DATE                        category
FIRST_DEPOSIT_DATE                            category
LAST_TRANSACTION_DATE                         category
LAST_LOGIN_DATE                               category
ACTIVE_LIFETIME_MONTHS                         float32
ACCOUNT_AGE_MONTHS                               uint8
FIRST_TIME_DEPOSITOR_REPORTING_CATEGORY       category
FIRST_TRANSACTION_REFERRAL                    category
IS_CORPORATE_CAMPAIGN_USER                    category
IS_FREE_TRIAL_USER                            category
FIRST_BASKET_CATEGORY                         category
USER_LOCATION_COUNTRY                         category
USER_LOCATION_STATE                           category
USER_LOCAT

In [None]:
def convert_datetime(df, col_list=['VINTAGE_DATE',
                                   'FIRST_TRANSACTION_DATE',
                                   'FIRST_DEPOSIT_DATE',
                                   'LAST_TRANSACTION_DATE',
                                   'LAST_LOGIN_DATE']):
    """Parse the named columns of ``df`` with ``pd.to_datetime``, in place.

    Parameters
    ----------
    df : DataFrame whose columns are overwritten with the parsed values.
    col_list : column names to convert; defaults to the five date columns
        of the lender dataset. The list is only read, never modified.

    Returns
    -------
    The same ``df`` object that was passed in (mutated, not copied).
    """
    for date_col in col_list:
        parsed = pd.to_datetime(df[date_col])
        df[date_col] = parsed
    return df

In [42]:
dtypes = optimized_ls.dtypes

In [44]:
dtypes_col = dtypes.index

In [45]:
# String name of every column dtype, in column order
dtypes_type = [col_dtype.name for col_dtype in dtypes]

In [46]:
# column name -> dtype name
column_types = {
    column_name: type_name
    for column_name, type_name in zip(dtypes_col, dtypes_type)
}

In [47]:
column_types['VINTAGE_DATE']

'category'

In [49]:
# Re-read the CSV with the per-column dtype map, so columns are parsed straight
# into the compact dtypes rather than being downcast after loading.
# NOTE(review): the map types the *_DATE columns as 'category'
# (column_types['VINTAGE_DATE'] == 'category'), so they are not parsed as datetimes
# here — confirm that is intended.
optimized_ls_read = pd.read_csv('lender_dataset.csv',dtype=column_types)

In [52]:
import json

# Write the column -> dtype-name mapping to disk as JSON
with open('column_types.json', mode='w') as out_file:
    json.dump(column_types, out_file)

In [50]:
print(mem_usage(optimized_ls_read))

804.60 MB


In [None]:
# Full dataset: roughly 3 million users

In [16]:
ls.shape

(2945384, 105)

In [19]:
# Null count per column, restricted to columns with at least one missing value.
# The tally is computed once and then filtered through a callable indexer, rather
# than scanning the whole frame twice.
ls.isnull().sum().loc[lambda null_counts: null_counts != 0]

FIRST_TRANSACTION_DATE                        1100725
FIRST_DEPOSIT_DATE                            1818487
LAST_TRANSACTION_DATE                         1100736
ACTIVE_LIFETIME_MONTHS                        1100736
USER_LOCATION_COUNTRY                         1371015
USER_LOCATION_STATE                           1537399
USER_LOCATION_CITY                            1527001
LIFETIME_DEPOSIT_NUM                          1100725
LIFETIME_ACCOUNT_LOAN_PURCHASE_NUM            1100725
LIFETIME_PROXY_LOAN_PURCHASE_NUM              1196160
LIFETIME_DONATION_NUM                         1100725
LIFETIME_LENDER_WEIGHTED_AVERAGE_LOAN_TERM    1239052
CORE_LOAN_PURCHASE_NUM                        1196160
CORE_LOAN_PURCHASE_TOTAL                      1196160
DIRECT_LOAN_PURCHASE_NUM                      1196160
DIRECT_LOAN_PURCHASE_TOTAL                    1196160
FIRST_LOAN_PURCHASE_WEIGHTED_AVERAGE_TERM     1201919
NUMBER_OF_LOANS_IN_FIRST_LOAN_CHECKOUT        1201911
NUMBER_OF_FIRST_LOANS_STILL_

In [5]:
# NOTE(review): drop_outliers is not defined in this notebook (presumably it comes from
# lender_pipeline); it presumably removes the rows identified by outlier_index_lst —
# confirm whether these are index labels or positions.
# Rebinding `ls` makes this cell non-idempotent: running it again feeds the
# already-filtered frame back into the helper.
ls = drop_outliers(ls,outlier_index_lst=[2987, 27627, 15038, 19433, 704])

In [6]:
ls.shape

(51203, 105)

In [12]:
# The project helper returns a pair, unpacked here as (df, X).
# NOTE(review): the recorded run of this cell ended in
# "ValueError: Cannot convert non-finite values (NA or inf) to integer", so an integer
# cast inside feature_engineer presumably still sees NaN/inf values — confirm and
# handle the missing values upstream.
df, X = feature_engineer(ls)

ValueError: Cannot convert non-finite values (NA or inf) to integer

In [18]:
!free -m

              total        used        free      shared  buff/cache   available
Mem:           3950         671        2677          22         601        2954
Swap:             0           0           0


In [None]:
# from numpy.linalg import svd

In [None]:
# U, S, V_t = svd(X)  # abandoned: raised a memory error

In [None]:
!free -m

In [23]:
# Reduce X with the project helper; per the recorded output, the second argument (3)
# yields a 3-column result and the helper prints the top-5 contributing feature indices
# for each component.
# NOTE(review): imp_f presumably holds those indices — confirm against PCA_reduce.
re_X, imp_f = PCA_reduce(X,3)

#0 Principle Component top 5 contributors are: [ 9 76 75 60 13]
#1 Principle Component top 5 contributors are: [ 25 241  27 112  93]
#2 Principle Component top 5 contributors are: [17 18  6 14  5]
Reduced X shape: (51203, 3)


In [24]:
# The recorded output shows a Counter keyed by feature name.
# NOTE(review): presumably the helper maps the indices in imp_f to column names of df
# and tallies them — confirm against print_imp_features.
counter = print_imp_features(df,imp_f)

Counter({'CORE_LOAN_PURCHASE_TOTAL': 1, 'NUM_BUNDLE_UNDERFUNDED_LOANS': 1, 'FIRST_YEAR_LOAN_PURCHASE_TOTAL': 1, 'FIRST_BASKET_CATEGORY_Promo Lender': 1, 'FIRST_TIME_DEPOSITOR_REPORTING_CATEGORY_none': 1, 'NUM_BUNDLE_RURAL_LOANS': 1, 'LIFETIME_ACCOUNT_LOAN_PURCHASE_NUM': 1, 'FIRST_YEAR_DONATION_OCCASION_NUM': 1, 'NUM_SECTOR_FOOD_LOANS': 1, 'FIRST_YEAR_DEPOSIT_TOTAL': 1, 'ACTIVE_LIFETIME_MONTHS_log': 1, 'CORE_LOAN_PURCHASE_NUM': 1, 'LIFETIME_DEPOSIT_TOTAL': 1, 'FIRST_YEAR_DEPOSIT_OCCASION_NUM': 1, 'LIFETIME_ACCOUNT_LOAN_PURCHASE_TOTAL': 1})
