# Lending Club Dataset

## Feature pipeline

In [None]:
!pip uninstall hopsworks -y
!pip install hopsworks -q

### Colab-Specific cell

We need to download the feature group modules (`loans.py`, `applicants.py`) into the `features` package (directory).

In [None]:
import os

# Hosted notebook environments may not have the local features package
def need_download_modules():
    """Report whether the feature modules have to be downloaded.

    Returns True when the string form of the active IPython shell mentions
    'google.colab', or when the HOPSWORKS_PROJECT_ID environment variable is
    set; returns False otherwise.
    """
    in_colab = 'google.colab' in str(get_ipython())
    return in_colab or 'HOPSWORKS_PROJECT_ID' in os.environ

# Fetch the feature modules into ./features when running in a hosted
# environment. urlretrieve is used instead of shelling out to wget so that
# a failed download raises an error, an existing file is overwritten on
# re-run (wget would save loans.py.1 instead), and no external binary is needed.
if need_download_modules():
    print("Downloading modules")
    from urllib.request import urlretrieve

    base_url = 'https://raw.githubusercontent.com/jimdowling/hopsworks-tutorials/loan_approval/loan_approval/features/'
    os.makedirs('features', exist_ok=True)
    for module_file in ('loans.py', 'applicants.py'):
        urlretrieve(base_url + module_file, os.path.join('features', module_file))
else:
    print("Local environment")

In [None]:
import pandas as pd
import numpy as np

# Display settings for the notebook. The full option key
# 'display.float_format' is used instead of the abbreviation 'display.float',
# which only works through partial option-name matching.
pd.set_option('display.float_format', '{:.2f}'.format)
pd.set_option('display.max_columns', 50)
pd.set_option('display.max_rows', 50)

In [None]:
# Load the loans dataset from the hosted Parquet file and preview the first rows.
loans_df = pd.read_parquet("https://repo.hops.works/dev/jdowling/loans.parquet")
loans_df.head()

In [None]:
# Write one CSV per distinct issue_d value. The target directory is created
# first so the export also works on a fresh checkout or hosted runtime
# (`os` is imported in the module-download cell above).
os.makedirs("data/loans", exist_ok=True)
for month in loans_df.issue_d.unique():
    loans_month_df = loans_df.loc[loans_df['issue_d'] == month]
    loans_month_df.to_csv("data/loans/loans-{}.csv".format(month))

In [None]:
# Summary statistics for the loans frame.
loans_df.describe()

In [None]:
# Column dtypes and non-null counts for the loans frame.
loans_df.info()

In [None]:
# Load the applicants dataset from the hosted Parquet file and preview the first rows.
applicants_df = pd.read_parquet("https://repo.hops.works/dev/jdowling/applicants.parquet")
applicants_df.head()

In [None]:
# Write one CSV per distinct earliest_cr_line value. The target directory is
# created first so the export also works on a fresh checkout or hosted runtime
# (`os` is imported in the module-download cell above).
os.makedirs("data/applicants", exist_ok=True)
for month in applicants_df.earliest_cr_line.unique():
    applicants_month_df = applicants_df.loc[applicants_df['earliest_cr_line'] == month]
    applicants_month_df.to_csv("data/applicants/applicants-{}.csv".format(month))

In [None]:
# Column dtypes and non-null counts for the applicants frame.
applicants_df.info()

# Feature Engineering

In [None]:
from features import loans, applicants

import timeit
# Time the home_ownership transformation.
# timeit.default_timer() returns a clock reading, so `end - start` is the
# elapsed wall time in seconds. (timeit.timeit() with no arguments would
# instead benchmark a `pass` statement and return that duration, which makes
# the difference of two such calls meaningless.)
start = timeit.default_timer()

applicants_df['home_ownership'] = applicants_df.home_ownership.apply(applicants.home_ownership)

end = timeit.default_timer()
print(end - start)

In [None]:
# Frequency of each home_ownership value after the transformation.
applicants_df.home_ownership.value_counts()

In [None]:
# Convert the date columns to pandas datetimes; both are later used as the
# event_time of their feature groups.
applicants_df['earliest_cr_line'] = pd.to_datetime(applicants_df['earliest_cr_line'])
loans_df['issue_d'] = pd.to_datetime(loans_df['issue_d'])

## ✔️ `dti`, `open_acc`, `revol_bal`, `revol_util`, & `total_acc`

> - `dti`: A ratio calculated using the borrower’s total monthly debt payments on the total debt obligations, excluding mortgage and the requested LC loan, divided by the borrower’s self-reported monthly income.
> - `open_acc`: The number of open credit lines in the borrower's credit file.
> - `revol_bal`: Total credit revolving balance
> - `revol_util`: Revolving line utilization rate, or the amount of credit the borrower is using relative to all available revolving credit.
> - `total_acc`: The total number of credit lines currently in the borrower's credit file

In [None]:
# Run the per-value transforms from the `applicants` feature module over the
# two public-record columns, overwriting each column with its result.
applicants_df['pub_rec'] = applicants_df['pub_rec'].apply(applicants.pub_rec)
applicants_df['pub_rec_bankruptcies'] = (
    applicants_df['pub_rec_bankruptcies'].apply(applicants.pub_rec_bankruptcies)
)

# 🔄 Data PreProcessing

**Section Goals:** 
> - Remove or fill any missing data. 
> - Remove unnecessary or repetitive features. 
> - Convert categorical string features to dummy variables.

Realistically there are too many unique job titles to try to convert this to a dummy variable feature. Let's remove that emp_title column.

In [None]:
# Remove the free-text job title column from the applicants frame (in place).
applicants_df.drop(columns=['emp_title'], inplace=True)

Charge off rates are extremely similar across all employment lengths. So we are going to drop the `emp_length` column.

In [None]:
# Remove the employment length column from the applicants frame (in place).
applicants_df.drop(columns=['emp_length'], inplace=True)

The title column is simply a string subcategory/description of the purpose column. So we are going to drop the title column.

In [None]:
# Remove the title column from the loans frame (in place).
loans_df.drop(columns=['title'], inplace=True)

### `mort_acc`

There are many ways we could deal with this missing data. We could attempt to build a simple model to fill it in, such as a linear model, we could just fill it in based on the mean of the other columns, or you could even bin the columns into categories and then set NaN as its own category. There is no 100% correct approach! 

Let's try the fillna() approach. We will group the dataframe by `total_acc` and calculate the mean value of `mort_acc` per `total_acc` entry, which gives the lookup computed below:

In [None]:
# Lookup used by the next cell to fill missing mort_acc values.
# NOTE(review): presumably the mean mort_acc per total_acc value, as described
# in the text above — confirm against features/applicants.py.
total_acc_avg = applicants.mean_mort_acc(applicants_df)

In [None]:
# Recompute mort_acc row by row: each row's total_acc and current mort_acc are
# handed to applicants.fill_mort_acc together with the total_acc_avg lookup.
applicants_df['mort_acc'] = applicants_df.apply(
    lambda row: applicants.fill_mort_acc(row['total_acc'], row['mort_acc'], total_acc_avg),
    axis=1,
)

### `revol_util` & `pub_rec_bankruptcies`
These two features have missing data points, but they account for less than 0.5% of the total data. So we are going to remove the rows that are missing those values in those columns with dropna().

In [None]:
# Drop incomplete rows in place.
# NOTE(review): no `subset=` is given, so a row is removed when ANY column is
# NaN, not only revol_util / pub_rec_bankruptcies as the text above suggests.
applicants_df.dropna(inplace=True)

## 🧮 Categorical Variables

### `term`

### `grade` & `sub_grade`

We know that `grade` is just a sub-feature of `sub_grade`, so we are going to drop it.

In [None]:
# Remove the grade column from the loans frame (in place).
loans_df.drop(columns=['grade'], inplace=True)

### `address`
We are going to feature engineer a zip code column from the address in the data set. Create a column called 'zip_code' that extracts the zip code from the address column.

In [None]:
# Derive zip_code from the last five characters of each address, passed
# through loans.zipcode. Applying over the single `address` column avoids
# building a full row Series for every record, which is what
# DataFrame.apply(axis=1) does and which is much slower on large frames.
loans_df['zip_code'] = loans_df['address'].apply(lambda address: loans.zipcode(address[-5:]))

In [None]:
# Frequency of each derived zip_code value.
loans_df.zip_code.value_counts()

In [None]:
# Remove the address column from the loans frame (in place); zip_code has
# already been derived from it.
loans_df.drop(columns=['address'], inplace=True)

### `issue_d` 

This is the event_time for the loan being issued

In [None]:
# Re-inspect the loans frame's columns and dtypes after the transformations above.
loans_df.info()

### `earliest_cr_line`
This appears to be a historical time stamp feature. Extract the year from this feature using a `.apply()` function, then convert it to a numeric feature.

In [None]:
# Derive earliest_cr_line_year by passing each earliest_cr_line value through
# applicants.earliest_cr_line. Applying over the single column avoids building
# a full row Series for every record, which is what DataFrame.apply(axis=1)
# does and which is much slower on large frames.
applicants_df['earliest_cr_line_year'] = applicants_df['earliest_cr_line'].apply(
    applicants.earliest_cr_line)

In [None]:
from great_expectations.core import ExpectationSuite, ExpectationConfiguration

# Validation suite that is attached to the loans feature group below.
expectation_suite = ExpectationSuite(
    expectation_suite_name="transaction_suite")

# int_rate must lie inside [-2.0, 2000.0]. The bounds are numeric (not strings)
# so they can be compared against the numeric int_rate column.
expectation_suite.add_expectation(
    ExpectationConfiguration(
        expectation_type="expect_column_values_to_be_between",
        kwargs={
            "column": "int_rate",
            "min_value": -2.0,
            "max_value": 2000.0,
        }
    )
)

# TODO: Add more expectations here to improve data validation checks

## 🔖 Hopsworks Feature Store

In [None]:
import hopsworks

# Log in to Hopsworks and get a handle to the project's feature store.
project = hopsworks.login()
fs = project.get_feature_store()

In [None]:
# Feature group for loans: keyed on `id`, using `issue_d` as event time, with
# the expectation suite defined above attached for validation.
loans_fg = fs.get_or_create_feature_group(
    name="loans",
    version=1,
    description="Lending Club Loans",
    primary_key=['id'],
    event_time='issue_d',
    online_enabled=True,
    expectation_suite=expectation_suite,
)

In [None]:
# Feature group for applicants: keyed on `id`, partitioned by
# `earliest_cr_line_year`, using `earliest_cr_line` as event time.
applicants_fg = fs.get_or_create_feature_group(
    name="applicants",
    version=1,
    description="Lending Club Loan Applicants",
    primary_key=['id'],
    partition_key=['earliest_cr_line_year'],
    event_time='earliest_cr_line',
    online_enabled=True,
)

### Configure upload batch size for performance (latency vs throughput)

```
loans_fg.insert(loans_df, write_options={
    "wait_for_job": False,
    "kafka_producer_config": {
        "linger.ms": 20,
        "batch.size": 1000000,
        "acks": 1,
        "max.in.flight.requests.per.connection": 5,
        "message.max.bytes": 2000000,
        "batch.num.messages": 200000,
        "buffer.memory": 335544320,
        "queue.buffering.max.messages": 10000000,
        "debug": "broker,topic,msg,queue"
    }
})
```

In [None]:
import time
# Insert the loans frame and report how long the insert() call itself took.
# "wait_for_job": False is passed, so the figure covers the call only.
start_time = time.time()
loans_fg.insert(loans_df, write_options={"wait_for_job" : False})
print("Upload time %s seconds ---" % (time.time() - start_time))

In [None]:
# Insert the applicants frame, again with "wait_for_job" set to False.
applicants_fg.insert(applicants_df, write_options={"wait_for_job" : False})

In [None]:
# Load the Lending Club data dictionary; its LoanStatNew and Description
# columns are used below to set feature descriptions.
metadata = pd.read_csv("https://repo.hops.works/dev/jdowling/LCDataDictionary.csv")
metadata

## Update the description of any features found in the data dictionary

Loop through the data dictionary (`metadata`). For each entry, if there is a corresponding feature in a feature group, update its description.

In [None]:
datadict=[]

# For each (name, description) pair in LCDataDictionary.csv, try to set the
# feature description on loans_fg and then on applicants_fg. If the lookup or
# update fails for a feature group (e.g. the feature does not exist there),
# that entry is skipped for that group.
# `except Exception` is used instead of a bare `except:` so that
# KeyboardInterrupt / SystemExit still stop the loop.
for name, description in zip(metadata['LoanStatNew'], metadata['Description']):
    for fg in (loans_fg, applicants_fg):
        try:
            f = fg.get_feature(name)
            fg.update_feature_description(name, description)
            print("Updating description of feature: {}".format(f.name))
        except Exception:
            pass  # best effort: this entry does not apply to this feature group
