In [2]:
import os
import sqlite3
from pathlib import Path
import pandas as pd
import shutil

# Everything is resolved relative to the directory the notebook is run from.
BASE = Path.cwd()
DATA = BASE.joinpath("training_data")
# Create the training_data folder up front; an already existing folder is left alone.
DATA.mkdir(exist_ok=True, parents=True)

In [None]:
# Kaggle credentials are required to use this API: place the kaggle.json authenticator in a
# `.kaggle` folder inside your user directory (e.g. C:\Users\<NAME>\.kaggle on Windows).
import kaggle
from kaggle.api.kaggle_api_extended import KaggleApi
api = KaggleApi()
api.authenticate()

# Show the files published in the dataset.
print(api.dataset_list_files('wordsforthewise/lending-club').files)

# Download the dataset into the training_data folder (DATA) and unzip it there.
api.dataset_download_files('wordsforthewise/lending-club', path= DATA, unzip=True)

In [None]:
def retrieve_training_csv():
    """Load every CSV file found under the training data folder.

    DATA is searched recursively for paths ending in ``.csv``. Each match that is a
    regular file is read with ``pd.read_csv``; matches that are not files are only reported.

    Returns:
        list: one DataFrame per CSV file, in the order the glob produced the paths
        (callers index this list by position), or an empty list when nothing was loaded.
    """
    matches = [*DATA.glob("**/*.csv")]
    print(matches)

    frames = []
    for candidate in matches:
        # A directory name can also end in .csv, so only real files are parsed.
        if not os.path.isfile(candidate):
            print(f"{candidate} is folder or dir")
            continue
        print(f"{candidate} is a file")
        frames.append(pd.read_csv(candidate))

    if not frames:
        print(f"No csv files found in {DATA}")
        return []
    return frames

def get_dir_size(path):
    """Return the combined size of all files under ``path`` in megabytes.

    The whole tree is walked, so files in nested sub-directories are included.

    Args:
        path: directory to measure (str or path-like).

    Returns:
        float: total size in MB (bytes / 1,000,000). A directory without files,
        or a path that cannot be walked, yields 0.0.
    """
    total = 0
    for dirpath, _dirs, files in os.walk(path):
        for f in files:
            file_path = os.path.join(dirpath, f)
            try:
                total += os.path.getsize(file_path)
            except OSError:
                # The file vanished or cannot be stat'ed; leave it out of the total.
                continue
    # Return only after the full walk; returning inside the loop would count
    # just the first directory visited.
    return total / 1000000
    
def delete_large_files(size_limit_mb=100):
    """Free disk space by removing bulky downloads from the training data folder.

    Deletes files to prevent any storage errors when pushing code to github.
    Two kinds of entries under DATA are removed:
      * directories whose size reported by ``get_dir_size`` exceeds ``size_limit_mb``
      * every file with a ``.gz`` suffix, regardless of its size

    Args:
        size_limit_mb: directories larger than this many MB are deleted (default 100).

    Errors while measuring or deleting an entry are printed and the sweep continues.
    """
    # Snapshot the tree first: removing directories while the glob generator is
    # still being consumed can make it trip over paths that no longer exist.
    for entry in list(DATA.glob("**/*")):
        if not os.path.lexists(entry):
            # Already gone because a parent directory was removed earlier in the sweep.
            continue

        try:
            if os.path.isdir(entry):
                size = get_dir_size(entry)
                print(f"size of {entry} is {size}")

                if size > size_limit_mb:
                    print(f"deleting {entry}")
                    shutil.rmtree(entry)
            elif entry.suffix == ".gz":
                os.remove(entry)
        except Exception as e:
            print(f"Error when trying to get size or delete file: {e}")
        

# Read every CSV into memory first; the cleanup afterwards removes downloaded data from disk.
df_list = retrieve_training_csv()
delete_large_files()

In [5]:
# Give each loaded frame a descriptive name for easier access.
# NOTE(review): the positions rely on the order in which retrieve_training_csv()
# returned the files — confirm that order is stable on every machine.
raw_accepted_loans = df_list[0]
raw_rejected_loans = df_list[1]
raw_loans_paid_info = df_list[2]

In [None]:
# Preview the first 10 rows of the raw accepted-loans data.
raw_accepted_loans.head(10)

In [None]:
# All column names available in the raw accepted-loans data.
al_columns = list(raw_accepted_loans.columns)
print(al_columns)

# Columns treated as numeric. 'term' still carries a " months" text suffix at this
# point and is converted in a later cell.
num_cols = [
    "id", "loan_amnt", "funded_amnt", "term", "int_rate", "installment", "annual_inc",
    "dti", "delinq_2yrs", "fico_range_low", "fico_range_high", "inq_last_6mths",
    "open_acc", "revol_bal", "revol_util", "total_acc", "pub_rec_bankruptcies"
]

# Columns kept as text.
text_cols = [
    "home_ownership", "loan_status", "purpose", "application_type", "verification_status"
]

# Preview loan_status; placed last so the notebook actually renders it
# (an expression in the middle of a cell is evaluated but never displayed).
raw_accepted_loans['loan_status'].head(10)

**Clean Accepted Loans Data**
- Provide a data summary
- Standardize each column to an appropriate dtype (e.g. object -> float)
- Clean up any bad data (missing values, duplicates, etc.)


In [8]:
# Keep only the rows whose id parses as a number (drops string or empty ids),
# restricted to the selected numeric and text columns.
has_numeric_id = pd.to_numeric(raw_accepted_loans['id'], errors='coerce').notna()
# .copy() makes an independent frame, so the column assignments below do not act on
# a slice of the raw data (avoids SettingWithCopyWarning / chained assignment).
cleaned_accepted_loans = raw_accepted_loans.loc[has_numeric_id, num_cols + text_cols].copy()

# Convert the id column from object to int64.
cleaned_accepted_loans['id'] = cleaned_accepted_loans['id'].astype('int64')

In [9]:
# Turn the text 'term' column (values ending in " months") into an integer number of
# months stored in 'term_months'.
cleaned_accepted_loans = cleaned_accepted_loans.rename(columns={'term': 'term_months'})
# Keep the numeric column list in sync by name rather than by a hard-coded position.
num_cols = ['term_months' if col == 'term' else col for col in num_cols]
# astype(str) first so the conversion also works when the cell is re-run and the
# column already holds integers.
cleaned_accepted_loans['term_months'] = (
    cleaned_accepted_loans['term_months']
    .astype(str)
    .str.replace(" months", "", regex=False)
    .str.strip()
    .astype('int64')
)

In [None]:
# Count the missing values in each column of the cleaned accepted loans.
cleaned_accepted_loans.isna().sum()

In [None]:
# Replace missing entries in every numeric column with that column's median.
for col in num_cols:
    col_median = cleaned_accepted_loans[col].median()
    filled = cleaned_accepted_loans[col].fillna(col_median)
    cleaned_accepted_loans.loc[:, col] = filled

# Show the per-column missing counts again after filling.
cleaned_accepted_loans.isna().sum()


In [None]:
# Draw a reproducible random sample of the cleaned accepted loans.
SAMPLE_SIZE = 200000
SAMPLE_SEED = 821
# Cap the size at the frame length: sampling without replacement raises when more
# rows are requested than exist.
sampled_accepted_loans = cleaned_accepted_loans.sample(
    min(SAMPLE_SIZE, len(cleaned_accepted_loans)), random_state=SAMPLE_SEED
)
sampled_accepted_loans

**Clean Rejected Loans Data**

In [None]:
# Work on a copy so the raw rejected-loans frame stays untouched, then count the
# missing values in each column.
cleaned_rejected_loans = raw_rejected_loans.copy()
cleaned_rejected_loans.isna().sum()

In [None]:
# Drop unneeded columns. Dropping from the raw frame (drop returns a new frame) keeps
# this cell re-runnable; dropping from the already reduced frame raises KeyError the
# second time the cell is executed.
cleaned_rejected_loans = raw_rejected_loans.drop(columns=['Zip Code', 'State', 'Policy Code'])
rl_cols = list(cleaned_rejected_loans.columns)
print(rl_cols)

cleaned_rejected_loans.dtypes

In [None]:
# Debug check: print the class of the dtype object of the 'Amount Requested' column.
print(type(cleaned_rejected_loans['Amount Requested'].dtype))

In [None]:
# Fill the missing values in the rejected loans, one column at a time.
for col in rl_cols:
    column = cleaned_rejected_loans[col]

    if column.dtype == 'float64':
        # Float columns: use the column median.
        print(f"replacing {col} with median float")
        fill_value = column.median()
    else:
        # Other columns: use the most frequent value.
        print(f"replacing {col} na's with most repeated string")
        counts = column.value_counts()
        if counts.empty:
            # Column holds no non-missing value at all, so there is nothing to fill with.
            continue
        # value_counts() sorts by frequency, so the first index label is the most
        # common value. Reading it from the index avoids depending on the column
        # names produced by reset_index(), which differ between pandas versions.
        fill_value = counts.index[0]

    cleaned_rejected_loans.loc[:, col] = column.fillna(fill_value)

In [None]:
# Confirm that no missing values remain in the rejected loans.
cleaned_rejected_loans.isna().sum()