In [1]:
#importing all the required packages
import csv
import os
import math
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

In [2]:
#function to make sure a file's directory exists and to clear a stale copy of the file
def funCheckDir(filepath):
    """Prepare ``filepath`` for a fresh write.

    Creates the parent directory of ``filepath`` when it is missing. When the
    directory is already present, any existing file at ``filepath`` is removed
    instead.

    Parameters
    ----------
    filepath : str
        Path of the file that is about to be written.

    Returns
    -------
    bool
        ``False`` if the parent directory had to be created, ``True`` if it
        already existed.
    """
    # A bare file name has an empty dirname; fall back to the current directory
    # because os.makedirs('') raises FileNotFoundError.
    directory = os.path.dirname(filepath) or os.curdir
    if not os.path.exists(directory):
        # exist_ok=True tolerates the directory appearing between the check and the call
        os.makedirs(directory, exist_ok=True)
        return False
    try:
        os.remove(filepath)
    except OSError:
        # best effort: usually there is simply no earlier file to delete
        pass
    return True

In [3]:
#function to drop rows that are blank or hold only a single populated cell
def funDropRows(df):
    """Remove, in place, every row with fewer than two non-missing values.

    The caller's frame is modified directly and the same object is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to prune.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the sparse rows removed.
    """
    min_populated_cells = 2
    df.dropna(axis=0, thresh=min_populated_cells, inplace=True)
    return df

In [4]:
def funDropColumns(df):
    """Return a frame holding only the columns this notebook works with.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains at least the seven selected columns.

    Returns
    -------
    pandas.DataFrame
        A new, independent frame with the selected columns in the order listed
        below. ``df`` itself is left untouched.

    Raises
    ------
    KeyError
        If any of the selected columns is missing from ``df``.
    """
    #columns to be selected
    selected_columns = ['Amount Requested', 'Application Date', 'Loan Title', 'Risk_Score',
                        'Debt-To-Income Ratio', 'State', 'Employment Length']
    # .copy() makes the result an independent frame, so later column assignments
    # on it are unambiguous and do not raise SettingWithCopyWarning.
    return df[selected_columns].copy()

In [5]:
def funCleanData(df):
    """Convert the two text-encoded numeric columns of ``df`` to real numbers.

    * ``'Employment Length'`` becomes an int: the first run of digits found in
      the text (so ``'10+ years'`` gives 10 and ``'< 1 year'`` gives 1), or 0
      when the value is missing or contains no digit.
    * ``'Debt-To-Income Ratio'`` becomes a float with the trailing ``'%'``
      stripped; missing values stay NaN.

    ``df`` is modified in place and also returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``'Employment Length'`` and ``'Debt-To-Income Ratio'`` columns.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with both columns converted.

    Raises
    ------
    ValueError
        If a Debt-To-Income value is still not a number after stripping ``'%'``.
    """
    #extracting number from Employment Length and converting into int
    emp_length = df['Employment Length']
    if not pd.api.types.is_numeric_dtype(emp_length):
        # raw-string pattern avoids the invalid-escape warning;
        # expand=False keeps the result a Series rather than a one-column frame
        emp_length = pd.to_numeric(emp_length.str.extract(r'(\d+)', expand=False))
    df['Employment Length'] = emp_length.fillna(0).astype(int)

    #removing '%' from Debt-To-Income Ratio and converting into float
    dti = df['Debt-To-Income Ratio']
    if not pd.api.types.is_numeric_dtype(dti):
        # the vectorised .str accessor passes missing values through as NaN,
        # where calling x.rstrip() on each element would crash on them
        dti = dti.str.rstrip('%')
    df['Debt-To-Income Ratio'] = dti.astype(float)

    return df

In [6]:
def funDeriveColumns(df):
    """Replace ``'Application Date'`` with month, year and period columns.

    Appends, in this order, ``'Application Month'`` (``'%b'`` month
    abbreviation), ``'Application Year'`` and ``'period'`` (a label of the
    form ``<year>_Q<quarter>``), then removes ``'Application Date'``.
    ``df`` is modified in place and also returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose ``'Application Date'`` column has a datetime dtype.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with the derived columns appended.
    """
    when = df['Application Date'].dt

    df['Application Month'] = when.strftime('%b')
    df['Application Year'] = when.year
    # 'quarter' is only scaffolding for the period label and is removed again below
    df['quarter'] = when.quarter

    year_text = df['Application Year'].map(str)
    quarter_text = df['quarter'].map(str)
    df['period'] = year_text + '_Q' + quarter_text

    del df['quarter'], df['Application Date']
    return df

In [7]:
def funFillMissingData(df):
    """Fill the gaps in ``'Loan Title'`` and ``'Risk_Score'`` with defaults.

    * Missing ``'Loan Title'`` values become ``'Unknown'``.
    * ``'Risk_Score'`` is cast to int; missing values and zeros both become
      the placeholder 111.

    ``df`` is modified in place and also returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``'Loan Title'`` and ``'Risk_Score'`` columns.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with the defaults applied.
    """
    #filling NaN with default
    df['Loan Title'] = df['Loan Title'].fillna('Unknown')

    scores = df['Risk_Score'].fillna(111).astype(int)
    # Assign the result back to the column: an in-place replace on the selection
    # df['Risk_Score'] is a silent no-op under pandas Copy-on-Write.
    df['Risk_Score'] = scores.replace(0, 111)

    return df

In [8]:
def funRenameColumns(df):
    """Relabel the nine columns of ``df`` with short names, by position.

    The assignment is purely positional: the first column becomes
    ``'LoanAmt'``, the second ``'Purpose'``, and so on, whatever the columns
    were called before. ``df`` is modified in place and also returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with exactly nine columns.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object carrying the new column labels.

    Raises
    ------
    ValueError
        If ``df`` does not have exactly nine columns.
    """
    short_names = (
        'LoanAmt',
        'Purpose',
        'FICO',
        'DTI',
        'State',
        'EmpLength',
        'IssuedMonth',
        'IssuedYear',
        'Period',
    )
    df.columns = list(short_names)
    return df

In [9]:
#function to split a sequence into consecutive chunks
def chunker(seq, size):
    """Lazily produce consecutive slices of ``seq`` holding up to ``size`` items.

    Parameters
    ----------
    seq
        Any object that supports ``len()`` and slicing.
    size : int
        Maximum number of items per slice; the last slice may be shorter.

    Returns
    -------
    generator
        Yields ``seq[0:size]``, ``seq[size:2*size]``, ... in order.
    """
    chunk_starts = range(0, len(seq), size)
    return (seq[start:start + size] for start in chunk_starts)

In [10]:
#defining the file-directory
# NOTE: '__file__' is a quoted string here, not the module attribute. realpath()
# resolves that relative name against the current working directory, so fileDir
# ends up being the resolved working directory (presumably because __file__ is
# not defined inside a notebook kernel).
fileDir = os.path.split(os.path.realpath('__file__'))[0]

In [11]:
# columns written to each processed CSV, in output order
columns = [
    'IssuedYear',
    'IssuedMonth',
    'LoanAmt',
    'EmpLength',
    'Purpose',
    'State',
    'DTI',
    'FICO',
]

In [12]:
#reading the raw declined-loan files and writing one cleaned CSV per application period
rawDir = os.path.join(fileDir, 'data', 'raw_data', 'declined_loan_data')
processedDir = os.path.join(fileDir, 'data', 'processed_data', 'declined_loan_data')

# Output files already started during this run. funCheckDir deletes an existing
# output file, so it may only be called the first time a period is seen;
# otherwise a period that spans two raw files would lose the rows written
# from the earlier file.
startedFiles = set()

for directory, subdirectory, filenames in os.walk(rawDir):
    # sorted() makes the processing (and therefore the append) order deterministic
    for filename in sorted(filenames):
        if not filename.lower().endswith('.csv'):
            # ignore stray non-CSV files in the raw-data folder
            continue
        print("Working on file: " + filename + '....')
        #reading data from CSV (the first line of each raw file is skipped)
        df = pd.read_csv(os.path.join(directory, filename), skiprows=1, parse_dates=['Application Date'])
        df = funDropRows(df)
        df = funDropColumns(df)
        df = funCleanData(df)
        df = funFillMissingData(df)
        df = funDeriveColumns(df)
        df = funRenameColumns(df)

        # sort=False keeps the periods in order of first appearance
        for period, df_subset in df.groupby('Period', sort=False):
            loanFilePath = os.path.join(processedDir, period + '.csv')
            firstWrite = loanFilePath not in startedFiles
            if firstWrite:
                # creates the output directory / clears a file left by an earlier run
                funCheckDir(loanFilePath)
                startedFiles.add(loanFilePath)
            # header only on the first write; to_csv streams the rows 10000 at a time
            df_subset[columns].to_csv(loanFilePath, index=False, mode='a',
                                      header=firstWrite, chunksize=10000)

Working on file: RejectStatsA.csv....
Working on file: RejectStatsB.csv....
Working on file: RejectStatsD.csv....
Working on file: RejectStats_2016Q1.csv....
Working on file: RejectStats_2016Q2.csv....
Working on file: RejectStats_2016Q3.csv....
Working on file: RejectStats_2016Q4.csv....
