## Importing the packages

In [1]:
import numpy as np

In [2]:
# Display settings for every array printed below: plain decimals instead of
# scientific notation, two digits after the decimal point, and rows of up to
# 100 characters.
np.set_printoptions(
    precision=2,
    linewidth=100,
    suppress=True,
)

## Importing the data

In [3]:
# np.loadtxt() raises an error on this file, so np.genfromtxt() is used instead:
# it reads every field as a float and stores NaN wherever a field is missing
# or cannot be parsed as a number.
raw_data_np = np.genfromtxt(
    "loan-data.csv",
    delimiter=";",
    skip_header=1,   # skip the first line of the file
    autostrip=True,  # trim surrounding whitespace from each field
)
raw_data_np

array([[48010226.  ,         nan,    35000.  , ...,         nan,         nan,     9452.96],
       [57693261.  ,         nan,    30000.  , ...,         nan,         nan,     4679.7 ],
       [59432726.  ,         nan,    15000.  , ...,         nan,         nan,     1969.83],
       ...,
       [50415990.  ,         nan,    10000.  , ...,         nan,         nan,     2185.64],
       [46154151.  ,         nan,         nan, ...,         nan,         nan,     3199.4 ],
       [66055249.  ,         nan,    10000.  , ...,         nan,         nan,      301.9 ]])

## Checking for incomplete data

In [4]:
# Total number of NaN entries in the raw import.
np.isnan(raw_data_np).sum()

88005

In [5]:
# nanmax() returns the maximum value of an array, ignoring NaN values.
# Adding 1 gives a value larger than every number in the data; it is used
# further down as the fill value for missing numeric entries.

temporary_fill = np.nanmax(raw_data_np) + 1
# Column-wise mean that ignores NaN; a column made up only of NaN gets a NaN mean.
temporary_mean = np.nanmean(raw_data_np, axis=0)

  temporary_mean = np.nanmean(raw_data_np, axis=0)


In [6]:
# A NaN mean marks a column that contains no numeric value at all.

temporary_mean

array([54015809.19,         nan,    15273.46,         nan,    15311.04,         nan,       16.62,
            440.92,         nan,         nan,         nan,         nan,         nan,     3143.85])

In [7]:
# Per-column summary with one statistic per row: minimum, mean, maximum
# (NaN values are ignored; columns without any number stay NaN).
temporary_stats = np.array(
    [
        np.nanmin(raw_data_np, axis=0),
        temporary_mean,
        np.nanmax(raw_data_np, axis=0),
    ]
)

temporary_stats

  temporary_stats = np.array([np.nanmin(raw_data_np, axis=0),
  np.nanmax(raw_data_np, axis=0)])


array([[  373332.  ,         nan,     1000.  ,         nan,     1000.  ,         nan,        6.  ,
              31.42,         nan,         nan,         nan,         nan,         nan,        0.  ],
       [54015809.19,         nan,    15273.46,         nan,    15311.04,         nan,       16.62,
             440.92,         nan,         nan,         nan,         nan,         nan,     3143.85],
       [68616519.  ,         nan,    35000.  ,         nan,    35000.  ,         nan,       28.99,
            1372.97,         nan,         nan,         nan,         nan,         nan,    41913.62]])

## Splitting the dataset

### Splitting the columns

In [8]:
# Indices of the columns whose mean is NaN, i.e. the columns that hold no
# numeric data and have to be re-imported as text.
#
# flatnonzero() returns the indices of the True entries as a 1-D array. Unlike
# argwhere(...).squeeze(), the result stays 1-D even when exactly one column
# matches (squeeze() would collapse that case to a 0-d array).

columns_strings = np.flatnonzero(np.isnan(temporary_mean))
columns_strings

array([ 1,  3,  5,  8,  9, 10, 11, 12], dtype=int64)

In [9]:
# Indices of the columns that have a numeric mean. `~` negates the boolean mask
# (instead of comparing it with `== False`), and flatnonzero() keeps the result
# 1-D even when only a single column matches.
columns_numeric = np.flatnonzero(~np.isnan(temporary_mean))
columns_numeric

array([ 0,  2,  4,  6,  7, 13], dtype=int64)

## Re-importing the dataset

In [10]:
# Text part of the dataset: re-read only the columns listed in columns_strings
# and keep every field as a string.

loan_data_string = np.genfromtxt(
    "loan-data.csv",
    dtype=np.dtype(str),
    usecols=columns_strings,
    delimiter=";",
    skip_header=1,
    autostrip=True,
)
loan_data_string

array([['May-15', 'Current', '36 months', ..., 'Verified',
        'https://www.lendingclub.com/browse/loanDetail.action?loan_id=48010226', 'CA'],
       ['', 'Current', '36 months', ..., 'Source Verified',
        'https://www.lendingclub.com/browse/loanDetail.action?loan_id=57693261', 'NY'],
       ['Sep-15', 'Current', '36 months', ..., 'Verified',
        'https://www.lendingclub.com/browse/loanDetail.action?loan_id=59432726', 'PA'],
       ...,
       ['Jun-15', 'Current', '36 months', ..., 'Source Verified',
        'https://www.lendingclub.com/browse/loanDetail.action?loan_id=50415990', 'CA'],
       ['Apr-15', 'Current', '36 months', ..., 'Source Verified',
        'https://www.lendingclub.com/browse/loanDetail.action?loan_id=46154151', 'OH'],
       ['Dec-15', 'Current', '36 months', ..., '',
        'https://www.lendingclub.com/browse/loanDetail.action?loan_id=66055249', 'IL']],
      dtype='<U69')

In [11]:
# Numeric part of the dataset: re-read only the columns listed in
# columns_numeric; missing entries are filled with temporary_fill.

loan_data_numeric = np.genfromtxt(
    "loan-data.csv",
    usecols=columns_numeric,
    filling_values=temporary_fill,
    delimiter=";",
    skip_header=1,
    autostrip=True,
)
loan_data_numeric

array([[48010226.  ,    35000.  ,    35000.  ,       13.33,     1184.86,     9452.96],
       [57693261.  ,    30000.  ,    30000.  , 68616520.  ,      938.57,     4679.7 ],
       [59432726.  ,    15000.  ,    15000.  , 68616520.  ,      494.86,     1969.83],
       ...,
       [50415990.  ,    10000.  ,    10000.  , 68616520.  , 68616520.  ,     2185.64],
       [46154151.  , 68616520.  ,    10000.  ,       16.55,      354.3 ,     3199.4 ],
       [66055249.  ,    10000.  ,    10000.  , 68616520.  ,      309.97,      301.9 ]])

## The names of the columns

In [12]:
# Column names: read only the first line of the file, as text.
# max_rows=1 stops after that line, so the data rows are not parsed and the
# cell does not depend on the row count of any previously loaded array.
loan_data_columns = np.genfromtxt(
    "loan-data.csv",
    delimiter=";",
    max_rows=1,
    autostrip=True,
    dtype=np.dtype(str),
)
loan_data_columns

array(['id', 'issue_d', 'loan_amnt', 'loan_status', 'funded_amnt', 'term', 'int_rate',
       'installment', 'grade', 'sub_grade', 'verification_status', 'url', 'addr_state',
       'total_pymnt'], dtype='<U19')

In [13]:
# Split the header into the names of the text columns and of the numeric columns.
# NOTE: columns_numeric holds column indices before this cell and column names
# after it, so the cell cannot be re-run without first re-running the cell that
# computes the indices.

columns_string = loan_data_columns[columns_strings]
columns_numeric = loan_data_columns[columns_numeric]

In [14]:
# Names of the text columns.
columns_string

array(['issue_d', 'loan_status', 'term', 'grade', 'sub_grade', 'verification_status', 'url',
       'addr_state'], dtype='<U19')

In [15]:
# Names of the numeric columns.
columns_numeric

array(['id', 'loan_amnt', 'funded_amnt', 'int_rate', 'installment', 'total_pymnt'], dtype='<U19')

## Creating checkpoints

In [16]:
def checkpoint(file_name, checkpoint_header, checkpoint_data):
    """Save a header/data pair to an ``.npz`` archive and load it back.

    Parameters
    ----------
    file_name : str
        Path of the archive, with or without the ``.npz`` extension.
    checkpoint_header : array_like
        Stored in the archive under the key ``"header"``.
    checkpoint_data : array_like
        Stored in the archive under the key ``"data"``.

    Returns
    -------
    NpzFile
        The archive as returned by ``np.load``; index it with ``"header"``
        or ``"data"``.
    """
    # np.savez() appends ".npz" only when the name does not already end with it,
    # so resolve the final path once and use it for both saving and loading.
    archive_path = file_name if file_name.endswith(".npz") else file_name + ".npz"
    np.savez(archive_path, header=checkpoint_header, data=checkpoint_data)
    return np.load(archive_path)

In [17]:
# Round-trip the text data through the archive "checkpoint-test.npz".
checkpoint_test = checkpoint(
    file_name="checkpoint-test",
    checkpoint_header=columns_string,
    checkpoint_data=loan_data_string,
)

In [18]:
# The array stored under the "data" key of the reloaded checkpoint.
checkpoint_test["data"]

array([['May-15', 'Current', '36 months', ..., 'Verified',
        'https://www.lendingclub.com/browse/loanDetail.action?loan_id=48010226', 'CA'],
       ['', 'Current', '36 months', ..., 'Source Verified',
        'https://www.lendingclub.com/browse/loanDetail.action?loan_id=57693261', 'NY'],
       ['Sep-15', 'Current', '36 months', ..., 'Verified',
        'https://www.lendingclub.com/browse/loanDetail.action?loan_id=59432726', 'PA'],
       ...,
       ['Jun-15', 'Current', '36 months', ..., 'Source Verified',
        'https://www.lendingclub.com/browse/loanDetail.action?loan_id=50415990', 'CA'],
       ['Apr-15', 'Current', '36 months', ..., 'Source Verified',
        'https://www.lendingclub.com/browse/loanDetail.action?loan_id=46154151', 'OH'],
       ['Dec-15', 'Current', '36 months', ..., '',
        'https://www.lendingclub.com/browse/loanDetail.action?loan_id=66055249', 'IL']],
      dtype='<U69')

In [19]:
# Verify that the data reloaded from the checkpoint equals the in-memory array
# (array_equal is True only when shape and all elements match).

np.array_equal(checkpoint_test["data"], loan_data_string)

True

## Manipulating string columns

In [20]:
# Header of the text columns before any renaming.
columns_string

array(['issue_d', 'loan_status', 'term', 'grade', 'sub_grade', 'verification_status', 'url',
       'addr_state'], dtype='<U19')

In [21]:
# Rename the first column from "issue_d" to "issue_date".
# The header array has dtype '<U19', so a name longer than 19 characters would
# be silently truncated on assignment.
columns_string[0]="issue_date"

In [22]:
# The text data before the string columns are transformed.
loan_data_string

array([['May-15', 'Current', '36 months', ..., 'Verified',
        'https://www.lendingclub.com/browse/loanDetail.action?loan_id=48010226', 'CA'],
       ['', 'Current', '36 months', ..., 'Source Verified',
        'https://www.lendingclub.com/browse/loanDetail.action?loan_id=57693261', 'NY'],
       ['Sep-15', 'Current', '36 months', ..., 'Verified',
        'https://www.lendingclub.com/browse/loanDetail.action?loan_id=59432726', 'PA'],
       ...,
       ['Jun-15', 'Current', '36 months', ..., 'Source Verified',
        'https://www.lendingclub.com/browse/loanDetail.action?loan_id=50415990', 'CA'],
       ['Apr-15', 'Current', '36 months', ..., 'Source Verified',
        'https://www.lendingclub.com/browse/loanDetail.action?loan_id=46154151', 'OH'],
       ['Dec-15', 'Current', '36 months', ..., '',
        'https://www.lendingclub.com/browse/loanDetail.action?loan_id=66055249', 'IL']],
      dtype='<U69')

### Issue Date

In [23]:
# Display the first column (the issue date).
loan_data_string[:,0]

array(['May-15', '', 'Sep-15', ..., 'Jun-15', 'Apr-15', 'Dec-15'], dtype='<U69')

In [24]:
# np.unique() returns the sorted distinct values that occur in the column.
np.unique(loan_data_string[:,0])

array(['', 'Apr-15', 'Aug-15', 'Dec-15', 'Feb-15', 'Jan-15', 'Jul-15', 'Jun-15', 'Mar-15',
       'May-15', 'Nov-15', 'Oct-15', 'Sep-15'], dtype='<U69')

In [25]:
# Drop the common "-15" year suffix so that only the month abbreviation remains.
# np.char.replace() removes exactly that substring; strip(..., "-15") would
# instead strip any of the characters "-", "1" and "5" from both ends of each
# string.

loan_data_string[:,0] = np.char.replace(loan_data_string[:,0], "-15", "")
np.unique(loan_data_string[:,0])

array(['', 'Apr', 'Aug', 'Dec', 'Feb', 'Jan', 'Jul', 'Jun', 'Mar', 'May', 'Nov', 'Oct', 'Sep'],
      dtype='<U69')

In [26]:
# Month abbreviations ordered so that each month's list index is its calendar
# number; index 0 is the empty string, which stands for a missing issue date.

months = [
    "",
    "Jan", "Feb", "Mar", "Apr", "May", "Jun",
    "Jul", "Aug", "Sep", "Oct", "Nov", "Dec",
]

In [27]:
# Replace every month abbreviation with its position in `months`
# ('' -> 0, 'Jan' -> 1, ..., 'Dec' -> 12). The column is a string column, so
# the numbers end up stored as text.
for month_number, month_name in enumerate(months):
    loan_data_string[:, 0] = np.where(
        loan_data_string[:, 0] == month_name,
        month_number,
        loan_data_string[:, 0],
    )

In [28]:
# Check the encoding: the values are still strings, so they sort as text
# ('10' comes before '2').
np.unique(loan_data_string[:,0])

array(['0', '1', '10', '11', '12', '2', '3', '4', '5', '6', '7', '8', '9'], dtype='<U69')

### Loan Status

In [29]:
# Header of the text columns; the next column to process is 'loan_status'.
columns_string

array(['issue_date', 'loan_status', 'term', 'grade', 'sub_grade', 'verification_status', 'url',
       'addr_state'], dtype='<U19')

In [33]:
# Distinct values of the second column (loan status).
np.unique(loan_data_string[:,1])

array(['', 'Charged Off', 'Current', 'Default', 'Fully Paid', 'In Grace Period', 'Issued',
       'Late (16-30 days)', 'Late (31-120 days)'], dtype='<U69')

In [38]:
# Number of distinct values in the loan-status column.

len(np.unique(loan_data_string[:, 1]))

9

In [39]:
# Loan statuses that are encoded as 0 in the loan-status column.
status_bad = [
    "Charged Off",
    "Late (31-120 days)",
    "Default",
]

In [42]:
# Binary loan status: 0 for the statuses listed in status_bad, 1 for everything
# else, including rows whose status is missing ('').
# NOTE: the column is overwritten in place; re-running this cell afterwards sets
# every row to 1, because '0' and '1' are not in status_bad.
loan_data_string[:, 1] = np.where(
    np.isin(loan_data_string[:, 1], status_bad),
    0,
    1,
)

In [43]:
# Check the encoding: only '0' and '1' should remain.
np.unique(loan_data_string[:,1])

array(['0', '1'], dtype='<U69')