In [1]:
import pandas as pd
from DataFrameInfo import DataFrameInfo
from DataTransform import DataTransform

In [2]:
# Transform the dataframe
#
# Reads the loan payments CSV and runs each DataTransform step over the columns
# listed for it. The step order is kept deliberately: 'term' goes through
# remove_excess_symbols before convert_to_numeric, and 'employment_length' goes
# through convert_to_categorical before convert_categorical_to_numeric.
#
# NOTE(review): the return values of the DataTransform methods are discarded, so
# this cell presumably relies on them updating `transformed_df` in place -
# confirm against DataTransform.
# NOTE(review): the dtype listing printed further down still reports
# 'employment_length' as category - verify that convert_categorical_to_numeric
# has the intended effect.

transformed_df = pd.read_csv("loan_payments_data.csv")
transformer = DataTransform(transformed_df)

# Columns handled by each step.
excess_symbol_columns = ['term']
date_columns = [
    'issue_date',
    'earliest_credit_line',
    'last_payment_date',
    'next_payment_date',
    'last_credit_pull_date',
]
numeric_columns = ['term']
categorical_columns = [
    'home_ownership',
    'grade',
    'sub_grade',
    'verification_status',
    'loan_status',
    'purpose',
    'policy_code',
    'application_type',
    'employment_length',
]
category_to_numeric_columns = ['employment_length']

# (step, columns) pairs, applied strictly top to bottom.
transformation_steps = [
    (transformer.remove_excess_symbols, excess_symbol_columns),
    (transformer.convert_to_datetime, date_columns),
    (transformer.convert_to_numeric, numeric_columns),
    (transformer.convert_to_categorical, categorical_columns),
    (transformer.convert_categorical_to_numeric, category_to_numeric_columns),
]

for apply_step, step_columns in transformation_steps:
    for column in step_columns:
        apply_step(column)

In [3]:
# Wrap the transformed DataFrame in the project-local DataFrameInfo helper;
# the summary cells below all query this object.
df_info = DataFrameInfo(transformed_df)

In [4]:
# Column Data Types
# Print a header, then whatever DataFrameInfo.describe_columns() returns.
# The header is printed before the call so the stdout order stays the same.

print("Column Data Types:")
dtype_summary = df_info.describe_columns()
print(dtype_summary)

Column Data Types:
id                                      int64
member_id                               int64
loan_amount                             int64
funded_amount                         float64
funded_amount_inv                     float64
term                                  float64
int_rate                              float64
instalment                            float64
grade                                category
sub_grade                            category
employment_length                    category
home_ownership                       category
annual_inc                            float64
verification_status                  category
issue_date                     datetime64[ns]
loan_status                          category
payment_plan                           object
purpose                              category
dti                                   float64
delinq_2yrs                             int64
earliest_credit_line           datetime64[ns]
inq_last_6mths 

In [5]:
# Statistical Values
# Print a header (preceded by a blank line), then the result of
# DataFrameInfo.extract_statistical_values().

print("\nStatistical Values:")
stats_summary = df_info.extract_statistical_values()
print(stats_summary)


Statistical Values:
                 id     member_id   loan_amount  funded_amount  \
count  5.423100e+04  5.423100e+04  54231.000000   51224.000000   
mean   7.621797e+06  8.655350e+06  13333.076100   13229.509117   
min    5.552100e+04  7.069400e+04    500.000000     500.000000   
25%    7.594330e+05  9.587720e+05   7000.000000    7000.000000   
50%    7.084590e+06  8.709873e+06  12000.000000   12000.000000   
75%    8.860616e+06  1.052714e+07  18000.000000   18000.000000   
max    3.867612e+07  4.146185e+07  35000.000000   35000.000000   
std    9.571362e+06  1.031281e+07   8082.196709    8019.017599   

       funded_amount_inv          term      int_rate    instalment  \
count       54231.000000  49459.000000  49062.000000  54231.000000   
mean        12952.622979     42.606199     13.507328    400.013953   
min             0.000000     36.000000      5.420000     15.670000   
25%          6700.000000     36.000000     10.370000    224.205000   
50%         11300.000000     36.00

In [6]:
# Distinct Values Count
# Print a header (preceded by a blank line), then the result of
# DataFrameInfo.count_distinct_values().

print("\nDistinct Values Count:")
distinct_counts = df_info.count_distinct_values()
print(distinct_counts)


Distinct Values Count:
id                             54231
member_id                      54231
loan_amount                     1083
funded_amount                   1122
funded_amount_inv               5261
term                               2
int_rate                         457
instalment                     19940
grade                              7
sub_grade                         35
employment_length                 11
home_ownership                     5
annual_inc                      6132
verification_status                3
issue_date                        61
loan_status                        9
payment_plan                       2
purpose                           14
dti                             3611
delinq_2yrs                       16
earliest_credit_line             587
inq_last_6mths                    24
mths_since_last_delinq           100
mths_since_last_record           120
open_accounts                     48
total_accounts                    88
out_prncp     

In [7]:
# Null Values Information
# Print a header (preceded by a blank line), then the result of
# DataFrameInfo.count_null_values().

print("\nNull Values Information:")
null_summary = df_info.count_null_values()
print(null_summary)


Null Values Information:
                             Null Count  Percentage Null
id                                    0         0.000000
member_id                             0         0.000000
loan_amount                           0         0.000000
funded_amount                      3007         5.544799
funded_amount_inv                     0         0.000000
term                               4772         8.799395
int_rate                           5169         9.531449
instalment                            0         0.000000
grade                                 0         0.000000
sub_grade                             0         0.000000
employment_length                  2118         3.905515
home_ownership                        0         0.000000
annual_inc                            0         0.000000
verification_status                   0         0.000000
issue_date                            0         0.000000
loan_status                           0         0.000000
payme

In [8]:
# Shape of data
# print_shape() emits the "DataFrame Shape: (rows, columns)" line itself,
# so no surrounding print() is needed here.

df_info.print_shape()

DataFrame Shape: (54231, 43)
