# Pandas

In [1]:
import pandas as pd 

In [2]:
# Load the loan dataset into a DataFrame; the relative path is resolved
# against the current working directory.
df = pd.read_csv('loan_data.csv')

In [3]:
# Preview the first five rows to check that the file was parsed as expected.
df.head(n=5)

Unnamed: 0,credit.policy,purpose,int.rate,installment,log.annual.inc,dti,fico,days.with.cr.line,revol.bal,revol.util,inq.last.6mths,delinq.2yrs,pub.rec,not.fully.paid
0,1,debt_consolidation,0.1189,829.1,11.350407,19.48,737.0,5639.958333,28854.0,52.1,0.0,0.0,0,0
1,1,credit_card,0.1071,228.22,11.082143,14.29,707.0,2760.0,33623.0,76.7,0.0,0.0,0,0
2,1,,0.1357,366.86,10.373491,11.63,682.0,4710.0,3511.0,25.6,1.0,0.0,0,0
3,1,debt_consolidation,0.1008,162.34,11.350407,8.1,712.0,2699.958333,33667.0,73.2,1.0,0.0,0,0
4,1,credit_card,0.1426,102.92,11.299732,14.97,667.0,4066.0,4740.0,39.5,0.0,1.0,0,0


# Summarize Data

In [4]:
# Frequency (count) of each category in the categorical column `purpose`.
# Missing values are left out of the tally by default.
df['purpose'].value_counts()

debt_consolidation    3955
all_other             2331
credit_card           1262
home_improvement       629
small_business         619
major_purchase         436
educational            343
Name: purpose, dtype: int64

In [5]:
# Number of rows in the whole frame, then the length of a single column.
# Both figures include rows whose value is missing.
n_rows = df.shape[0]
print(n_rows)

n_purpose = df['purpose'].size
print(n_purpose)


9578
9578


In [6]:
# Distinct values of `purpose`; NaN shows up as its own entry.
df['purpose'].unique()

array(['debt_consolidation', 'credit_card', nan, 'all_other',
       'home_improvement', 'small_business', 'major_purchase',
       'educational'], dtype=object)

In [7]:
# Number of distinct categories. NaN is not counted by default, so this is
# one less than the number of entries returned by unique().
df['purpose'].nunique()

7

In [8]:
# Print a per-column summary: non-null count and dtype, plus memory usage.
# Columns whose non-null count is below the number of rows contain missing values.
df.info() 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9578 entries, 0 to 9577
Data columns (total 14 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   credit.policy      9578 non-null   int64  
 1   purpose            9575 non-null   object 
 2   int.rate           9577 non-null   float64
 3   installment        9577 non-null   float64
 4   log.annual.inc     9577 non-null   float64
 5   dti                9578 non-null   float64
 6   fico               9577 non-null   float64
 7   days.with.cr.line  9577 non-null   float64
 8   revol.bal          9577 non-null   float64
 9   revol.util         9577 non-null   float64
 10  inq.last.6mths     9577 non-null   float64
 11  delinq.2yrs        9577 non-null   float64
 12  pub.rec            9578 non-null   int64  
 13  not.fully.paid     9578 non-null   int64  
dtypes: float64(10), int64(3), object(1)
memory usage: 1.0+ MB


### Some functions for summary statistics in Pandas: sum, count, mean, median, min, max...

See more: https://pandas.pydata.org/docs/reference/frame.html#computations-descriptive-stats

In [9]:
# Total of all installments; missing values are skipped by default.
df['installment'].sum()

3056028.86

In [10]:
# Average installment, computed over the non-missing values only.
df['installment'].mean()

319.10085204134907

In [11]:
# Largest installment in the dataset.
df['installment'].max()

940.14

In [12]:
# Sample variance of the installments (ddof=1 is the pandas default,
# i.e. the sum of squared deviations is divided by N - 1).
df['installment'].var(ddof=1)

42881.74823617862

# Handling Missing Data
Some functions for handling missing data:
- isnull(): Generate a boolean mask indicating missing values
- notnull(): Opposite of isnull()
- dropna(): Return a copy of the data with the rows (or columns) that contain missing values removed
- fillna(): Return a copy of the data with missing values filled or imputed


In [13]:
# Look at the first five rows again; row 2 has a missing `purpose`.
df.head(n=5)

Unnamed: 0,credit.policy,purpose,int.rate,installment,log.annual.inc,dti,fico,days.with.cr.line,revol.bal,revol.util,inq.last.6mths,delinq.2yrs,pub.rec,not.fully.paid
0,1,debt_consolidation,0.1189,829.1,11.350407,19.48,737.0,5639.958333,28854.0,52.1,0.0,0.0,0,0
1,1,credit_card,0.1071,228.22,11.082143,14.29,707.0,2760.0,33623.0,76.7,0.0,0.0,0,0
2,1,,0.1357,366.86,10.373491,11.63,682.0,4710.0,3511.0,25.6,1.0,0.0,0,0
3,1,debt_consolidation,0.1008,162.34,11.350407,8.1,712.0,2699.958333,33667.0,73.2,1.0,0.0,0,0
4,1,credit_card,0.1426,102.92,11.299732,14.97,667.0,4066.0,4740.0,39.5,0.0,1.0,0,0


In [14]:
# Boolean mask with the same shape as df: True where a value is missing,
# False otherwise.
df.isnull() 

Unnamed: 0,credit.policy,purpose,int.rate,installment,log.annual.inc,dti,fico,days.with.cr.line,revol.bal,revol.util,inq.last.6mths,delinq.2yrs,pub.rec,not.fully.paid
0,False,False,False,False,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,False,False,False,False
2,False,True,False,False,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9573,False,False,False,False,False,False,False,False,False,False,False,False,False,False
9574,False,False,False,False,False,False,False,False,False,False,False,False,False,False
9575,False,False,False,False,False,False,False,False,False,False,False,False,False,False
9576,False,False,False,False,False,False,False,False,False,False,False,False,False,False


In [15]:
# Keep only the rows whose `purpose` is missing.
df.loc[df['purpose'].isnull()]

Unnamed: 0,credit.policy,purpose,int.rate,installment,log.annual.inc,dti,fico,days.with.cr.line,revol.bal,revol.util,inq.last.6mths,delinq.2yrs,pub.rec,not.fully.paid
2,1,,0.1357,366.86,10.373491,11.63,682.0,4710.0,3511.0,25.6,1.0,0.0,0,0
9,1,,,84.12,10.203592,10.0,707.0,2730.041667,5630.0,,1.0,0.0,0,0
15,1,,0.1103,327.53,10.738915,13.04,702.0,8159.958333,5394.0,53.4,1.0,0.0,0,0


In [16]:
# Keep only the rows whose `purpose` is present. Other columns may still
# contain missing values.
df.loc[df['purpose'].notnull()]


Unnamed: 0,credit.policy,purpose,int.rate,installment,log.annual.inc,dti,fico,days.with.cr.line,revol.bal,revol.util,inq.last.6mths,delinq.2yrs,pub.rec,not.fully.paid
0,1,debt_consolidation,0.1189,829.10,11.350407,19.48,737.0,5639.958333,28854.0,52.1,0.0,0.0,0,0
1,1,credit_card,0.1071,228.22,11.082143,14.29,707.0,2760.000000,33623.0,76.7,0.0,0.0,0,0
3,1,debt_consolidation,0.1008,162.34,11.350407,8.10,712.0,2699.958333,33667.0,73.2,1.0,0.0,0,0
4,1,credit_card,0.1426,102.92,11.299732,14.97,667.0,4066.000000,4740.0,39.5,0.0,1.0,0,0
5,1,credit_card,0.0788,125.13,,16.98,727.0,6120.041667,,51.0,,0.0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9573,0,all_other,0.1461,344.76,12.180755,10.39,672.0,10474.000000,215372.0,82.1,2.0,0.0,0,1
9574,0,all_other,0.1253,257.70,11.141862,0.21,722.0,4380.000000,184.0,1.1,5.0,0.0,0,1
9575,0,debt_consolidation,0.1071,97.81,10.596635,13.09,687.0,3450.041667,10036.0,82.9,8.0,0.0,0,1
9576,0,home_improvement,0.1600,351.58,10.819778,19.18,692.0,1800.000000,0.0,3.2,5.0,0.0,0,1


In [17]:
# Drop every row that has at least one missing value (the defaults, spelled out).
# Returns a new frame; df itself is left unchanged.
df.dropna(axis=0, how='any')

Unnamed: 0,credit.policy,purpose,int.rate,installment,log.annual.inc,dti,fico,days.with.cr.line,revol.bal,revol.util,inq.last.6mths,delinq.2yrs,pub.rec,not.fully.paid
0,1,debt_consolidation,0.1189,829.10,11.350407,19.48,737.0,5639.958333,28854.0,52.1,0.0,0.0,0,0
1,1,credit_card,0.1071,228.22,11.082143,14.29,707.0,2760.000000,33623.0,76.7,0.0,0.0,0,0
3,1,debt_consolidation,0.1008,162.34,11.350407,8.10,712.0,2699.958333,33667.0,73.2,1.0,0.0,0,0
4,1,credit_card,0.1426,102.92,11.299732,14.97,667.0,4066.000000,4740.0,39.5,0.0,1.0,0,0
6,1,debt_consolidation,0.1496,194.02,10.714418,4.00,667.0,3180.041667,3839.0,76.8,0.0,0.0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9573,0,all_other,0.1461,344.76,12.180755,10.39,672.0,10474.000000,215372.0,82.1,2.0,0.0,0,1
9574,0,all_other,0.1253,257.70,11.141862,0.21,722.0,4380.000000,184.0,1.1,5.0,0.0,0,1
9575,0,debt_consolidation,0.1071,97.81,10.596635,13.09,687.0,3450.041667,10036.0,82.9,8.0,0.0,0,1
9576,0,home_improvement,0.1600,351.58,10.819778,19.18,692.0,1800.000000,0.0,3.2,5.0,0.0,0,1


In [18]:
# Drop only the rows in which *every* value is missing.
# Rows with some missing values (e.g. row 2) are kept.
df.dropna(axis=0, how='all')

Unnamed: 0,credit.policy,purpose,int.rate,installment,log.annual.inc,dti,fico,days.with.cr.line,revol.bal,revol.util,inq.last.6mths,delinq.2yrs,pub.rec,not.fully.paid
0,1,debt_consolidation,0.1189,829.10,11.350407,19.48,737.0,5639.958333,28854.0,52.1,0.0,0.0,0,0
1,1,credit_card,0.1071,228.22,11.082143,14.29,707.0,2760.000000,33623.0,76.7,0.0,0.0,0,0
2,1,,0.1357,366.86,10.373491,11.63,682.0,4710.000000,3511.0,25.6,1.0,0.0,0,0
3,1,debt_consolidation,0.1008,162.34,11.350407,8.10,712.0,2699.958333,33667.0,73.2,1.0,0.0,0,0
4,1,credit_card,0.1426,102.92,11.299732,14.97,667.0,4066.000000,4740.0,39.5,0.0,1.0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9573,0,all_other,0.1461,344.76,12.180755,10.39,672.0,10474.000000,215372.0,82.1,2.0,0.0,0,1
9574,0,all_other,0.1253,257.70,11.141862,0.21,722.0,4380.000000,184.0,1.1,5.0,0.0,0,1
9575,0,debt_consolidation,0.1071,97.81,10.596635,13.09,687.0,3450.041667,10036.0,82.9,8.0,0.0,0,1
9576,0,home_improvement,0.1600,351.58,10.819778,19.18,692.0,1800.000000,0.0,3.2,5.0,0.0,0,1


In [19]:
# Replace missing `purpose` entries with the explicit label 'missing'.
# fillna returns a new Series, so it is assigned back to the column.
df['purpose'] = df['purpose'].fillna(value='missing')
df


Unnamed: 0,credit.policy,purpose,int.rate,installment,log.annual.inc,dti,fico,days.with.cr.line,revol.bal,revol.util,inq.last.6mths,delinq.2yrs,pub.rec,not.fully.paid
0,1,debt_consolidation,0.1189,829.10,11.350407,19.48,737.0,5639.958333,28854.0,52.1,0.0,0.0,0,0
1,1,credit_card,0.1071,228.22,11.082143,14.29,707.0,2760.000000,33623.0,76.7,0.0,0.0,0,0
2,1,missing,0.1357,366.86,10.373491,11.63,682.0,4710.000000,3511.0,25.6,1.0,0.0,0,0
3,1,debt_consolidation,0.1008,162.34,11.350407,8.10,712.0,2699.958333,33667.0,73.2,1.0,0.0,0,0
4,1,credit_card,0.1426,102.92,11.299732,14.97,667.0,4066.000000,4740.0,39.5,0.0,1.0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9573,0,all_other,0.1461,344.76,12.180755,10.39,672.0,10474.000000,215372.0,82.1,2.0,0.0,0,1
9574,0,all_other,0.1253,257.70,11.141862,0.21,722.0,4380.000000,184.0,1.1,5.0,0.0,0,1
9575,0,debt_consolidation,0.1071,97.81,10.596635,13.09,687.0,3450.041667,10036.0,82.9,8.0,0.0,0,1
9576,0,home_improvement,0.1600,351.58,10.819778,19.18,692.0,1800.000000,0.0,3.2,5.0,0.0,0,1


In [20]:
# Reload the raw data so that `purpose` contains its original missing values.
df = pd.read_csv('loan_data.csv')

# Backward fill: each missing `purpose` takes the next valid value below it.
# Series.bfill() replaces fillna(method='bfill'), whose `method` keyword is
# deprecated since pandas 2.1.
df['purpose'] = df['purpose'].bfill()
df

Unnamed: 0,credit.policy,purpose,int.rate,installment,log.annual.inc,dti,fico,days.with.cr.line,revol.bal,revol.util,inq.last.6mths,delinq.2yrs,pub.rec,not.fully.paid
0,1,debt_consolidation,0.1189,829.10,11.350407,19.48,737.0,5639.958333,28854.0,52.1,0.0,0.0,0,0
1,1,credit_card,0.1071,228.22,11.082143,14.29,707.0,2760.000000,33623.0,76.7,0.0,0.0,0,0
2,1,debt_consolidation,0.1357,366.86,10.373491,11.63,682.0,4710.000000,3511.0,25.6,1.0,0.0,0,0
3,1,debt_consolidation,0.1008,162.34,11.350407,8.10,712.0,2699.958333,33667.0,73.2,1.0,0.0,0,0
4,1,credit_card,0.1426,102.92,11.299732,14.97,667.0,4066.000000,4740.0,39.5,0.0,1.0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9573,0,all_other,0.1461,344.76,12.180755,10.39,672.0,10474.000000,215372.0,82.1,2.0,0.0,0,1
9574,0,all_other,0.1253,257.70,11.141862,0.21,722.0,4380.000000,184.0,1.1,5.0,0.0,0,1
9575,0,debt_consolidation,0.1071,97.81,10.596635,13.09,687.0,3450.041667,10036.0,82.9,8.0,0.0,0,1
9576,0,home_improvement,0.1600,351.58,10.819778,19.18,692.0,1800.000000,0.0,3.2,5.0,0.0,0,1


In [21]:
# Reload the raw data so that `purpose` contains its original missing values.
df = pd.read_csv('loan_data.csv')

# Forward fill: each missing `purpose` takes the last valid value above it.
# Series.ffill() replaces fillna(method='ffill'), whose `method` keyword is
# deprecated since pandas 2.1.
df['purpose'] = df['purpose'].ffill()
df

Unnamed: 0,credit.policy,purpose,int.rate,installment,log.annual.inc,dti,fico,days.with.cr.line,revol.bal,revol.util,inq.last.6mths,delinq.2yrs,pub.rec,not.fully.paid
0,1,debt_consolidation,0.1189,829.10,11.350407,19.48,737.0,5639.958333,28854.0,52.1,0.0,0.0,0,0
1,1,credit_card,0.1071,228.22,11.082143,14.29,707.0,2760.000000,33623.0,76.7,0.0,0.0,0,0
2,1,credit_card,0.1357,366.86,10.373491,11.63,682.0,4710.000000,3511.0,25.6,1.0,0.0,0,0
3,1,debt_consolidation,0.1008,162.34,11.350407,8.10,712.0,2699.958333,33667.0,73.2,1.0,0.0,0,0
4,1,credit_card,0.1426,102.92,11.299732,14.97,667.0,4066.000000,4740.0,39.5,0.0,1.0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9573,0,all_other,0.1461,344.76,12.180755,10.39,672.0,10474.000000,215372.0,82.1,2.0,0.0,0,1
9574,0,all_other,0.1253,257.70,11.141862,0.21,722.0,4380.000000,184.0,1.1,5.0,0.0,0,1
9575,0,debt_consolidation,0.1071,97.81,10.596635,13.09,687.0,3450.041667,10036.0,82.9,8.0,0.0,0,1
9576,0,home_improvement,0.1600,351.58,10.819778,19.18,692.0,1800.000000,0.0,3.2,5.0,0.0,0,1
