In [1]:
import pandas as pd
import numpy as np

In [23]:
# Load the loans dataset; the delimiter is spelled out even though ',' is the default.
loansData = pd.read_csv('loansData.csv', sep=',')

You won't always be able to read the data this simply. Sometimes the file has no header row, or the values are separated by something other than ','. In those cases you need to pass the matching arguments (for example `names=` or `sep=`) to `read_csv` itself.

What's the first thing you should do when you get a dataset?

In [3]:
# Peek at the first five rows to see the columns and the raw value formats.
loansData.head(n=5)

Unnamed: 0,CustNUm,Amount.Requested,Amount.Funded.By.Investors,Interest.Rate,Loan.Length,Loan.Purpose,Debt.To.Income.Ratio,State,Home.Ownership,Monthly.Income,FICO.Range,Open.CREDIT.Lines,Revolving.CREDIT.Balance,Inquiries.in.the.Last.6.Months,Employment.Length
0,81174,20000,20000.0,8.90%,36 months,debt_consolidation,14.90%,SC,MORTGAGE,6541.67,735-739,14.0,14272.0,2.0,< 1 year
1,99592,19200,19200.0,12.12%,36 months,debt_consolidation,28.36%,TX,MORTGAGE,4583.33,715-719,12.0,11140.0,1.0,2 years
2,80059,35000,35000.0,21.98%,60 months,debt_consolidation,23.81%,CA,MORTGAGE,11500.0,690-694,14.0,21977.0,1.0,2 years
3,15825,10000,9975.0,9.99%,36 months,debt_consolidation,14.30%,KS,MORTGAGE,3833.33,695-699,10.0,9346.0,0.0,5 years
4,33182,12000,12000.0,11.71%,36 months,credit_card,18.78%,NJ,RENT,3195.0,695-699,11.0,14469.0,0.0,9 years


In [29]:
# Per-column check: True where a column holds at least one missing value.
loansData.isna().any(axis=0)

CustNUm                           False
Amount.Requested                  False
Amount.Funded.By.Investors        False
Interest.Rate                     False
Loan.Length                       False
Loan.Purpose                      False
Debt.To.Income.Ratio              False
State                             False
Home.Ownership                    False
Monthly.Income                     True
FICO.Range                        False
Open.CREDIT.Lines                  True
Revolving.CREDIT.Balance           True
Inquiries.in.the.Last.6.Months     True
Employment.Length                  True
dtype: bool

In [31]:
# True if any row is an exact repeat of an earlier row.
loansData.duplicated(keep='first').any()

False

In [4]:
# Summary statistics (count, mean, std, min, quartiles, max) for the frame.
loansData.describe()

Unnamed: 0,CustNUm,Amount.Requested,Amount.Funded.By.Investors,Monthly.Income,Open.CREDIT.Lines,Revolving.CREDIT.Balance,Inquiries.in.the.Last.6.Months
count,2500.0,2500.0,2500.0,2499.0,2498.0,2498.0,2498.0
mean,51631.5972,12406.5,12001.573236,5688.931321,10.075661,15244.559648,0.906325
std,30053.345508,7801.544872,7745.320754,3963.118185,4.508644,18308.549795,1.231036
min,10.0,1000.0,-0.01,588.5,2.0,0.0,0.0
25%,26523.0,6000.0,6000.0,3500.0,7.0,5585.75,0.0
50%,50653.0,10000.0,10000.0,5000.0,9.0,10962.0,0.0
75%,77580.5,17000.0,16000.0,6800.0,13.0,18888.75,1.0
max,104202.0,35000.0,35000.0,102750.0,38.0,270800.0,9.0


Wait, what happened? Why did we get summaries of only a few columns? What if I wanted to know the most common State?

In [12]:
# Most frequent value(s) in the State column; mode() returns a Series.
loansData['State'].mode()

0    CA
dtype: object

But what if I want a distribution for the State?

In [11]:
# Number of rows per state, most common first.
loansData['State'].value_counts()

CA    433
NY    255
TX    174
FL    169
IL    101
GA     98
PA     96
NJ     94
VA     78
MA     73
OH     71
MD     68
NC     64
CO     61
WA     58
CT     50
AZ     46
MI     45
MN     38
AL     38
MO     33
NV     32
OR     30
SC     28
WI     26
KY     23
LA     22
OK     21
KS     21
UT     16
NH     15
WV     15
RI     15
AR     13
NM     13
HI     12
DC     11
AK     11
DE      8
MT      7
VT      5
WY      4
SD      4
IN      3
MS      1
IA      1
Name: State, dtype: int64

The column "Interest.Rate" consists of numbers. How do I get its summary?

In [17]:
# Inspect the container type, the type of its first element, and the column dtype.
rate_col = loansData['Interest.Rate']
type(rate_col), type(rate_col[0]), rate_col.dtype

(pandas.core.series.Series, str, dtype('O'))

In [24]:
# Convert percentage strings such as '8.90%' into float fractions (0.0890).
# The column is overwritten in place, so guard on its dtype: re-running this
# cell after the conversion would otherwise raise AttributeError, because a
# float column has no .str accessor.
if not pd.api.types.is_numeric_dtype(loansData['Interest.Rate']):
    loansData['Interest.Rate'] = (
        loansData['Interest.Rate'].str.strip('%').astype(float) / 100
    )
loansData['Interest.Rate'].head()

0    0.0890
1    0.1212
2    0.2198
3    0.0999
4    0.1171
Name: Interest.Rate, dtype: float64

What is the average interest rate for each FICO range?

In [27]:
# Mean interest rate within each FICO bucket; the result is indexed by FICO.Range.
fico_grp = loansData.groupby('FICO.Range')[['Interest.Rate']].mean()
fico_grp

Unnamed: 0_level_0,Interest.Rate
FICO.Range,Unnamed: 1_level_1
640-644,0.15212
645-649,0.148833
650-654,0.1513
655-659,0.1493
660-664,0.184925
665-669,0.174481
670-674,0.162485
675-679,0.158547
680-684,0.151267
685-689,0.146549
