# 10 Minutes to Pandas

> Link : https://pandas.pydata.org/pandas-docs/stable/getting_started/10min.html

----

In [1]:
import pandas as pd
import numpy as np

## Object Creation

In [2]:
# Build a Series from a plain list; pandas supplies a default integer index.
# The np.nan entry marks a missing value.
s = pd.Series(data=[2, 3, 5, np.nan, 0])
s

0    2.0
1    3.0
2    5.0
3    NaN
4    0.0
dtype: float64

In [3]:
# Six consecutive calendar days starting on 2020-01-01, as a DatetimeIndex.
dates = pd.date_range(start='2020-01-01', periods=6, freq='D')
dates

DatetimeIndex(['2020-01-01', '2020-01-02', '2020-01-03', '2020-01-04',
               '2020-01-05', '2020-01-06'],
              dtype='datetime64[ns]', freq='D')

In [4]:
# A 6x4 frame of random integers in [0, 10): one row per entry of `dates`,
# columns labelled A-D. No seed is set, so the values differ on every run.
df = pd.DataFrame(
    data=np.random.randint(low=0, high=10, size=(6, 4)),
    index=dates,
    columns=['A', 'B', 'C', 'D'],
)
df

Unnamed: 0,A,B,C,D
2020-01-01,7,4,0,2
2020-01-02,3,0,0,9
2020-01-03,6,8,7,6
2020-01-04,5,3,0,1
2020-01-05,3,9,2,7
2020-01-06,8,8,9,7


Creating a DataFrame by passing a dict of objects that can be converted to a series-like structure.

In [5]:
# Integers 0..3 as a NumPy array; the same expression supplies column 'D'
# of the dict-based DataFrame built further down.
np.array(list(range(0,4)))

array([0, 1, 2, 3])

In [6]:
# Random integers drawn from [0, 100), arranged as a 4x1 column.
# No seed is set, so the values change on every run.
np.random.randint(0,100,size=(4,1))

array([[48],
       [88],
       [31],
       [76]])

In [7]:
# The list [3, 3, 3, 3] stored as 32-bit integers.
np.array([3] * 4, dtype='int32')

array([3, 3, 3, 3])

In [8]:
# Four random integers from [0, 100) as a flat array; values change on every run.
np.array(np.random.randint(0,100,4))

array([68, 87, 18, 25])

In [9]:
# Build a DataFrame from a dict of column specs. The scalar under 'A' is
# repeated for every row; the arrays and the Series each supply four values.
# 'B' is random, so it differs from run to run.
df2 = pd.DataFrame(
    data={
        'A': 1,
        'B': np.array(np.random.randint(0, 100, size=(4))),
        'C': pd.Series([1, 2, 56, 87], dtype='float32'),
        'D': np.array(list(range(0, 4))),
    },
    index=list(range(0, 4)),
)

df2

Unnamed: 0,A,B,C,D
0,1,69,1.0,0
1,1,29,2.0,1
2,1,68,56.0,2
3,1,44,87.0,3


In [10]:
# One column per kind of input: a float scalar, a Timestamp scalar, a float32
# Series, an int32 array, a Categorical and a string scalar. Scalars are
# repeated for every row; the row index comes from the Series under 'C'.
# (The pasted IPython "...:" continuation prompts were removed so the cell is
# valid Python outside IPython as well.)
df2 = pd.DataFrame({
    'A': 1.,
    'B': pd.Timestamp('20130102'),
    'C': pd.Series(1, index=list(range(4)), dtype='float32'),
    'D': np.array([3] * 4, dtype='int32'),
    'E': pd.Categorical(["test", "train", "test", "train"]),
    'F': 'foo',
})

In [11]:
# Each column of df2 carries its own dtype.
df2.dtypes

A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object

In [12]:
# Load the loan data set from a relative path, using the Loan_ID column as the
# row index. NOTE: this rebinds `df`, replacing the random frame created earlier.
df = pd.read_csv("Loan_Prediction.csv", index_col = "Loan_ID")

## Viewing data

In [13]:
# Show the first five rows.
df.head(5)

Unnamed: 0_level_0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
Loan_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [14]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df.describe()

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History
count,614.0,614.0,592.0,600.0,564.0
mean,5403.459283,1621.245798,146.412162,342.0,0.842199
std,6109.041673,2926.248369,85.587325,65.12041,0.364878
min,150.0,0.0,9.0,12.0,0.0
25%,2877.5,0.0,100.0,360.0,1.0
50%,3812.5,1188.5,128.0,360.0,1.0
75%,5795.0,2297.25,168.0,360.0,1.0
max,81000.0,41667.0,700.0,480.0,1.0


In [15]:
# (number of rows, number of columns)
df.shape

(614, 12)

In [16]:
# Number of columns.
df.shape[1]

12

In [17]:
# Number of rows.
df.shape[0]

614

In [18]:
# Transpose: the Loan_ID labels become the columns and the original columns become rows.
df.T

Loan_ID,LP001002,LP001003,LP001005,LP001006,LP001008,LP001011,LP001013,LP001014,LP001018,LP001020,...,LP002959,LP002960,LP002961,LP002964,LP002974,LP002978,LP002979,LP002983,LP002984,LP002990
Gender,Male,Male,Male,Male,Male,Male,Male,Male,Male,Male,...,Female,Male,Male,Male,Male,Female,Male,Male,Male,Female
Married,No,Yes,Yes,Yes,No,Yes,Yes,Yes,Yes,Yes,...,Yes,Yes,Yes,Yes,Yes,No,Yes,Yes,Yes,No
Dependents,0,1,0,0,0,2,0,3+,2,1,...,1,0,1,2,0,0,3+,1,2,0
Education,Graduate,Graduate,Graduate,Not Graduate,Graduate,Graduate,Not Graduate,Graduate,Graduate,Graduate,...,Graduate,Not Graduate,Graduate,Not Graduate,Graduate,Graduate,Graduate,Graduate,Graduate,Graduate
Self_Employed,No,No,Yes,No,No,Yes,No,No,No,No,...,No,No,No,No,No,No,No,No,No,Yes
ApplicantIncome,5849,4583,3000,2583,6000,5417,2333,3036,4006,12841,...,12000,2400,3400,3987,3232,2900,4106,8072,7583,4583
CoapplicantIncome,0,1508,0,2358,0,4196,1516,2504,1526,10968,...,0,3800,2500,1411,1950,0,0,240,0,0
LoanAmount,,128,66,120,141,267,95,158,168,349,...,496,,173,157,108,71,40,253,187,133
Loan_Amount_Term,360,360,360,360,360,360,360,360,360,360,...,360,180,360,360,360,360,180,360,360,360
Credit_History,1,1,1,1,1,1,1,0,1,1,...,1,1,1,1,1,1,1,1,1,0


## Sorting by axis

In [19]:
# Sort rows by ApplicantIncome, largest first. The sorted frame is returned; df is not modified.
df.sort_values(by='ApplicantIncome', ascending=False)

Unnamed: 0_level_0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
Loan_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
LP002317,Male,Yes,3+,Graduate,No,81000,0.0,360.0,360.0,0.0,Rural,N
LP002101,Male,Yes,0,Graduate,,63337,0.0,490.0,180.0,1.0,Urban,Y
LP001585,,Yes,3+,Graduate,No,51763,0.0,700.0,300.0,1.0,Urban,Y
LP001536,Male,Yes,3+,Graduate,No,39999,0.0,600.0,180.0,0.0,Semiurban,Y
LP001640,Male,Yes,0,Graduate,Yes,39147,4750.0,120.0,360.0,1.0,Semiurban,Y
LP002422,Male,No,1,Graduate,No,37719,0.0,152.0,360.0,1.0,Semiurban,Y
LP001637,Male,Yes,1,Graduate,No,33846,0.0,260.0,360.0,1.0,Semiurban,N
LP001448,,Yes,3+,Graduate,No,23803,0.0,370.0,360.0,1.0,Rural,Y
LP002624,Male,Yes,0,Graduate,No,20833,6667.0,480.0,360.0,,Urban,Y
LP001922,Male,Yes,0,Graduate,No,20667,0.0,,360.0,1.0,Rural,N


In [20]:
# Sort rows by their index labels (Loan_ID) in descending order.
df.sort_index(axis=0, ascending=False)

Unnamed: 0_level_0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
Loan_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
LP002990,Female,No,0,Graduate,Yes,4583,0.0,133.0,360.0,0.0,Semiurban,N
LP002984,Male,Yes,2,Graduate,No,7583,0.0,187.0,360.0,1.0,Urban,Y
LP002983,Male,Yes,1,Graduate,No,8072,240.0,253.0,360.0,1.0,Urban,Y
LP002979,Male,Yes,3+,Graduate,No,4106,0.0,40.0,180.0,1.0,Rural,Y
LP002978,Female,No,0,Graduate,No,2900,0.0,71.0,360.0,1.0,Rural,Y
LP002974,Male,Yes,0,Graduate,No,3232,1950.0,108.0,360.0,1.0,Rural,Y
LP002964,Male,Yes,2,Not Graduate,No,3987,1411.0,157.0,360.0,1.0,Rural,Y
LP002961,Male,Yes,1,Graduate,No,3400,2500.0,173.0,360.0,1.0,Semiurban,Y
LP002960,Male,Yes,0,Not Graduate,No,2400,3800.0,,180.0,1.0,Urban,N
LP002959,Female,Yes,1,Graduate,No,12000,0.0,496.0,360.0,1.0,Semiurban,Y


## Selection

The optimized pandas data access methods are `.at`, `.iat`, `.loc` and `.iloc`.

In [21]:
# Selecting with a one-element list of column names yields a one-column
# DataFrame directly; show its first five rows.
df[['Gender']].head()

Unnamed: 0_level_0,Gender
Loan_ID,Unnamed: 1_level_1
LP001002,Male
LP001003,Male
LP001005,Male
LP001006,Male
LP001008,Male


In [22]:
# Slicing with [] and integers selects rows by position: here the first three rows.
df[0:3]

Unnamed: 0_level_0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
Loan_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y


In [23]:
# Slicing with [] and index labels selects the rows whose Loan_ID lies between
# the two bounds (the output shows LP001002 through LP001008).
df['LP001002':'LP001010']

Unnamed: 0_level_0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
Loan_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


## Selection by Label and by Position

In [24]:
# .loc selects by label: all rows (:) and the two named columns; show the first five rows.
df.loc[:,['Gender','Married']].head()

Unnamed: 0_level_0,Gender,Married
Loan_ID,Unnamed: 1_level_1,Unnamed: 2_level_1
LP001002,Male,No
LP001003,Male,Yes
LP001005,Male,Yes
LP001006,Male,Yes
LP001008,Male,No


In [25]:
# .iloc selects by integer position. Passing the column position inside a list
# keeps the result a one-column DataFrame (position 2 is Dependents).
df.iloc[:, [2]].head()

Unnamed: 0_level_0,Dependents
Loan_ID,Unnamed: 1_level_1
LP001002,0
LP001003,1
LP001005,0
LP001006,0
LP001008,0


Fast access to a single scalar value using `.at` (by label) and `.iat` (by integer position):

In [26]:
# .at reads one value addressed by row label and column label.
df.at['LP001002','Gender']

'Male'

In [27]:
# .iat reads one value addressed by integer row and column position.
df.iat[0,0]

'Male'

## Boolean Indexing

In [28]:
# Boolean indexing: keep only the rows where Gender equals 'Male'.
df[df['Gender'] == 'Male']

Unnamed: 0_level_0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
Loan_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y
LP001011,Male,Yes,2,Graduate,Yes,5417,4196.0,267.0,360.0,1.0,Urban,Y
LP001013,Male,Yes,0,Not Graduate,No,2333,1516.0,95.0,360.0,1.0,Urban,Y
LP001014,Male,Yes,3+,Graduate,No,3036,2504.0,158.0,360.0,0.0,Semiurban,N
LP001018,Male,Yes,2,Graduate,No,4006,1526.0,168.0,360.0,1.0,Urban,Y
LP001020,Male,Yes,1,Graduate,No,12841,10968.0,349.0,360.0,1.0,Semiurban,N


In [29]:
# Keep rows with ApplicantIncome above 1000, then sort them by LoanAmount (smallest first).
df[df['ApplicantIncome'] > 1000].sort_values(by='LoanAmount')

Unnamed: 0_level_0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
Loan_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
LP002840,Female,No,0,Graduate,No,2378,0.0,9.0,360.0,1.0,Urban,N
LP001030,Male,Yes,2,Graduate,No,1299,1086.0,17.0,120.0,1.0,Urban,Y
LP001325,Male,No,0,Not Graduate,No,3620,0.0,25.0,120.0,1.0,Semiurban,Y
LP001482,Male,Yes,0,Graduate,Yes,3459,0.0,25.0,120.0,1.0,Semiurban,Y
LP002792,Male,Yes,1,Graduate,No,5468,1032.0,26.0,360.0,1.0,Semiurban,Y
LP001888,Female,No,0,Graduate,No,3237,0.0,30.0,360.0,1.0,Urban,Y
LP001518,Male,Yes,1,Graduate,No,1538,1425.0,30.0,360.0,1.0,Urban,Y
LP001086,Male,No,0,Not Graduate,No,1442,0.0,35.0,360.0,1.0,Urban,N
LP002894,Female,Yes,0,Graduate,No,3166,0.0,36.0,360.0,1.0,Semiurban,Y
LP002634,Female,No,1,Graduate,No,13262,0.0,40.0,360.0,1.0,Urban,Y


In [30]:
# Number of rows in the frame.
len(df)

614

In [31]:
# Number the rows 0..n-1 in their current order. The length is taken from the
# frame itself rather than hard-coded, so the cell keeps working if the CSV
# gains or loses rows.
df['Rank'] = np.arange(len(df))

In [32]:
# Show the last five rows.
df.tail()

Unnamed: 0_level_0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status,Rank
Loan_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
LP002978,Female,No,0,Graduate,No,2900,0.0,71.0,360.0,1.0,Rural,Y,609
LP002979,Male,Yes,3+,Graduate,No,4106,0.0,40.0,180.0,1.0,Rural,Y,610
LP002983,Male,Yes,1,Graduate,No,8072,240.0,253.0,360.0,1.0,Urban,Y,611
LP002984,Male,Yes,2,Graduate,No,7583,0.0,187.0,360.0,1.0,Urban,Y,612
LP002990,Female,No,0,Graduate,Yes,4583,0.0,133.0,360.0,0.0,Semiurban,N,613


In [33]:
# isin() builds a boolean mask that is True where Rank is 1 or 2; use it to filter rows.
df[df['Rank'].isin([1,2])]

Unnamed: 0_level_0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status,Rank
Loan_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N,1
LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y,2


## Setting

In [34]:
# Setting through .loc with a column label: every row of column 'D' is set to 5.
# 'D' does not exist in the loaded data, so this adds it as a new column.
df.loc[:, 'D'] = 5

In [35]:
# Show the first five rows to confirm the assignment.
df.head()

Unnamed: 0_level_0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status,Rank,D
Loan_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y,0,5
LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N,1,5
LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y,2,5
LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y,3,5
LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y,4,5


## Missing Data

Drop any rows that have missing data:

In [36]:
# how='any' drops a row if at least one of its values is missing.
# Only the shape of the resulting frame is shown.
df.dropna(how='any').shape

(480, 14)

In [37]:
# Shape of df itself, for comparison with the dropna result.
df.shape

(614, 14)

Filling missing data.

In [38]:
# Replace every missing value with 5 (see LoanAmount in the first row of the output).
df.fillna(value=5)

Unnamed: 0_level_0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status,Rank,D
Loan_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
LP001002,Male,No,0,Graduate,No,5849,0.0,5.0,360.0,1.0,Urban,Y,0,5
LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N,1,5
LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y,2,5
LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y,3,5
LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y,4,5
LP001011,Male,Yes,2,Graduate,Yes,5417,4196.0,267.0,360.0,1.0,Urban,Y,5,5
LP001013,Male,Yes,0,Not Graduate,No,2333,1516.0,95.0,360.0,1.0,Urban,Y,6,5
LP001014,Male,Yes,3+,Graduate,No,3036,2504.0,158.0,360.0,0.0,Semiurban,N,7,5
LP001018,Male,Yes,2,Graduate,No,4006,1526.0,168.0,360.0,1.0,Urban,Y,8,5
LP001020,Male,Yes,1,Graduate,No,12841,10968.0,349.0,360.0,1.0,Semiurban,N,9,5


Get a boolean mask that is `True` wherever a value is `NaN`:

In [39]:
# Frame of booleans with the same shape as df: True marks a missing value.
pd.isna(df)

Unnamed: 0_level_0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status,Rank,D
Loan_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
LP001002,False,False,False,False,False,False,False,True,False,False,False,False,False,False
LP001003,False,False,False,False,False,False,False,False,False,False,False,False,False,False
LP001005,False,False,False,False,False,False,False,False,False,False,False,False,False,False
LP001006,False,False,False,False,False,False,False,False,False,False,False,False,False,False
LP001008,False,False,False,False,False,False,False,False,False,False,False,False,False,False
LP001011,False,False,False,False,False,False,False,False,False,False,False,False,False,False
LP001013,False,False,False,False,False,False,False,False,False,False,False,False,False,False
LP001014,False,False,False,False,False,False,False,False,False,False,False,False,False,False
LP001018,False,False,False,False,False,False,False,False,False,False,False,False,False,False
LP001020,False,False,False,False,False,False,False,False,False,False,False,False,False,False


## Operations

> Stats

In [40]:
# Mean of each numeric column. numeric_only=True restricts the reduction to
# numeric columns explicitly: older pandas dropped the text columns silently,
# while pandas 2.0+ raises on them instead.
df.mean(numeric_only=True)

ApplicantIncome      5403.459283
CoapplicantIncome    1621.245798
LoanAmount            146.412162
Loan_Amount_Term      342.000000
Credit_History          0.842199
Rank                  306.500000
D                       5.000000
dtype: float64

In [41]:
# Summary statistics of the numeric columns (now also covering Rank and D).
df.describe()

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Rank,D
count,614.0,614.0,592.0,600.0,564.0,614.0,614.0
mean,5403.459283,1621.245798,146.412162,342.0,0.842199,306.5,5.0
std,6109.041673,2926.248369,85.587325,65.12041,0.364878,177.390811,0.0
min,150.0,0.0,9.0,12.0,0.0,0.0,5.0
25%,2877.5,0.0,100.0,360.0,1.0,153.25,5.0
50%,3812.5,1188.5,128.0,360.0,1.0,306.5,5.0
75%,5795.0,2297.25,168.0,360.0,1.0,459.75,5.0
max,81000.0,41667.0,700.0,480.0,1.0,613.0,5.0


> Apply

In [42]:
# Keep only the float64 columns, then apply a function to each column:
# here the value range (max minus min).
df.select_dtypes(include=[np.float64]).apply(lambda x: x.max() - x.min())

CoapplicantIncome    41667.0
LoanAmount             691.0
Loan_Amount_Term       468.0
Credit_History           1.0
dtype: float64

In [43]:
 df.select_dtypes(include=['number']).dtypes

ApplicantIncome        int64
CoapplicantIncome    float64
LoanAmount           float64
Loan_Amount_Term     float64
Credit_History       float64
Rank                   int32
D                      int64
dtype: object

In [44]:
 df.select_dtypes(include=['floating']).dtypes

CoapplicantIncome    float64
LoanAmount           float64
Loan_Amount_Term     float64
Credit_History       float64
dtype: object

In [45]:
 df.select_dtypes(include=['integer']).dtypes

ApplicantIncome    int64
Rank               int32
D                  int64
dtype: object

> https://stackoverflow.com/questions/21271581/selecting-pandas-columns-by-dtype

In [46]:
# Convert ApplicantIncome to float64; the mapping leaves every other column's dtype untouched.
df = df.astype({'ApplicantIncome': 'float64'})

In [47]:
# Re-check the numeric dtypes after the cast.
df.select_dtypes(include=['number']).dtypes

ApplicantIncome      float64
CoapplicantIncome    float64
LoanAmount           float64
Loan_Amount_Term     float64
Credit_History       float64
Rank                   int32
D                      int64
dtype: object

In [48]:
# Remove the two helper columns. The result is rebound instead of using
# inplace=True, and columns= states the axis explicitly.
df = df.drop(columns=['Rank', 'D'])

In [49]:
# Re-check the numeric columns after the drop.
df.select_dtypes(include=['number']).dtypes

ApplicantIncome      float64
CoapplicantIncome    float64
LoanAmount           float64
Loan_Amount_Term     float64
Credit_History       float64
dtype: object

In [50]:
# Count how often each Gender value occurs (missing values are not counted).
df['Gender'].value_counts()

Male      489
Female    112
Name: Gender, dtype: int64

## Merge

> Concat: works like SQL `UNION ALL` — the rows of the frames are stacked and duplicates are kept

In [51]:
# First two rows of the loan frame.
df1 = df.head(2)
df1

Unnamed: 0_level_0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
Loan_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
LP001002,Male,No,0,Graduate,No,5849.0,0.0,,360.0,1.0,Urban,Y
LP001003,Male,Yes,1,Graduate,No,4583.0,1508.0,128.0,360.0,1.0,Rural,N


In [52]:
# Last two rows of the loan frame.
df2 = df.tail(2)
df2

Unnamed: 0_level_0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
Loan_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
LP002984,Male,Yes,2,Graduate,No,7583.0,0.0,187.0,360.0,1.0,Urban,Y
LP002990,Female,No,0,Graduate,Yes,4583.0,0.0,133.0,360.0,0.0,Semiurban,N


In [53]:
# Stack the two frames vertically: the rows of df1 come first, then those of df2.
df3 = pd.concat([df1, df2])
df3

Unnamed: 0_level_0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
Loan_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
LP001002,Male,No,0,Graduate,No,5849.0,0.0,,360.0,1.0,Urban,Y
LP001003,Male,Yes,1,Graduate,No,4583.0,1508.0,128.0,360.0,1.0,Rural,N
LP002984,Male,Yes,2,Graduate,No,7583.0,0.0,187.0,360.0,1.0,Urban,Y
LP002990,Female,No,0,Graduate,Yes,4583.0,0.0,133.0,360.0,0.0,Semiurban,N


> Left join

In [54]:
# Left side of the join: two columns, all rows, same Loan_ID index as df.
df1 = df[['Married', 'Dependents']]

In [55]:
# Right side of the join: two other columns, all rows, same Loan_ID index as df.
df2 = df[['ApplicantIncome', 'Loan_Amount_Term']]

In [56]:
# Left-join the two column subsets on their index; sort=True orders the result
# by the join key. Arguments that only restated defaults (suffixes, validate)
# were dropped, along with copy=, which is deprecated in pandas 3.0.
df3 = pd.merge(
    df1,
    df2,
    how='left',
    left_index=True,
    right_index=True,
    sort=True,
)

df3.head()

Unnamed: 0_level_0,Married,Dependents,ApplicantIncome,Loan_Amount_Term
Loan_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
LP001002,No,0,5849.0,360.0
LP001003,Yes,1,4583.0,360.0
LP001005,Yes,0,3000.0,360.0
LP001006,Yes,0,2583.0,360.0
LP001008,No,0,6000.0,360.0


In [57]:
# Lookup table with a single row: one column per Married code ('Yes' / 'No'),
# holding the human-readable description.
married = pd.DataFrame(
    data=[['Married', 'Not Married']],
    columns=['Yes', 'No'],
)
married

Unnamed: 0,Yes,No
0,Married,Not Married


In [58]:
# Flip the lookup table so the codes become the index and the description sits
# in a single column. Re-running this cell would flip it back.
married = married.T

In [59]:
# Display the lookup table in its current orientation.
married

Unnamed: 0,0
Yes,Married
No,Not Married


In [60]:
# Look up each row's Married value in the index of the `married` table and
# attach the matching description; how='left' keeps every row of df3.
df4 = df3.merge(married, how='left', left_on='Married', right_index=True)

In [61]:
# The joined description column is still called 0; give it a readable name.
df4 = df4.rename(columns={0: "Married_Desc"})
df4.head()

Unnamed: 0_level_0,Married,Dependents,ApplicantIncome,Loan_Amount_Term,Married_Desc
Loan_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
LP001002,No,0,5849.0,360.0,Not Married
LP001003,Yes,1,4583.0,360.0,Married
LP001005,Yes,0,3000.0,360.0,Married
LP001006,Yes,0,2583.0,360.0,Married
LP001008,No,0,6000.0,360.0,Not Married


## Grouping

By “group by” we are referring to a process involving one or more of the following steps:

> Splitting the data into groups based on some criteria

> Applying a function to each group independently

> Combining the results into a data structure

### Grouping and then applying the sum() function to the resulting groups.

In [62]:
# Split the rows by (Married, Dependents) and total the numeric columns of each
# group. numeric_only=True is stated explicitly because pandas 2.0+ no longer
# drops the text columns silently.
df.groupby(by=['Married', 'Dependents']).sum(numeric_only=True)

Unnamed: 0_level_0,Unnamed: 1_level_0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History
Married,Dependents,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
No,0,824287.0,216426.0,20956.0,58956.0,138.0
No,1,145557.0,10610.0,2607.0,7476.0,17.0
No,2,32738.0,6731.0,1019.0,2880.0,5.0
No,3+,41060.0,46660.0,1550.0,2340.0,5.0
Yes,0,872224.0,343749.120001,24430.0,57312.0,133.0
Yes,1,462595.0,134866.0,12938.0,25788.0,59.0
Yes,2,464867.0,163680.799988,13703.0,31548.0,76.0
Yes,3+,396582.0,56580.0,7804.0,13920.0,32.0


In [63]:
# Same aggregation, then reset_index() turns the group keys back into ordinary
# columns. numeric_only=True is stated explicitly because pandas 2.0+ no longer
# drops the text columns silently.
df.groupby(by=['Married', 'Dependents']).sum(numeric_only=True).reset_index()

Unnamed: 0,Married,Dependents,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History
0,No,0,824287.0,216426.0,20956.0,58956.0,138.0
1,No,1,145557.0,10610.0,2607.0,7476.0,17.0
2,No,2,32738.0,6731.0,1019.0,2880.0,5.0
3,No,3+,41060.0,46660.0,1550.0,2340.0,5.0
4,Yes,0,872224.0,343749.120001,24430.0,57312.0,133.0
5,Yes,1,462595.0,134866.0,12938.0,25788.0,59.0
6,Yes,2,464867.0,163680.799988,13703.0,31548.0,76.0
7,Yes,3+,396582.0,56580.0,7804.0,13920.0,32.0


## Pivot tables

In [64]:
# Total LoanAmount per (Married, Dependents) combination. The aggregation is
# named by the string 'sum'; passing the builtin `sum` triggers a FutureWarning
# on pandas 2.1+.
pd.pivot_table(df, index=['Married', 'Dependents'], values='LoanAmount', aggfunc='sum')

Unnamed: 0_level_0,Unnamed: 1_level_0,LoanAmount
Married,Dependents,Unnamed: 2_level_1
No,0,20956.0
No,1,2607.0
No,2,1019.0
No,3+,1550.0
Yes,0,24430.0
Yes,1,12938.0
Yes,2,13703.0
Yes,3+,7804.0


In [66]:
# Total LoanAmount per (Married, Dependents) row and Self_Employed column.
# Combinations with no rows stay NaN. The aggregation is named by the string
# 'sum'; passing the builtin `sum` triggers a FutureWarning on pandas 2.1+.
output = pd.pivot_table(
    df,
    index=['Married', 'Dependents'],
    columns=['Self_Employed'],
    values='LoanAmount',
    aggfunc='sum',
)

output

Unnamed: 0_level_0,Self_Employed,No,Yes
Married,Dependents,Unnamed: 2_level_1,Unnamed: 3_level_1
No,0,16291.0,3370.0
No,1,1601.0,666.0
No,2,899.0,
No,3+,908.0,292.0
Yes,0,20511.0,2285.0
Yes,1,9778.0,2648.0
Yes,2,10112.0,3187.0
Yes,3+,6762.0,932.0


## Export to CSV/Excel

In [67]:
# Write the pivot table to the relative path output.csv.
output.to_csv('output.csv')

In [70]:
# reset_index() turns the Married/Dependents index levels into ordinary columns
# before the frame is written to sheet 'Sheet1' of foo.xlsx.
output.reset_index().to_excel('foo.xlsx', sheet_name='Sheet1')