In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn; seaborn.set()

%matplotlib inline

# Loading, Describing and Cleaning

In [2]:
# Read the Kiva loans CSV from the project's data directory and preview the
# first five rows to get a feel for the available columns.
# NOTE(review): read_csv treats the literal string 'NA' as missing by default,
# which may be why Namibia's country_code shows up as null later -- confirm.
KIVA_LOANS_CSV = 'data/kiva_loans.csv'
loans = pd.read_csv(KIVA_LOANS_CSV)
loans.head()

Unnamed: 0,id,funded_amount,loan_amount,activity,sector,use,country_code,country,region,currency,partner_id,posted_time,disbursed_time,funded_time,term_in_months,lender_count,tags,borrower_genders,repayment_interval,date
0,653051,300.0,300.0,Fruits & Vegetables,Food,"To buy seasonal, fresh fruits to sell.",PK,Pakistan,Lahore,PKR,247.0,2014-01-01 06:12:39+00:00,2013-12-17 08:00:00+00:00,2014-01-02 10:06:32+00:00,12.0,12,,female,irregular,2014-01-01
1,653053,575.0,575.0,Rickshaw,Transportation,to repair and maintain the auto rickshaw used ...,PK,Pakistan,Lahore,PKR,247.0,2014-01-01 06:51:08+00:00,2013-12-17 08:00:00+00:00,2014-01-02 09:17:23+00:00,11.0,14,,"female, female",irregular,2014-01-01
2,653068,150.0,150.0,Transportation,Transportation,To repair their old cycle-van and buy another ...,IN,India,Maynaguri,INR,334.0,2014-01-01 09:58:07+00:00,2013-12-17 08:00:00+00:00,2014-01-01 16:01:36+00:00,43.0,6,"user_favorite, user_favorite",female,bullet,2014-01-01
3,653063,200.0,200.0,Embroidery,Arts,to purchase an embroidery machine and a variet...,PK,Pakistan,Lahore,PKR,247.0,2014-01-01 08:03:11+00:00,2013-12-24 08:00:00+00:00,2014-01-01 13:00:00+00:00,11.0,8,,female,irregular,2014-01-01
4,653084,400.0,400.0,Milk Sales,Food,to purchase one buffalo.,PK,Pakistan,Abdul Hakeem,PKR,245.0,2014-01-01 11:53:19+00:00,2013-12-17 08:00:00+00:00,2014-01-01 19:18:51+00:00,14.0,16,,female,monthly,2014-01-01


In [3]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
loans.describe()

Unnamed: 0,id,funded_amount,loan_amount,partner_id,term_in_months,lender_count
count,671205.0,671205.0,671205.0,657698.0,671205.0,671205.0
mean,993248.6,785.995061,842.397107,178.199616,13.739022,20.590922
std,196611.3,1130.398941,1198.660073,94.247581,8.598919,28.459551
min,653047.0,0.0,25.0,9.0,1.0,0.0
25%,823072.0,250.0,275.0,126.0,8.0,7.0
50%,992780.0,450.0,500.0,145.0,13.0,13.0
75%,1163653.0,900.0,1000.0,204.0,14.0,24.0
max,1340339.0,100000.0,100000.0,536.0,158.0,2986.0


- The mean of funded_amount (the amount disbursed by Kiva to the field agent, in USD) is lower than the mean of loan_amount (the amount disbursed by the field agent to the borrower, in USD). The minimum values in the describe() output above also show that there is at least one case where Kiva disbursed no money to the field agent (minimum funded_amount of 0), whereas the field agent always disbursed money (minimum loan_amount of 25).
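- The data is right-skewed, since the mean is greater than the median (the 50th percentile) for both funded_amount (about 786 vs. 450 USD) and loan_amount (about 842 vs. 500 USD). This means that a relatively small number of very large loans pull the mean up, while half of the funded amounts are still 450 USD or less.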
- On average, it takes approximately 14 months (mean 13.7, median 13) to disburse a loan, with some taking as little as 1 month and others as long as 158 months. I think this value represents the total amount of time it took to disburse the full loan amount.
- The average number of lenders who contributed to a loan is approximately 20, with some as low as 0 and some as high as 2986.
- Some loans are missing a partner_id: its count (657698) is lower than the total number of rows (671205).

In [6]:
# Flag, per column, whether it holds at least one missing value
loans.isna().any()

id                    False
funded_amount         False
loan_amount           False
activity              False
sector                False
use                    True
country_code           True
country               False
region                 True
currency              False
partner_id             True
posted_time           False
disbursed_time         True
funded_time            True
term_in_months        False
lender_count          False
tags                   True
borrower_genders       True
repayment_interval    False
date                  False
dtype: bool

- The use column contains Null values. Since it contains long strings, and the activity and sector columns pretty much summarise it, I'll drop this column instead of trying to fill it in.
- The country_code column also contains Null values but the country column does not, which means that we can find out which countries are missing a corresponding country code and fill it in.
- The region column also contains Null values. Since this column might be useful in finding out which regions per country get loans, we'll try to find a way of filling them, such as using the most common region from that particular country; otherwise we'll drop them if that is not possible.

In [13]:
# Drop the free-text 'use' column. errors='ignore' keeps this cell safe to
# re-run: without it, a second execution raises KeyError because the column
# has already been removed from `loans`.
loans = loans.drop(columns='use', errors='ignore')

In [16]:
# Which countries have rows with no country code?
missing_code_mask = loans['country_code'].isna()
loans.loc[missing_code_mask, 'country'].unique()

array(['Namibia'], dtype=object)

In [17]:
# Only Namibia is missing a corresponding country code.
# Every other value in country_code is a two-letter ISO 3166-1 alpha-2 code
# (PK, IN, KE, ...), and Namibia's alpha-2 code is 'NA'. 'NAM' is the
# three-letter alpha-3 code and would not match the rest of the column.
# NOTE(review): the value was probably lost because read_csv treats the
# literal string 'NA' as missing by default -- confirm against the raw CSV.
loans.loc[loans['country'] == 'Namibia', 'country_code'] = 'NA'

In [23]:
# Number of nulls in the region column, counted directly on that column.
# Counting 'country' in the filtered rows gives the same number only as long
# as 'country' itself has no nulls; this form does not rely on that.
loans['region'].isna().sum()

56800

There's a total of 671205 data points in the entire dataset so just dropping these values (56800 of them) is not an option.

In [26]:
# Peek at the first 10 loans whose region is missing
region_missing = loans['region'].isna()
loans[region_missing].head(10)

Unnamed: 0,id,funded_amount,loan_amount,activity,sector,country_code,country,region,currency,partner_id,posted_time,disbursed_time,funded_time,term_in_months,lender_count,tags,borrower_genders,repayment_interval,date
5,1080148,250.0,250.0,Services,Services,KE,Kenya,,KES,,2014-01-01 10:06:19+00:00,2014-01-30 01:42:48+00:00,2014-01-29 14:14:57+00:00,4.0,6,,female,irregular,2014-01-01
49,653418,450.0,450.0,General Store,Retail,SV,El Salvador,,USD,81.0,2014-01-02 22:17:55+00:00,2013-12-21 08:00:00+00:00,2014-01-30 00:19:42+00:00,14.0,18,"#Repeat Borrower, user_favorite",male,monthly,2014-01-02
54,653380,225.0,225.0,Food Market,Food,SN,Senegal,,XOF,108.0,2014-01-02 16:04:50+00:00,2013-12-17 08:00:00+00:00,2014-01-03 02:15:13+00:00,14.0,7,,female,monthly,2014-01-02
67,1080150,125.0,125.0,Energy,Services,KE,Kenya,,KES,,2014-01-02 08:48:38+00:00,2014-01-30 01:42:21+00:00,2014-01-23 13:35:59+00:00,3.0,6,,male,irregular,2014-01-02
70,653244,2000.0,2000.0,Retail,Retail,IQ,Iraq,,USD,166.0,2014-01-02 09:15:05+00:00,2013-12-28 08:00:00+00:00,2014-01-31 16:27:13+00:00,15.0,71,"#Schooling, #Parent, #Biz Durable Asset, user_...",male,monthly,2014-01-02
99,1080153,5000.0,5000.0,Food Production/Sales,Food,US,United States,,USD,,2014-01-02 20:13:47+00:00,2014-03-10 22:53:12+00:00,2014-03-11 05:53:06+00:00,30.0,261,,female,monthly,2014-01-02
102,653399,3975.0,3975.0,Food Stall,Food,PE,Peru,,PEN,93.0,2014-01-02 17:08:53+00:00,2013-12-16 08:00:00+00:00,2014-01-04 00:26:05+00:00,6.0,84,,"female, female, female, female, female, female...",irregular,2014-01-02
111,653376,225.0,225.0,Retail,Retail,SN,Senegal,,XOF,108.0,2014-01-02 15:52:20+00:00,2013-12-17 08:00:00+00:00,2014-01-05 07:33:21+00:00,15.0,5,,female,monthly,2014-01-02
112,653256,1925.0,2400.0,Electronics Repair,Services,IQ,Iraq,,USD,166.0,2014-01-02 09:44:10+00:00,2013-12-29 08:00:00+00:00,,15.0,41,"#Single, #Supporting Family, #Eco-friendly, us...",male,monthly,2014-01-02
114,1080151,125.0,125.0,Energy,Services,KE,Kenya,,KES,,2014-01-02 10:43:30+00:00,2014-01-30 01:42:13+00:00,2014-01-23 09:47:34+00:00,3.0,7,,female,irregular,2014-01-02
