In [1]:
import pandas as pd
import numpy as np
import datetime
import boto3
from matplotlib import pyplot as plt
from matplotlib.dates import DateFormatter, date2num
import seaborn as sns
import scipy.stats as stats
import difflib

In [2]:
#reads in sample of kiva data (10k rows)
loan = pd.read_csv('../data/sample_data.csv')

In [3]:
#converting string representations of time to Datetime objects
loan['posted_datetime'] = pd.to_datetime(loan['POSTED_TIME'])
loan['raised_datetime'] = pd.to_datetime(loan['RAISED_TIME'])

#creates loan_speed column as difference between raised and posted datetimes
loan['loan_speed'] = loan['raised_datetime']-loan['posted_datetime']

#represents time to raising loan in number of days (for matplotlib)
loan['loanspeed_days'] = loan['loan_speed'] / pd.Timedelta(hours=24)

#drops na values
loan.dropna(subset=['raised_datetime'], inplace=True)

#creates loan_year column 
loan['loan_year'] = [loan['raised_datetime'][x].year for x in loan.index]


In [4]:
#Function for counting number of borrowers from number of entries in gender column 
def count_borrowers(lst):
    if type(lst) != float:
        return len(lst.split(','))
    else:
        return 1

loan['borrower_n'] = loan['BORROWER_GENDERS'].apply(count_borrowers)


In [5]:
#reads in csv of purchasing power parity values from world bank
#first four rows are skipped to avoid irrelevant metadata
ppp = pd.read_csv('../data/world_bank_ppp.csv', skiprows=4, index_col='Country Name')

#Renames Country Name column to match loan dataframe for merge 
#ppp.rename(columns={'Country Name': 'COUNTRY_NAME'}, inplace=True)



In [6]:
def rel_ppp(country, loan_year):
    for c, y in zip(country, loan_year):
        if y == 2020:
            loan.at[c, '2019']
        elif y <= 2020:
            return ppp.at[c, str(y)]
        else:
            return 0

rel_ppp(loan['COUNTRY_NAME'], loan['loan_year'])

0.580266058444977

In [7]:


#Check for spelling difference between country names in loan data and county names in ppp data

ppp.reset_index(inplace=True)

#set all countries names to lower case 
loan['COUNTRY_NAME'] = [c.lower() for c in loan['COUNTRY_NAME']]
ppp['Country Name'] = [c.lower() for c in ppp['Country Name']]

#returns list of countries that are not matches between the data sets
for c in set(loan['COUNTRY_NAME']):
    match_tup = (c, difflib.get_close_matches(c, ppp['Country Name'], cutoff=1))
    if match_tup[1] == []:
        print(match_tup[0])
        



myanmar (burma)
lao people's democratic republic
congo
the democratic republic of the congo
yemen
palestine
egypt
kyrgyzstan


In [8]:
#Only 8 out of 82 countries are not matches. Most of the remaining difference are due to political
#differences in naming, rather than failure to do a fuzzy match. 



ppp = ppp.rename(index={'west bank and gaza':'palestine', 'congo, rep':'congo', 'myanmar ':'myanmar (burma)',
'yemen, rep.':'yemen', 'lao pdr':'lao people''s democratic republic',
'congo, dem. rep.':'the democratic republic of the congo', 'egypt arab rep.': 'egypt'})

In [9]:
ppp.set_index('Country Name', inplace=True)
ppp.index

Index(['aruba', 'afghanistan', 'angola', 'albania', 'andorra', 'arab world',
       'united arab emirates', 'argentina', 'armenia', 'american samoa',
       ...
       'virgin islands (u.s.)', 'vietnam', 'vanuatu', 'world', 'samoa',
       'kosovo', 'yemen, rep.', 'south africa', 'zambia', 'zimbabwe'],
      dtype='object', name='Country Name', length=264)

In [10]:
    
holder = []
for c,y in zip(loan['COUNTRY_NAME'], loan['loan_year']):
    holder.append([c,y])




In [11]:
for i in range(len(holder)):
    if holder[i][0] not in ppp.index:
        holder[i].append(np.nan)
    elif np.isnan(ppp.loc[holder[i][0], str(holder[i][1])]):
        if (ppp.loc[holder[i][0], str(holder[i][1]-1)]):
            holder[i].append(ppp.loc[holder[i][0], str(holder[i][1]-1)])
        elif (ppp.loc[holder[i][0], str(holder[i][1]-2)]):
            holder[i].append(ppp.loc[holder[i][0], str(holder[i][1]-2)])
        else:    
            holder[i].append(np.nan)
    else:
        holder[i].append(ppp.loc[holder[i][0],str(holder[i][1])])
        



In [14]:
loan['ppp'] = [holder[c][2] for c in range(len(holder))]



In [15]:

loan['ppp'].dropna(inplace=True)

In [19]:
loan.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9500 entries, 0 to 9999
Data columns (total 42 columns):
 #   Column                           Non-Null Count  Dtype              
---  ------                           --------------  -----              
 0   Unnamed: 0                       9500 non-null   int64              
 1   LOAN_ID                          9500 non-null   int64              
 2   LOAN_NAME                        9264 non-null   object             
 3   ORIGINAL_LANGUAGE                9284 non-null   object             
 4   DESCRIPTION                      9284 non-null   object             
 5   DESCRIPTION_TRANSLATED           7124 non-null   object             
 6   FUNDED_AMOUNT                    9500 non-null   float64            
 7   LOAN_AMOUNT                      9500 non-null   float64            
 8   STATUS                           9500 non-null   object             
 9   IMAGE_ID                         9284 non-null   float64            
 10  

In [20]:
loan['ppp_val'] = loan['ppp'] * loan['LOAN_AMOUNT']

0         478.719498
1       10010.628319
2         628.918314
3        4283.709240
4                NaN
            ...     
9995      847.752582
9996     6802.964211
9997     4258.328247
9998      304.920734
9999      904.070076
Name: ppp_val, Length: 9500, dtype: float64