In [1]:
import pandas as pd
import numpy as np
import datetime
import boto3
from matplotlib import pyplot as plt
from matplotlib.dates import DateFormatter, date2num
import seaborn as sns
import scipy.stats as stats
import difflib

In [2]:
#reads in sample of kiva data (10k rows)
loan = pd.read_csv('../data/sample_data.csv')

In [3]:
#converting string representations of time to Datetime objects
loan['posted_datetime'] = pd.to_datetime(loan['POSTED_TIME'])
loan['raised_datetime'] = pd.to_datetime(loan['RAISED_TIME'])

#creates loan_speed column as difference between raised and posted datetimes
loan['loan_speed'] = loan['raised_datetime']-loan['posted_datetime']

#represents time to raising loan in number of days (for matplotlib)
loan['loanspeed_days'] = loan['loan_speed'] / pd.Timedelta(hours=24)

#drops na values
loan.dropna(subset=['raised_datetime'], inplace=True)

#creates loan_year column 
loan['loan_year'] = [loan['raised_datetime'][x].year for x in loan.index]


In [4]:
#Function for counting number of borrowers from number of entries in gender column 
def count_borrowers(lst):
    if type(lst) != float:
        return len(lst.split(','))
    else:
        return 1

loan['borrower_n'] = loan['BORROWER_GENDERS'].apply(count_borrowers)


In [5]:
#reads in csv of purchasing power parity values from world bank
#first four rows are skipped to avoid irrelevant metadata
ppp = pd.read_csv('../data/world_bank_ppp.csv', skiprows=4, index_col='Country Name')

#Renames Country Name column to match loan dataframe for merge 
#ppp.rename(columns={'Country Name': 'COUNTRY_NAME'}, inplace=True)



In [6]:
def rel_ppp(country, loan_year):
    for c, y in zip(country, loan_year):
        if y == 2020:
            loan.at[c, '2019']
        elif y <= 2020:
            return ppp.at[c, str(y)]
        else:
            return 0

rel_ppp(loan['COUNTRY_NAME'], loan['loan_year'])

0.580266058444977

In [7]:


#Check for spelling difference between country names in loan data and county names in ppp data

ppp.reset_index(inplace=True)

#set all countries names to lower case 
loan['COUNTRY_NAME'] = [c.lower() for c in loan['COUNTRY_NAME']]
ppp['Country Name'] = [c.lower() for c in ppp['Country Name']]

#returns list of countries that are not matches between the data sets
for c in set(loan['COUNTRY_NAME']):
    match_tup = (c, difflib.get_close_matches(c, ppp['Country Name'], cutoff=1))
    if match_tup[1] == []:
        print(match_tup[0])
        



palestine
egypt
kyrgyzstan
the democratic republic of the congo
lao people's democratic republic
yemen
congo
myanmar (burma)


In [8]:
#Only 8 out of 82 countries are not matches. Most of the remaining difference are due to political
#differences in naming, rather than failure to do a fuzzy match. 



ppp = ppp.rename(index={'west bank and gaza':'palestine', 'congo, rep':'congo', 'myanmar ':'myanmar (burma)',
'yemen, rep.':'yemen', 'lao pdr':'lao people''s democratic republic',
'congo, dem. rep.':'the democratic republic of the congo', 'egypt arab rep.': 'egypt'})

In [9]:
ppp.set_index('Country Name', inplace=True)
ppp.index

Index(['aruba', 'afghanistan', 'angola', 'albania', 'andorra', 'arab world',
       'united arab emirates', 'argentina', 'armenia', 'american samoa',
       ...
       'virgin islands (u.s.)', 'vietnam', 'vanuatu', 'world', 'samoa',
       'kosovo', 'yemen, rep.', 'south africa', 'zambia', 'zimbabwe'],
      dtype='object', name='Country Name', length=264)

In [10]:
    
holder = []
for c,y in zip(loan['COUNTRY_NAME'], loan['loan_year']):
    holder.append([c,y])




In [11]:
for i in range(len(holder)):
    if holder[i][0] not in ppp.index:
        holder[i].append('Country not found')
    elif np.isnan(ppp.loc[holder[i][0], str(holder[i][1])]):
        if (ppp.loc[holder[i][0], str(holder[i][1]-1)]):
            holder[i].append(ppp.loc[holder[i][0], str(holder[i][1]-1)])
        elif (ppp.loc[holder[i][0], str(holder[i][1]-2)]):
            holder[i].append(ppp.loc[holder[i][0], str(holder[i][1]-2)])
        else:    
            holder[i].append('No ppp found')
    else:
        holder[i].append(ppp.loc[holder[i][0],str(holder[i][1])])
        



In [12]:
holder

[['ecuador', 2015, 0.580266058444977],
 ['philippines', 2014, 19.0678634643555],
 ['peru', 2012, 1.57229578495026],
 ['philippines', 2015, 19.0387077331543],
 ['tajikistan', 2009, nan],
 ['samoa', 2013, 1.88389462667626],
 ['togo', 2009, 236.108529309084],
 ['philippines', 2014, 19.0678634643555],
 ['pakistan', 2015, 31.3673839569092],
 ['cambodia', 2018, 1489.02175863748],
 ['togo', 2009, 236.108529309084],
 ['philippines', 2017, 19.3925170898438],
 ['cambodia', 2017, 1488.79821777344],
 ['armenia', 2011, 165.629043579102],
 ['kyrgyzstan', 2015, 'Country not found'],
 ['tajikistan', 2017, 2.59482836723328],
 ['dominican republic', 2013, 22.0110950469971],
 ['rwanda', 2014, 265.946411132813],
 ['philippines', 2017, 19.3925170898438],
 ['tanzania', 2018, 762.485617352315],
 ['philippines', 2018, 19.9166965016587],
 ['kenya', 2012, 36.2744140625],
 ['philippines', 2020, 20.0473853235995],
 ['moldova', 2020, 6.61086692866803],
 ['kosovo', 2014, 0.364741352525478],
 ['nicaragua', 2013, 10.

In [None]:
#merging ppp data for years where Kiva loans were active (2006-2019)
loan = loan.merge(how='inner', left_on='COUNTRY_NAME', right_on ='Country Name',
                  right=ppp[['2006', '2007', '2008', '2009', '2010', '2011',
                            '2012', '2013', '2014', '2015', '2016','2017', '2018', '2019']])


In [None]:

def rel_pp(loan_year):
    if loan_year = 2020:
        return ppp.at[]


        