In [1]:
# Dependencies and set up
import os
from getpass import getpass
from urllib.parse import quote_plus

import pandas as pd
from sqlalchemy import create_engine, inspect

### Create a DataFrame for Kiva loans and clean it

In [2]:
# Read the raw Kiva loans CSV into a DataFrame
kiva_loans_csv = "Resources/kiva_loans.csv"
kiva_loans_df = pd.read_csv(filepath_or_buffer=kiva_loans_csv)

In [3]:
# Remove rows that exactly repeat an earlier row, keeping the first occurrence
kiva_loans_df = kiva_loans_df.drop_duplicates(keep="first")

In [4]:
# Rename "id" to "loan_id" and "sector" to "loan_sector" for clarity.
# index=str additionally converts the row labels to strings.
loan_column_names = {"id": "loan_id", "sector": "loan_sector"}
kiva_loans_df = kiva_loans_df.rename(index=str, columns=loan_column_names)

In [5]:
# Parse each timestamp column from its string form into datetime values
for date_column in ("posted_time", "disbursed_time", "funded_time", "date"):
    kiva_loans_df[date_column] = pd.to_datetime(kiva_loans_df[date_column])

In [6]:
# Derive the number of female, male, and total borrowers from "borrower_genders".
# The substring "male" also matches inside "female", so the male count is the
# number of "male" matches minus the number of "female" matches.
gender_text = kiva_loans_df["borrower_genders"].str
kiva_loans_df["num_fem_borrowers"] = gender_text.count("female")
kiva_loans_df["num_m_borrowers"] = (
    gender_text.count("male") - kiva_loans_df["num_fem_borrowers"]
)
kiva_loans_df["total_borrowers"] = (
    kiva_loans_df["num_fem_borrowers"] + kiva_loans_df["num_m_borrowers"]
)

### Create regions DataFrame, clean it, and find a way to link it to the Kiva loans DataFrame

In [7]:
# Read the MPI region locations CSV into a DataFrame
regions_csv = "Resources/kiva_mpi_region_locations.csv"
regions_df = pd.read_csv(filepath_or_buffer=regions_csv)

In [8]:
# Remove rows that exactly repeat an earlier row, keeping the first occurrence
regions_df = regions_df.drop_duplicates(keep="first")

In [9]:
# Create a location ID: one integer code per distinct "country_region" pair.
# Rows without a country or a region are dropped first. Their concatenated key
# would be missing, and cat.codes encodes every missing key as -1, so all such
# rows would otherwise share one meaningless location_id.
regions_df = (
    regions_df
    .dropna(subset=["country", "region"])
    .sort_values(["country", "region"], ascending=True)
)
location_keys = regions_df["country"] + "_" + regions_df["region"]
regions_df = regions_df.assign(location_id=location_keys.astype("category").cat.codes)

In [10]:
# Left-join the regions onto the loans via the shared country/region columns,
# so every loan row is kept whether or not a region matches
link_columns = ["country", "region"]
loans_regions_df = kiva_loans_df.merge(
    regions_df,
    how="left",
    left_on=link_columns,
    right_on=link_columns,
)

In [11]:
# Build the "cleaned" Kiva loans DataFrame: keep only the columns we need,
# then drop rows that are exact duplicates across those columns
loan_columns = [
    "loan_id", "funded_amount", "loan_amount", "activity", "loan_sector",
    "use", "currency", "partner_id", "posted_time", "disbursed_time",
    "funded_time", "term_in_months", "lender_count", "repayment_interval",
    "date", "num_fem_borrowers", "num_m_borrowers", "total_borrowers",
    "location_id",
]
cleaned_kiva_loans_df = loans_regions_df[loan_columns].drop_duplicates(keep="first")

In [12]:
# Give the region columns lowercase, space-free names.
# index=str additionally converts the row labels to strings.
region_column_names = {
    "ISO": "iso",
    "LocationName": "location_name",
    "MPI": "mpi",
    "sector": "loan_sector",
}
regions_df = regions_df.rename(index=str, columns=region_column_names)

In [13]:
# Build "cleaned_regions_df" from the relevant "regions_df" columns and keep
# only the first row for each location_id
region_columns = ["location_id", "iso", "country", "region", "world_region", "mpi", "lat", "lon"]
cleaned_regions_df = regions_df[region_columns].drop_duplicates(
    subset=["location_id"], keep="first"
)

### Create loan theme DataFrame, clean it, and find a way to link it to the Kiva loans DataFrame

In [14]:
# Read the loan theme IDs CSV into a DataFrame
theme_csv = "Resources/loan_theme_ids.csv"
theme_df = pd.read_csv(filepath_or_buffer=theme_csv)

In [15]:
# Left-join the themes onto the cleaned loans (loan_id on the loans side,
# id on the themes side) so every loan row is kept
loans_themes_df = cleaned_kiva_loans_df.merge(
    theme_df,
    how="left",
    left_on="loan_id",
    right_on="id",
)

In [16]:
# Redefine the cleaned Kiva loans DataFrame so it carries the theme ID that links it to the themes.
# "Loan Theme ID" becomes "theme_id_old", and "use" becomes "purpose" so the column name does not
# clash with SQL's vocabulary. index=str additionally converts the row labels to strings.
loan_columns_with_old_theme = [
    "loan_id", "funded_amount", "loan_amount", "activity", "loan_sector",
    "use", "currency", "partner_id", "posted_time", "disbursed_time",
    "funded_time", "term_in_months", "lender_count", "repayment_interval",
    "date", "num_fem_borrowers", "num_m_borrowers", "total_borrowers",
    "location_id", "Loan Theme ID",
]
cleaned_kiva_loans_df = loans_themes_df[loan_columns_with_old_theme].rename(
    index=str,
    columns={"Loan Theme ID": "theme_id_old", "use": "purpose"},
)

In [17]:
# Give the theme columns space-free names.
# index=str additionally converts the row labels to strings.
theme_column_names = {
    "id": "loan_id",
    "Loan Theme ID": "theme_id_old",
    "Loan Theme Type": "theme_type",
    "Partner ID": "partner_id",
}
theme_df = theme_df.rename(index=str, columns=theme_column_names)

In [18]:
# Build "cleaned_theme_df": the two relevant columns, sorted by the old theme ID,
# keeping only the first row for each old theme ID
cleaned_theme_df = (
    theme_df[["theme_id_old", "theme_type"]]
    .sort_values("theme_id_old", ascending=True)
    .drop_duplicates(subset=["theme_id_old"], keep="first")
)

In [19]:
# The old theme_id relies on case sensitivity, so recreate theme_id as an
# integer code per distinct old ID.
# NOTE(review): cat.codes encodes a missing theme_id_old as -1 - confirm that is intended.
theme_codes = cleaned_theme_df["theme_id_old"].astype("category").cat.codes
cleaned_theme_df = cleaned_theme_df.assign(theme_id=theme_codes)

In [20]:
# Re-merge on the old theme ID so each loan picks up the re-created theme_id
loans_themes_df = cleaned_kiva_loans_df.merge(
    cleaned_theme_df,
    how="left",
    left_on="theme_id_old",
    right_on="theme_id_old",
)

In [21]:
# Redefine the cleaned Kiva loans DataFrame with the new theme_id as its link to the themes
loan_columns_with_theme = [
    "loan_id", "funded_amount", "loan_amount", "activity", "loan_sector",
    "purpose", "currency", "partner_id", "posted_time", "disbursed_time",
    "funded_time", "term_in_months", "lender_count", "repayment_interval",
    "date", "num_fem_borrowers", "num_m_borrowers", "total_borrowers",
    "location_id", "theme_id",
]
cleaned_kiva_loans_df = loans_themes_df[loan_columns_with_theme].copy()

In [22]:
# Keep only the re-established theme_id and the theme type in the cleaned theme DataFrame
theme_output_columns = ["theme_id", "theme_type"]
cleaned_theme_df = cleaned_theme_df.loc[:, theme_output_columns].copy()

### Create field partner DataFrame, clean it, and find a way to link it to the Kiva loans DataFrame

In [23]:
# Read the loan-themes-by-region CSV, the source of the field partner data, into a DataFrame
field_partner_csv = "Resources/loan_themes_by_region.csv"
field_partner_df = pd.read_csv(filepath_or_buffer=field_partner_csv)

In [24]:
# Rename the columns that have spaces or capital letters, keep only the three
# partner columns, and drop rows that are exact duplicates.
# index=str additionally converts the row labels to strings.
partner_column_names = {
    "Partner ID": "partner_id",
    "Field Partner Name": "field_partner_name",
    "Loan Theme ID": "theme_id",
    "Loan Theme Type": "theme_type",
    "ISO": "iso",
    "LocationName": "location_name",
    "sector": "partner_sector",
}
partner_columns = ["partner_id", "field_partner_name", "partner_sector"]
field_partner_df = (
    field_partner_df
    .rename(index=str, columns=partner_column_names)
    .loc[:, partner_columns]
    .drop_duplicates(keep="first")
)

In [25]:
# Left-join the partners onto the loans by partner_id, so that only partners
# attached to a loan appear in the result
loan_partner_df = cleaned_kiva_loans_df.merge(
    field_partner_df,
    how="left",
    left_on="partner_id",
    right_on="partner_id",
)

In [26]:
# Build the cleaned partner DataFrame: the relevant columns only, the first
# row per partner_id, and no rows whose partner_id is missing
cleaned_field_partner_df = (
    loan_partner_df[["partner_id", "field_partner_name", "partner_sector"]]
    .drop_duplicates(subset=["partner_id"], keep="first")
    .dropna(subset=["partner_id"], how="any")
)

### Input data into SQL

In [27]:
# Establish the MySQL database connection and create the engine.
# Credentials come from environment variables so that no password is stored in
# the notebook; if KIVA_DB_PASSWORD is unset, the user is prompted for it.
db_user = os.environ.get("KIVA_DB_USER", "root")
db_password = os.environ.get("KIVA_DB_PASSWORD") or getpass("MySQL password: ")
db_host = os.environ.get("KIVA_DB_HOST", "127.0.0.1")
db_name = os.environ.get("KIVA_DB_NAME", "kiva_db")

# quote_plus escapes characters such as "@" or "/" that would otherwise break the URL
rds_connection_string = f"{quote_plus(db_user)}:{quote_plus(db_password)}@{db_host}/{db_name}"

# The legacy `encoding` argument is omitted: it is deprecated in SQLAlchemy 1.4,
# removed in 2.0, and already defaulted to UTF-8. The connection charset is
# still set through the URL's charset parameter.
engine = create_engine(f"mysql+pymysql://{rds_connection_string}?charset=utf8")

In [28]:
# Check that tables are created in MySQL database.
# The inspector API is used because Engine.table_names() is deprecated in
# SQLAlchemy 1.4 and removed in 2.0.
inspect(engine).get_table_names()

['airbnb_hosts',
 'field_partners',
 'kiva_loans',
 'listings',
 'loan_regions',
 'loan_themes',
 'property_availability',
 'property_reviews']

In [29]:
# Append the field partner rows to the "field_partners" table in MySQL.
# NOTE(review): if_exists="append" adds rows on every run - re-running this cell will insert them again.
cleaned_field_partner_df.to_sql(
    name="field_partners",
    con=engine,
    if_exists="append",
    index=False,
    chunksize=2000,
)

In [30]:
# Append the theme rows to the "loan_themes" table in MySQL.
# NOTE(review): if_exists="append" adds rows on every run - re-running this cell will insert them again.
cleaned_theme_df.to_sql(
    name="loan_themes",
    con=engine,
    if_exists="append",
    index=False,
)

In [31]:
# Append the region rows to the "loan_regions" table in MySQL.
# NOTE(review): if_exists="append" adds rows on every run - re-running this cell will insert them again.
cleaned_regions_df.to_sql(
    name="loan_regions",
    con=engine,
    if_exists="append",
    index=False,
    chunksize=2000,
)

In [32]:
# Append the Kiva loan rows to the "kiva_loans" table in MySQL.
# NOTE(review): if_exists="append" adds rows on every run - re-running this cell will insert them again.
cleaned_kiva_loans_df.to_sql(
    name="kiva_loans",
    con=engine,
    if_exists="append",
    index=False,
    chunksize=2000,
)