In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Data Cleaning Exercise

In this exercise, you'll practice the art of data cleaning, which is in many ways one of the most important parts of working with real data. Here, you're tasked with replicating a balance table from a development economics paper. There is no single correct way to do this, but some approaches may be more efficient or easier to implement than others.

You can find the table to replicate under Q2 of problem set 2.

In [2]:
# Read the Stata file into a DataFrame. The path is relative, so the .dta file
# must be in the notebook's working directory.
df = pd.read_stata('dataset_savings.dta')

In [5]:
# Display the column labels of the loaded frame to see which variables are available.
df.columns

Index(['id', 'wave1', 'wave2', 'wave3', 'treatment',
       'not_traced_account_opening', 'inlogs', 'filled_log', 'bg_gender',
       'bg_boda', 'bg_malevendor', 'bg_femalevendor', 'bg_married',
       'bg_num_children', 'bg_age', 'bg_kis_read', 'bg_kis_write',
       'bg_rosca_contrib_lyr', 'bg_educ', 'bg_rosca', 'bg_animalsvalue',
       'bg_durvalue_hh', 'bg_totalinc_lastweek', 'bg_loan_bank',
       'bg_loan_friend', 'bg_healthstatus', 'per_hard_save',
       'per_invest_choice2', 'per_somewhat_patient', 'per_time_consistent',
       'per_hyperbolic', 'per_pat_now_impat_later', 'per_maximpat',
       'per_fwd_digit_score2', 'per_ravens_matrix', 'total_dep_savings',
       'num_trans_savings', 'first6_num_trans_savings', 'num_dep_savings',
       'first6_dep_savings', 'num_wd_savings', 'first6_wd_savings',
       'mean_dep_b', 'median_dep_b', 'mean_wd_b', 'median_wd_b',
       'total_dep_shares', 'first6_dep_shares', 'total_dep_loan',
       'first6_dep_loan', 'total_wd_loan', 'firs

In [None]:
# NOTE(review): this cell appears to be R (dplyr-style `mutate` with `ifelse`,
# TRUE/FALSE and NA), presumably kept as a reference for the derived variables;
# it is not Python and has no execution count — confirm it is meant as reference only.
#
# Variables defined below:
#   active                   TRUE when first6_num_trans_savings > 1, else FALSE
#   treatment_bg_boda etc.   products of two columns (interaction terms)
#   bg_*_wave2 / bg_*_wave3  logical AND of an occupation flag with a wave flag
#   literate_swahili         1 if bg_kis_read and bg_kis_write are both 1,
#                            0 if either is 0, NA otherwise (built in two steps)
# NOTE(review): `treament_bg_malevendor` looks like a typo for
# `treatment_bg_malevendor` — confirm before porting the name to Python.
mutate(active = ifelse(first6_num_trans_savings > 1, TRUE, FALSE),
         treatment_bg_boda = treatment*bg_boda,
         active_bg_boda = active*bg_boda,
         bg_boda_wave2 = (bg_boda & wave2),
         bg_malevendor_wave2 = (bg_malevendor & wave2),
         bg_malevendor_wave3 = (bg_malevendor & wave3),
         treament_bg_malevendor = treatment*bg_malevendor,
         active_bg_malevendor = active*bg_malevendor,
         literate_swahili = ifelse((bg_kis_read == 1 & bg_kis_write ==1),
                                   1,
                                   NA),
         literate_swahili = ifelse((bg_kis_read ==0 | bg_kis_write == 0), 
                                   0,
                                   literate_swahili))

In [17]:
# preliminary code to get you started/add variables
# Work on a copy so the raw frame `df` is left untouched and this cell can be
# re-run without compounding changes.
savings_df = df.copy()

# active: 1 when first6_num_trans_savings > 1, otherwise 0.
# A missing count compares as False, so those rows also get 0.
savings_df['active'] = np.where(savings_df['first6_num_trans_savings'] > 1, 1, 0)

# literate_swahili: 1 when bg_kis_read and bg_kis_write are both 1,
# 0 when either of them is 0, and missing otherwise.
kis_read = savings_df['bg_kis_read'].to_numpy()
kis_write = savings_df['bg_kis_write'].to_numpy()
both_one = (kis_read == 1) & (kis_write == 1)
either_zero = (kis_read == 0) | (kis_write == 0)

# Build the column as nullable Int64 instead of passing pd.NA through np.where:
# mixing integers with pd.NA in a NumPy array yields an object-dtype column,
# which numeric aggregations (e.g. group means for the balance table) treat as
# non-numeric. Int64 keeps the same 1 / 0 / <NA> values in a numeric dtype.
literate_swahili = pd.Series(pd.NA, index=savings_df.index, dtype='Int64')
literate_swahili.loc[both_one] = 1
# Applied second so a 0 takes precedence, matching the original two-step logic.
literate_swahili.loc[either_zero] = 0
savings_df['literate_swahili'] = literate_swahili

array([False,  True, False,  True, False,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True, False,  True,  True, False,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True, False,  True,
        True,  True,  True,  True,  True,  True,  True, False, False,
        True,  True,  True,  True,  True, False,  True,  True,  True,
        True,  True,  True,  True, False,  True,  True,  True,  True,
        True,  True,  True,  True, False,  True,  True,  True,  True,
       False, False,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True, False,  True,  True,  True,
        True,  True, False,  True,  True,  True,  True,  True,  True,
        True,  True, False, False, False,  True, False, False,  True,
        True, False,

0    0
1    0
2    0
3    0
4    0
Name: active, dtype: int32