# Poverty Prediction Challenge

## Import Data
Load datasets and check dimensions

In [56]:
import os

import numpy as np
import pandas as pd

# Directory holding the raw challenge CSVs. Set the POVERTY_DATA_DIR environment
# variable to point at your own copy of the data; the fallback is the original
# machine-specific location, kept so existing runs behave exactly as before.
DATA_DIR = os.environ.get(
    "POVERTY_DATA_DIR",
    "C:\\Users\\md82\\OneDrive - Anglia Ruskin University\\Documents\\mj-datalab\\poverty-prediction-challenge\\data\\raw",
)

# os.path.join uses the separator of the current OS instead of a hard-coded "\\".
TRAIN_DATA = os.path.join(DATA_DIR, "train_hh_features.csv")
TEST_DATA = os.path.join(DATA_DIR, "test_hh_features.csv")
HH_DATA = os.path.join(DATA_DIR, "train_hh_gt.csv")
PR_DATA = os.path.join(DATA_DIR, "train_rates_gt.csv")

# NOTE(review): the previews below show an "Unnamed: 0" column in every frame,
# which suggests the CSVs carry a saved index. Consider index_col=0 once it is
# confirmed that nothing downstream relies on that column.
train_data = pd.read_csv(TRAIN_DATA)  # household features for the training surveys
test_data = pd.read_csv(TEST_DATA)    # household features for the test surveys
hh_data = pd.read_csv(HH_DATA)        # per-household ground truth (cons_ppp17)
pr_data = pd.read_csv(PR_DATA)        # per-survey ground truth (pct_hh_below_* columns)

In [4]:
# Report the size of the training feature table, then preview its first rows.
# .shape is (n_rows, n_columns), so both counts are read straight from it.
print("Shape:", train_data.shape)
print("Rows:", train_data.shape[0])
print("Columns:", train_data.shape[1])
train_data.head()

Shape: (104234, 88)
Rows: 104234
Columns: 88


Unnamed: 0,hhid,com,weight,strata,utl_exp_ppp17,male,hsize,num_children5,num_children10,num_children18,...,consumed4200,consumed4300,consumed4400,consumed4500,consumed4600,consumed4700,consumed4800,consumed4900,consumed5000,survey_id
0,100001,1,75,4,594.80627,Female,1,0,0,0,...,Yes,No,No,No,Yes,Yes,Yes,Yes,No,100000
1,100002,1,150,4,1676.2723,Female,2,0,0,0,...,Yes,No,No,No,No,Yes,Yes,No,No,100000
2,100003,1,375,4,506.93719,Male,5,0,0,2,...,Yes,Yes,No,Yes,Yes,Yes,Yes,No,Yes,100000
3,100004,1,375,4,824.61786,Male,5,0,0,1,...,No,Yes,No,No,No,Yes,Yes,No,No,100000
4,100005,1,525,4,351.47644,Male,7,1,0,0,...,Yes,No,No,Yes,No,Yes,Yes,Yes,No,100000


In [29]:
# Report the size of the test feature table, then preview its first rows.
# .shape is (n_rows, n_columns), so both counts are read straight from it.
print("Shape:", test_data.shape)
print("Rows:", test_data.shape[0])
print("Columns:", test_data.shape[1])
test_data.head()

Shape: (103023, 88)
Rows: 103023
Columns: 88


Unnamed: 0,hhid,com,weight,strata,utl_exp_ppp17,male,hsize,num_children5,num_children10,num_children18,...,consumed4200,consumed4300,consumed4400,consumed4500,consumed4600,consumed4700,consumed4800,consumed4900,consumed5000,survey_id
0,400001,1,320,4,567.80914,Female,4,0,1,0,...,No,No,No,No,No,Yes,Yes,Yes,No,400000
1,400002,1,480,4,561.70367,Female,6,0,1,1,...,Yes,No,No,No,No,Yes,Yes,Yes,No,400000
2,400003,1,320,4,183.16423,Male,4,0,0,0,...,Yes,No,Yes,No,Yes,Yes,No,Yes,No,400000
3,400004,1,320,4,696.02411,Male,4,0,0,1,...,Yes,Yes,No,No,No,Yes,Yes,No,No,400000
4,400005,1,320,4,286.95731,Male,4,0,0,1,...,Yes,No,No,No,No,No,Yes,No,No,400000


In [5]:
# Report the size of the household ground-truth table, then preview its first rows.
# .shape is (n_rows, n_columns), so both counts are read straight from it.
print("Shape:", hh_data.shape)
print("Rows:", hh_data.shape[0])
print("Columns:", hh_data.shape[1])
hh_data.head()

Shape: (104234, 3)
Rows: 104234
Columns: 3


Unnamed: 0,survey_id,hhid,cons_ppp17
0,100000,100001,25.258402
1,100000,100002,16.996706
2,100000,100003,13.671848
3,100000,100004,7.189475
4,100000,100005,12.308855


In [6]:
# Report the size of the poverty-rate ground-truth table, then preview its first rows.
# .shape is (n_rows, n_columns), so both counts are read straight from it.
print("Shape:", pr_data.shape)
print("Rows:", pr_data.shape[0])
print("Columns:", pr_data.shape[1])
pr_data.head()

Shape: (3, 20)
Rows: 3
Columns: 20


Unnamed: 0,survey_id,pct_hh_below_3.17,pct_hh_below_3.94,pct_hh_below_4.60,pct_hh_below_5.26,pct_hh_below_5.88,pct_hh_below_6.47,pct_hh_below_7.06,pct_hh_below_7.70,pct_hh_below_8.40,pct_hh_below_9.13,pct_hh_below_9.87,pct_hh_below_10.70,pct_hh_below_11.62,pct_hh_below_12.69,pct_hh_below_14.03,pct_hh_below_15.64,pct_hh_below_17.76,pct_hh_below_20.99,pct_hh_below_27.37
0,100000,0.067364,0.118927,0.169905,0.221865,0.271564,0.319585,0.366329,0.419816,0.471454,0.523798,0.574413,0.623091,0.671263,0.721329,0.773303,0.81977,0.865121,0.909075,0.954239
1,200000,0.059326,0.11156,0.159023,0.211754,0.2631,0.311758,0.356914,0.407631,0.463443,0.512931,0.559361,0.609337,0.659291,0.708043,0.760932,0.809045,0.86035,0.906385,0.952805
2,300000,0.049803,0.100381,0.149502,0.200144,0.250192,0.300211,0.349596,0.39993,0.449845,0.49993,0.550082,0.599926,0.650088,0.699617,0.750341,0.800111,0.850081,0.899974,0.949988


## Check Poverty Rate Calculation

In [65]:
# Pull the consumption values and sampling weights for survey 300000.
SURVEY_3_ID = 300000

hh_rows_s3 = hh_data.loc[hh_data['survey_id'] == SURVEY_3_ID]
feature_rows_s3 = train_data.loc[train_data['survey_id'] == SURVEY_3_ID]

# Consumption comes from hh_data and weights from train_data; later cells pair
# them up purely by position. Fail loudly if the two selections do not list the
# same households in the same order, rather than silently weighting the wrong rows.
if not np.array_equal(hh_rows_s3['hhid'].to_numpy(), feature_rows_s3['hhid'].to_numpy()):
    raise ValueError(
        "hh_data and train_data rows for survey 300000 are not aligned on hhid"
    )

hhs3 = hh_rows_s3['cons_ppp17']
w3 = feature_rows_s3['weight']


In [66]:
# Work on plain NumPy arrays so the boolean masks below are purely positional.
values = hhs3.to_numpy()
weights = w3.to_numpy()

# Consumption thresholds; these mirror the pct_hh_below_<threshold> columns of pr_data.
thresholds = np.array(
    [3.17, 3.94, 4.60, 5.26, 5.88, 6.47, 7.06, 7.70, 8.40, 9.13, 9.87,
     10.70, 11.62, 12.69, 14.03, 15.64, 17.76, 20.99, 27.37],
    dtype=float,
)

# Weighted share of households strictly below each threshold:
# (sum of weights where value < t) / (sum of all weights).
# The total is loop-invariant, so compute it once.
total_weight = weights.sum()
percentages = [weights[values < t].sum() / total_weight for t in thresholds]

In [67]:
# Display the weighted shares computed for survey 300000, one per threshold;
# compare them with the survey_id 300000 row of pr_data.
percentages

[0.04980335690713668,
 0.1003810795861071,
 0.14950154335080076,
 0.20014444988668156,
 0.25019209252172053,
 0.30021080213347995,
 0.34959562948388834,
 0.3999302348248445,
 0.4498449577365705,
 0.49992952764149123,
 0.550081679677303,
 0.5999260224718272,
 0.650088198062994,
 0.6996173523116748,
 0.7503409392440333,
 0.8001114582458962,
 0.8500808187584381,
 0.8999736497333151,
 0.9499877780268293]