In [1]:
from collections import Counter
import random
import pandas as pd
import numpy as np
import itertools

In [2]:

def _drive_download_url(file_id):
    """Return the direct-download URL for a Google Drive file id."""
    return f"https://drive.google.com/uc?id={file_id}&export=download"


def get_data(base_path='BaselineData', connect_type="filesystem"):
    """Load the survey CSVs and their derived join tables into ``data_corpus``.

    The function writes into the module-level ``data_corpus`` dict, which must
    exist before the call, and returns nothing.

    Parameters
    ----------
    base_path : str
        Directory holding the ``Form 1`` and ``Form 2`` sub-folders. Only used
        when ``connect_type`` is ``"filesystem"``.
    connect_type : str
        ``"filesystem"`` reads the CSVs from ``base_path``; ``"google-drive"``
        downloads them from fixed Google Drive file ids.

    Raises
    ------
    ValueError
        If ``connect_type`` is neither ``"filesystem"`` nor ``"google-drive"``.

    Notes
    -----
    Both branches store the seven raw tables and ``business_result``. The
    derived tables differ by branch:

    * ``"filesystem"``: ``hoh_member``, ``hoh_result``, ``individual_member``,
      ``individual_member_result``, ``peur``, ``pee``, ``peu``.
    * ``"google-drive"``: ``peur_result``, ``pee_result``, ``peu_result`` plus
      the ``gid_*`` / ``url_*`` bookkeeping entries.
    """
    # Fail loudly instead of returning with an empty corpus.
    if connect_type not in ("filesystem", "google-drive"):
        raise ValueError(
            f"Unknown connect_type {connect_type!r}; expected 'filesystem' or 'google-drive'"
        )

    if connect_type == "filesystem":
        print('Loading Data from File System...')
        form1_dir = base_path + '/Form 1/'
        form2_dir = base_path + '/Form 2/'

        # Form 2: business tables.
        data_corpus['businessdetails'] = pd.read_csv(form2_dir + 'businessdetails.csv')
        data_corpus['businessidentity'] = pd.read_csv(form2_dir + 'businessidentity.csv')
        data_corpus['manpowerengaged'] = pd.read_csv(form2_dir + 'manpowerengaged.csv')
        data_corpus['business_result'] = data_corpus['businessdetails'].merge(data_corpus['businessidentity'], how='inner')

        # Form 1: household tables.
        data_corpus['household'] = pd.read_csv(form1_dir + 'household.csv', low_memory=False)
        data_corpus['household_member'] = pd.read_csv(form1_dir + 'household_member.csv')
        data_corpus['self_employment_seekers'] = pd.read_csv(form1_dir + 'self_employment_seekers.csv')
        data_corpus['unregisteredactivities'] = pd.read_csv(form1_dir + 'unregisteredactivities.csv')

        # Heads of household: members whose relation to the head is 'Self'.
        # The first 13 characters of 'uniqueidofmember' are matched against
        # 'uniqueidofhousehold' to attach the household row.
        household_member = data_corpus['household_member']
        data_corpus['hoh_member'] = household_member[household_member['relationwithhoh'] == 'Self'].copy()
        data_corpus['hoh_member']['trimmed_uniqueid'] = data_corpus['hoh_member']['uniqueidofmember'].str[:13]
        data_corpus['hoh_result'] = data_corpus['hoh_member'].merge(data_corpus['household'], how='inner', left_on='trimmed_uniqueid', right_on='uniqueidofhousehold')

        # The datasheets finally used: every member joined to its household.
        data_corpus['individual_member'] = household_member.copy()
        data_corpus['individual_member']['trimmed_uniqueid'] = data_corpus['individual_member']['uniqueidofmember'].str[:13]
        data_corpus['individual_member_result'] = data_corpus['individual_member'].merge(data_corpus['household'], how='inner', left_on='trimmed_uniqueid', right_on='uniqueidofhousehold')

        members = data_corpus['individual_member_result']

        unregistered = members.merge(data_corpus['unregisteredactivities'], how='inner', left_on='uniqueid_x', right_on='memberid')
        data_corpus['peur'] = unregistered[unregistered['pecategory'] == 'PEUR']

        # PEE and PEU come from the same join, so it is computed once and
        # split by category.
        seekers = members.merge(data_corpus['self_employment_seekers'], how='inner', left_on='uniqueid_x', right_on='memberid')
        data_corpus['pee'] = seekers[seekers['pecategory'] == 'PEE']
        data_corpus['peu'] = seekers[seekers['pecategory'] == 'PEU']

        print(data_corpus['peur'].shape[0], data_corpus['pee'].shape[0], data_corpus['peu'].shape[0])

        print('Loading Data: Successful!')
    else:
        print('Loading Data from Google Drive!')

        # (data_corpus table key, gid_/url_ key suffix, Drive file id).
        # Each file can be viewed at
        # https://drive.google.com/file/d/<file id>/view?usp=sharing
        drive_files = (
            ('businessdetails', 'businessdetails', '1ilaexFPOYPHXYNdXRppQh2VEVI9qZv4N'),
            ('businessidentity', 'businessidentity', '1Fvrds513yknjDtqizSfk60JB_W1G1yjd'),
            ('manpowerengaged', 'manpowerengaged', '1rBzz6kNzpBQwvWAj93NXm1h4kaUgXPOd'),
            ('household', 'household', '1lD5OvSg9rAvPJR-kas_vn-wn1OatDr_L'),
            ('household_member', 'householdmember', '1zA5LjofLFsTsaKJbBKsVhyY8bYs3Pou9'),
            ('self_employment_seekers', 'self_employment_seekers', '1AB8hDJYZy7wQdRX_AKZdZIgiePtkd0d_'),
            ('unregisteredactivities', 'unregisteredactivities', '1Art08hVQ78krap0AWt32RKt37P6tymmX'),
        )
        for table_key, suffix, file_id in drive_files:
            data_corpus['gid_' + suffix] = file_id
            data_corpus['url_' + suffix] = _drive_download_url(file_id)
            data_corpus[table_key] = pd.read_csv(data_corpus['url_' + suffix])

        data_corpus['business_result'] = data_corpus['businessdetails'].merge(data_corpus['businessidentity'], how='inner')

        members = data_corpus['household_member']

        unregistered = members.merge(data_corpus['unregisteredactivities'], how='inner', left_on='uniqueid', right_on='memberid')
        # PEE and PEU share one join; see the filesystem branch.
        seekers = members.merge(data_corpus['self_employment_seekers'], how='inner', left_on='uniqueid', right_on='memberid')

        data_corpus['pee_result'] = seekers[seekers['pecategory'] == 'PEE']
        data_corpus['peu_result'] = seekers[seekers['pecategory'] == 'PEU']
        data_corpus['peur_result'] = unregistered[unregistered['pecategory'] == 'PEUR']
        print('Loading Data: Successful!')




# Shared container that get_data() fills with the raw and derived tables.
data_corpus = {}

# Switch between the small test extract and the full baseline dataset.
load_subset = True
base_path = 'dataset/BaselineDataTest' if load_subset == True else 'dataset/BaselineData'

get_data(base_path=base_path)

Loading Data from File System...
4053 9797 14492
Loading Data: Successful!


In [None]:
# Scratch join of household heads to their household rows; get_data() already
# stores this same merge as data_corpus['hoh_result']. The result is bound to
# its own name because assigning it to data_corpus['peur'] would replace the
# PEUR table that the filtering cells below read.
hoh_household = data_corpus['hoh_member'].merge(data_corpus['household'], how='inner', left_on='trimmed_uniqueid', right_on='uniqueidofhousehold')





In [41]:
# Work on the first 100 rows of the member/household join as a small sample.
filtered_df = data_corpus['individual_member_result'].iloc[:100]

In [54]:
# Restrict each table to the rows tied to the sample in filtered_df.
# Four tables are matched on member id ('uniqueid_x'); the individual-member
# table is matched on household id, so it keeps every member of the sampled
# households.
df_hoh_filtered = data_corpus['hoh_result'].loc[
    lambda t: t['uniqueid_x'].isin(filtered_df['uniqueid_x'])
]
df_ilp_filtered = data_corpus['individual_member_result'].loc[
    lambda t: t['uniqueidofhousehold'].isin(filtered_df['uniqueidofhousehold'])
]
df_peur_filtered = data_corpus['peur'].loc[
    lambda t: t['uniqueid_x'].isin(filtered_df['uniqueid_x'])
]
df_pee_filtered = data_corpus['pee'].loc[
    lambda t: t['uniqueid_x'].isin(filtered_df['uniqueid_x'])
]
df_peu_filtered = data_corpus['peu'].loc[
    lambda t: t['uniqueid_x'].isin(filtered_df['uniqueid_x'])
]

In [61]:
# Number of PEU rows that fall inside the sample.
len(df_peu_filtered)

5

In [39]:
# Print every head-of-household column that also exists in filtered_df,
# in head-of-household column order.
for c in data_corpus['hoh_result'].columns:
    if c not in filtered_df.columns:
        continue
    print(c)

uniqueid_x
age
annualincome
changepresentwork
educationlevel
employmentstatus
gender
lastname
middlename
name
pecategory
pursuinghighereducation
registeredonudyamportal
relationwithhoh
specialstatus
technicaleducationskill
uniqueidofmember
householdid
trimmed_uniqueid
uniqueid_y
addressoftheenterprise
agriculturelandpossession
annualhouseholdincome
applicationstatus
blockid
cdblockulbmc
district
districtid
geom
headofhousehold
householdsize
householdtype
imagepath
lastnamehead
latitude
locationoftheenterprise
longitude
middlenamehead
observationbyfieldinvestigator
panchayatid
panchayatward
rationcardnumber
residentialtype
responseoffamilyenterprise
schedulenumber
sectoroffamilybusiness
serialnumberofhousehold
socialgroup
surveydonedate
surveystatus
surveyorid
typeoffamilyenterprise
typeofrationcardholder
uniqueidofhousehold
villageufsblock
appversion
surveystatusverifiedby
surveystatusverifieddate
updateddate
headofhouseholdmobilenumber
remarks
recorduploadedmode


In [40]:
# Household id carried by each head-of-household row.
data_corpus['hoh_result'].loc[:, 'uniqueidofhousehold']

0         RKIS090080007
1         RBAR260210002
2         RPUL020010004
3         RJAM010040003
4         RPUL070050001
              ...      
133120    RRAM090040081
133121    RANA160030034
133122    RUDH170470068
133123    RKUP080040053
133124    RJAM070030008
Name: uniqueidofhousehold, Length: 133125, dtype: object

In [20]:
# Member id of each row in the PEUR table.
data_corpus['peur'].loc[:, 'uniqueid_x']

0       349355
1       349558
2          629
3       349729
4          658
         ...  
4089    643810
4090    643828
4091    644231
4092    644434
4093    644891
Name: uniqueid_x, Length: 4053, dtype: int64

In [16]:
# Show the head-of-household rows whose 'uniqueid_x' appears in temp['uniqueid_x'].
# NOTE(review): `temp` is not assigned in any cell visible here — confirm where
# it is defined; if it only exists as leftover kernel state, this cell will
# raise NameError on a fresh kernel.
data_corpus['hoh_result'][data_corpus['hoh_result']['uniqueid_x'].isin(temp['uniqueid_x'])]




Unnamed: 0,uniqueid_x,age,annualincome,changepresentwork,educationlevel,employmentstatus,gender,lastname,middlename,name,...,typeofrationcardholder,uniqueidofhousehold,villageufsblock,appversion,surveystatusverifiedby,surveystatusverifieddate,updateddate,headofhouseholdmobilenumber,remarks,recorduploadedmode
38,115,32Y 0M,Below 2 lakh,Yes,Secondary,Working as casual wage labour,Male,KUMAR,,RAVI,...,PHH- Priority House hold,RKAT120180002,SAHAR,1.0.0,,,,7298355000.0,,
175,349753,37Y 5M,Below 2 lakh,Yes,Secondary,Working as casual wage labour,Male,SHAH,AHMAD,IRSHAD,...,PHH- Priority House hold,RGAN010030019,BABAWAYIL,1.0.0,,,,8082120000.0,,
212,350103,33Y 0M,Below 2 lakh,Yes,Graduate,Working as helper in Household enterprise (unp...,Male,AHMAD,,MEHMOOD,...,PHH- Priority House hold,RKUP220030027,CHAMKOTE,1.0.0,,,,,,
491,350921,30Y 2M,Below 2 lakh,Yes,Secondary,Working as helper in Household enterprise (unp...,Male,BHAT,AHMAD,IMTIYAZ,...,PHH- Priority House hold,RKUP210090059,MANZGAM,1.0.0,,,,9797983000.0,,
533,351362,35Y 0M,No Annual Income,Yes,Postgraduate and above,Working as helper in Household enterprise (unp...,Male,LONE,AHMAD,ZAHOOR,...,AAY – Antyodhya Anna Yojana,RKUL030170005,WARIPORA,1.0.0,,,,9103283000.0,,
572,2530,39Y 0M,Below 2 lakh,Yes,Higher secondary,Working as regular salaried/ wage employee in ...,Male,C,B,A,...,NPHH- Non Priority Household,UANA050090001,,1.0.0,,,,9622739000.0,,
618,351504,32Y 0M,No Annual Income,Yes,Secondary,Working as helper in Household enterprise (unp...,Male,PAYER,RAJA,ALTAF,...,PHH- Priority House hold,RKUP160020016,BOWAN,1.0.0,,,,,,
666,351699,37Y 0M,Below 2 lakh,Yes,Secondary,Working as casual wage labour,Male,KUMAR,,KULDEEP,...,No ration card,RSAM050030008,MADOON,1.0.0,,,,6006783000.0,,
805,352302,29Y 0M,Below 2 lakh,Yes,Higher secondary,Working as helper in Household enterprise (unp...,Male,NAWAZ,,SHAH,...,PHH- Priority House hold,RREA070040035,BATHOIE,1.0.0,,,,8082641000.0,,
876,424794,35Y 3M,Below 2 lakh,Yes,Higher secondary,Working as casual wage labour,Male,BHAT,MUSTAFA,GH,...,PHH- Priority House hold,RKUP010080009,DRUGMULLA,1.0.0,,,,9541168000.0,,


In [66]:
# Rebind filtered_df to the full head-of-household table. This is a plain
# assignment, so both names refer to the same DataFrame object (no copy), and
# it replaces the 100-row sample an earlier cell stored under this name.
filtered_df = data_corpus['hoh_result']

In [67]:
# Display the frame currently bound to filtered_df.
filtered_df

Unnamed: 0,uniqueid_x,age,annualincome,changepresentwork,educationlevel,employmentstatus,gender,lastname,middlename,name,...,typeofrationcardholder,uniqueidofhousehold,villageufsblock,appversion,surveystatusverifiedby,surveystatusverifieddate,updateddate,headofhouseholdmobilenumber,remarks,recorduploadedmode
0,349185,45Y 0M,Above 2- 4 lakh,No,Secondary,Working as regular salaried/ wage employee / c...,Female,MALA,,ROOP,...,NPHH- Non Priority Household,RKIS090080007,MASSU WARD C,1.0.0,,,,8.082105e+09,,
1,8,50Y 0M,Below 2 lakh,No,Higher secondary,Working in Household enterprise (self-employed...,Male,DAR,AHMAD,SAJAD,...,PHH- Priority House hold,RBAR260210002,WARPORA,1.0.0,,,,7.006241e+09,,
2,15096,65Y 0M,No Annual Income,No,Illiterate,Working as helper in Household enterprise (unp...,Male,LONE,MOHIDIN,GH,...,No ration card,RPUL020010004,BARSOO,1.0.0,,,,9.906839e+09,,
3,15173,49Y 5M,No Annual Income,No,Illiterate,Working as helper in Household enterprise (unp...,Female,DEVI,,CHANCHAL,...,AAY – Antyodhya Anna Yojana,RJAM010040003,BOMAL,1.0.0,,,,,,
4,26,46Y 6M,Above 6-8 lakh,No,Postgraduate and above,Working as regular salaried/ wage employee / c...,Male,BHAT,AHMAD,SHABIR,...,NPHH- Non Priority Household,RPUL070050001,HUNIPORA,1.0.0,,,,7.006709e+09,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
133120,644856,63Y 9M,Above 2- 4 lakh,No,Secondary,"Rentiers, pensioners, remittance recipients, etc.",Male,KATOCH,DASS,HARI,...,NPHH- Non Priority Household,RRAM090040081,SILLY,1.0.0,,,,8.492036e+09,,
133121,644891,40Y 0M,Below 2 lakh,No,Illiterate,Self-employed in any unregistered activity,Male,GANIE,AHMAD,ISHFAQ,...,PHH- Priority House hold,RANA160030034,FURRAH,1.0.0,,,,9.906788e+09,,
133122,644899,75Y 0M,No Annual Income,No,Illiterate,Attending domestic duties/or and also engaged ...,Male,RAJ,,DES,...,PHH- Priority House hold,RUDH170470068,NARORE,1.0.1,,,,8.493956e+09,,
133123,644871,56Y 8M,No Annual Income,No,Illiterate,Working as helper in Household enterprise (unp...,Female,BEGUM,,ZAREEFA,...,PHH- Priority House hold,RKUP080040053,BUNGAM,1.0.0,,,,9.541665e+09,,


In [68]:
# Display the member table joined to its household rows, as built by get_data().
data_corpus['individual_member_result']

Unnamed: 0,uniqueid_x,age,annualincome,changepresentwork,educationlevel,employmentstatus,gender,lastname,middlename,name,...,typeofrationcardholder,uniqueidofhousehold,villageufsblock,appversion,surveystatusverifiedby,surveystatusverifieddate,updateddate,headofhouseholdmobilenumber,remarks,recorduploadedmode
0,349185,45Y 0M,Above 2- 4 lakh,No,Secondary,Working as regular salaried/ wage employee / c...,Female,MALA,,ROOP,...,NPHH- Non Priority Household,RKIS090080007,MASSU WARD C,1.0.0,,,,8.082105e+09,,
1,349186,19Y 0M,No Annual Income,No,Higher secondary,Attending educational institution,Female,PARIHAR,,SURBHI,...,NPHH- Non Priority Household,RKIS090080007,MASSU WARD C,1.0.0,,,,8.082105e+09,,
2,349187,16Y 9M,No Annual Income,No,Secondary,Attending educational institution,Male,PARIHAR,,ASHISH,...,NPHH- Non Priority Household,RKIS090080007,MASSU WARD C,1.0.0,,,,8.082105e+09,,
3,349188,15Y 9M,No Annual Income,No,Middle,Attending educational institution,Male,PARIHAR,,AYUSH,...,NPHH- Non Priority Household,RKIS090080007,MASSU WARD C,1.0.0,,,,8.082105e+09,,
4,15063,45Y 0M,No Annual Income,No,Illiterate,Working as helper in Household enterprise (unp...,Female,BEGIUM,,SHAMEEMA,...,PHH- Priority House hold,RBAR050080020,PINJOORA,1.0.0,,,,8.493073e+09,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
643374,644880,25Y 0M,Below 2 lakh,Yes,Middle,Working as casual wage labour,Male,SINGH,,SWARN,...,PHH- Priority House hold,RJAM070030008,CHHURTA,1.0.0,,,,9.541854e+09,,
643375,644881,23Y 0M,No Annual Income,No,Graduate,Attending educational institution,Female,DEVI,,SUNEHA,...,PHH- Priority House hold,RJAM070030008,CHHURTA,1.0.0,,,,9.541854e+09,,
643376,644882,20Y 0M,No Annual Income,No,Secondary,Attending educational institution,Female,DEVI,,SAPNA,...,PHH- Priority House hold,RJAM070030008,CHHURTA,1.0.0,,,,9.541854e+09,,
643377,644883,7Y 0M,,,Below primary,,Male,SINGH,,KARTIK,...,PHH- Priority House hold,RJAM070030008,CHHURTA,1.0.0,,,,9.541854e+09,,
