In [44]:
import pandas as pd
import numpy as np
import re
import warnings
warnings.filterwarnings("ignore")

### Read Data

In [45]:
# Load the CSV and discard its first column.
# NOTE(review): the first column is presumably a saved row index — confirm against the file.
ht = pd.read_csv("Data D4H19 - AHT Sept 2018.csv")
ht = ht.drop(columns=[ht.columns[0]])
ht.head(n=5)

Unnamed: 0,yearOfRegistration,Datasource,gender,ageBroad,majorityStatus,majorityStatusAtExploit,majorityEntry,citizenship,meansOfControlDebtBondage,meansOfControlTakesEarnings,...,typeOfSexPrivateSexualServices,typeOfSexConcatenated,isAbduction,RecruiterRelationship,CountryOfExploitation,recruiterRelationIntimatePartner,recruiterRelationFriend,recruiterRelationFamily,recruiterRelationOther,recruiterRelationUnknown
0,2002,Case Management,-99,-99,-99,-99,-99,-99,-99,-99,...,-99,-99,-99,Unknown,-99,0,0,0,0,1
1,2002,Case Management,-99,-99,-99,-99,-99,-99,-99,-99,...,-99,-99,-99,Unknown,-99,0,0,0,0,1
2,2002,Case Management,-99,-99,-99,-99,-99,-99,-99,-99,...,-99,-99,-99,Unknown,-99,0,0,0,0,1
3,2002,Case Management,-99,-99,-99,-99,-99,-99,-99,-99,...,-99,-99,-99,Unknown,-99,0,0,0,0,1
4,2002,Case Management,-99,-99,-99,-99,-99,-99,-99,-99,...,-99,-99,-99,Unknown,-99,0,0,0,0,1


### Combine indicators into Concatenated versions

In [46]:
def extract_concat(df):
    """Drop individual indicator columns, keeping their "Concatenated" versions.

    A column is treated as an indicator when its name starts with
    ``meansOfControl``, ``recruiterRelation``, ``typeOfSex`` or
    ``typeOfLabour``, or starts with ``is`` and is not ``isAbduction``.
    Every indicator column whose name does not contain ``Concatenated``
    is removed; all other columns are left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame without the individual indicator columns.
    """
    # Prefix matching is case-sensitive and anchored at the start of the name,
    # so e.g. "RecruiterRelationship" is NOT matched by "recruiterRelation".
    indicator_prefixes = (
        "meansOfControl",
        "recruiterRelation",
        "typeOfSex",
        "typeOfLabour",
    )

    def is_indicator(col):
        # Non-string labels cannot carry one of the prefixes; never drop them.
        if not isinstance(col, str):
            return False
        if col.startswith(indicator_prefixes):
            return True
        return col.startswith("is") and col != "isAbduction"

    # Inspect the columns of the frame that was passed in. Reading them from a
    # notebook-level global instead would tie the function to kernel state and
    # break (or drop the wrong columns) for any other frame.
    diff = [
        col
        for col in df.columns
        if is_indicator(col) and "Concatenated" not in col
    ]

    return df.drop(diff, axis=1)

# Reduce the loaded frame with extract_concat, then preview the result.
concatenated_ht = ht.pipe(extract_concat)
concatenated_ht.head(n=5)

Unnamed: 0,yearOfRegistration,Datasource,gender,ageBroad,majorityStatus,majorityStatusAtExploit,majorityEntry,citizenship,meansOfControlConcatenated,typeOfExploitConcatenated,typeOfLabourConcatenated,typeOfSexConcatenated,isAbduction,RecruiterRelationship,CountryOfExploitation
0,2002,Case Management,-99,-99,-99,-99,-99,-99,-99,-99,-99,-99,-99,Unknown,-99
1,2002,Case Management,-99,-99,-99,-99,-99,-99,-99,-99,-99,-99,-99,Unknown,-99
2,2002,Case Management,-99,-99,-99,-99,-99,-99,-99,-99,-99,-99,-99,Unknown,-99
3,2002,Case Management,-99,-99,-99,-99,-99,-99,-99,-99,-99,-99,-99,Unknown,-99
4,2002,Case Management,-99,-99,-99,-99,-99,-99,-99,-99,-99,-99,-99,Unknown,-99


### Replace missing-value placeholders (-99) with NaN

In [47]:
def missing_val(df):
    """Return a copy of ``df`` with the -99 placeholder replaced by NaN.

    Both the integer ``-99`` and the string ``"-99"`` are replaced, so the
    placeholder is caught in numeric and in text columns alike.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with the placeholders replaced by ``np.nan``.
    """
    # Use the canonical lowercase ``np.nan``: the ``np.NaN`` alias was removed
    # in NumPy 2.0 and raises AttributeError there.
    new_df = df.replace([-99, "-99"], np.nan)

    return new_df

# Swap the -99 placeholders for NaN in the reduced frame.
ht_cleaned = concatenated_ht.pipe(missing_val)

### Identify presence of multiple indicators

In the concatenated columns, a single value may list several of the original indicators, separated by semicolons.

For example: Debt bondage;Physical Abuse;Psychological abuse;Sexual abuse;Threats;Withholds documents

To account for this, any value that combines more than one indicator is relabelled as "Various".

In [48]:
def combine_indicators(value):
    """Collapse a multi-indicator entry into the single label "Various".

    The value is converted to text and checked for a semicolon. If one is
    present, "Various" is returned; otherwise the value is returned unchanged
    (the original object, not its text form).
    """
    has_several = ";" in str(value)
    if not has_several:
        return value
    return "Various"

# Relabel multi-indicator entries in every column whose name contains "Concatenated".
concat = [name for name in ht_cleaned.columns if "Concatenated" in name]
for column in concat:
    relabelled = ht_cleaned[column].apply(combine_indicators)
    ht_cleaned[column] = relabelled
    

In [49]:
# Test to make sure values were replaced

#sum(ht_cleaned.meansOfControlConcatenated == "Various")

In [50]:
# Preview the first five rows of the cleaned frame.
ht_cleaned.head(5)

Unnamed: 0,yearOfRegistration,Datasource,gender,ageBroad,majorityStatus,majorityStatusAtExploit,majorityEntry,citizenship,meansOfControlConcatenated,typeOfExploitConcatenated,typeOfLabourConcatenated,typeOfSexConcatenated,isAbduction,RecruiterRelationship,CountryOfExploitation
0,2002,Case Management,,,,,,,,,,,,Unknown,
1,2002,Case Management,,,,,,,,,,,,Unknown,
2,2002,Case Management,,,,,,,,,,,,Unknown,
3,2002,Case Management,,,,,,,,,,,,Unknown,
4,2002,Case Management,,,,,,,,,,,,Unknown,


In [51]:
# Write the cleaned frame to disk without the row index.
ht_cleaned.to_csv(path_or_buf="newData.csv", index=False)