In [6]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")

## Read Data

In [11]:
ht = pd.read_csv("Data D4H19 - AHT Sept 2018.csv")
ht = ht.drop(ht.columns[0], axis=1)
ht.head(5)

Unnamed: 0,yearOfRegistration,Datasource,gender,ageBroad,majorityStatus,majorityStatusAtExploit,majorityEntry,citizenship,meansOfControlDebtBondage,meansOfControlTakesEarnings,...,typeOfSexPrivateSexualServices,typeOfSexConcatenated,isAbduction,RecruiterRelationship,CountryOfExploitation,recruiterRelationIntimatePartner,recruiterRelationFriend,recruiterRelationFamily,recruiterRelationOther,recruiterRelationUnknown
0,2002,Case Management,-99,-99,-99,-99,-99,-99,-99,-99,...,-99,-99,-99,Unknown,-99,0,0,0,0,1
1,2002,Case Management,-99,-99,-99,-99,-99,-99,-99,-99,...,-99,-99,-99,Unknown,-99,0,0,0,0,1
2,2002,Case Management,-99,-99,-99,-99,-99,-99,-99,-99,...,-99,-99,-99,Unknown,-99,0,0,0,0,1
3,2002,Case Management,-99,-99,-99,-99,-99,-99,-99,-99,...,-99,-99,-99,Unknown,-99,0,0,0,0,1
4,2002,Case Management,-99,-99,-99,-99,-99,-99,-99,-99,...,-99,-99,-99,Unknown,-99,0,0,0,0,1


Some categorical features only have 1 value other than NaN (causing errors). We hold these features for later analysis

In [9]:
ht.describe().iloc[1].index[0]

'yearOfRegistration'

In [12]:
uncertain_features = []

for idx,column in enumerate(ht.describe().columns):
    if ht.describe().iloc[1,idx] == 1:
        uncertain_features.append(column)
        
print("List of features completed! {} features added".format(len(uncertain_features)))

List of features completed! 0 features added


## Data Quality Report - Categorical Features

In [14]:
ht = ht.replace('-99', np.NaN)
ht = ht.replace(-99, np.NaN)
ht = ht.astype('object')

ht_temp = ht[[col for col in ht.columns if col not in uncertain_features]]
ht_cat = ht_temp.select_dtypes('object')

In [15]:
def Qual_Stats(df):
    columns = df.columns   
    report = []
    
    for column in columns:     
        name = column
        count = df.shape[0]
        missing_percent = (df[column].isnull().values.sum())/count
        cardinality = df[column].nunique()
        mode = df[column].value_counts().index[0]
        mode_freq = df[column].value_counts().values[0]
        mode_percent = mode_freq/count
        mode_2 = df[column].value_counts().index[1]
        mode_2_freq = df[column].value_counts().values[1]
        mode_2_percent = mode_2_freq/count
        
        
        row = {
                'Feature': name,
                'Count': count,
                'Missing %': missing_percent, 
                'Card.': cardinality, 
                'Mode': mode,
                'Mode Freq.': mode_freq,
                'Mode %': mode_percent,
                '2nd Mode': mode_2,
                '2nd Mode Freq.': mode_2_freq,
                '2nd Mode %': mode_2_percent
                }
        
        report.append(row)
    
    return pd.DataFrame(report, columns = row.keys()).sort_values(by=['Missing %'], axis=0, ascending=False).reset_index(drop = True)

Qual_Stats(ht_cat)

IndexError: index 1 is out of bounds for axis 0 with size 1

## Features with High Uncertainty

The nature of these features, with respect to the project, is as follows:

* When a victim (or a proxy for a victim) makes a report, they will either mention or not mention these features. The agent recieving this information does not probe further for information using standardized questionnairs. This results in either a '1' (applicable to victim) or '-99' (uncertain); rarely a '0' (does not apply to victim)


* Given the above, these features will only have one (1) mode


* Missing percentages signify the percentage of cases that had uncertainty with regard to the feature.

In [150]:
uncertain_features

['meansOfControlRestrictsFinancialAccess',
 'meansOfControlUsesChildren',
 'isForcedMilitary',
 'isOrganRemoval',
 'typeOfLabourIllicitActivities',
 'typeOfLabourMiningOrDrilling',
 'typeOfLabourTransportation',
 'typeOfSexRemoteInteractiveServices']

In [146]:
def Qual_Stats(df):
    columns = unkown_features  
    report = []
    
    for column in columns:     
        name = column
        count = df.shape[0]
        missing_percent = (df[column].isnull().values.sum())/count
        cardinality = df[column].nunique()
        mode = df[column].value_counts().index[0]
        mode_freq = df[column].value_counts().values[0]
        mode_percent = mode_freq/count       
        
        row = {
                'Feature': name,
                'Count': count,
                'Missing %': missing_percent, 
                'Card.': cardinality, 
                'Mode': mode,
                'Mode Freq.': mode_freq,
                'Mode %': mode_percent,
                }
        
        report.append(row)
    
    return pd.DataFrame(report, columns = row.keys()).sort_values(by=['Missing %'], axis=0, ascending=False).reset_index(drop = True)

Qual_Stats(ht)

Unnamed: 0,Feature,Count,Missing %,Card.,Mode,Mode Freq.,Mode %
0,meansOfControlUsesChildren,55434,0.997005,1,1.0,166,0.002995
1,meansOfControlRestrictsFinancialAccess,55434,0.996807,1,1.0,177,0.003193
2,typeOfSexRemoteInteractiveServices,55434,0.83263,1,0.0,9278,0.16737
3,isForcedMilitary,55434,0.783184,1,0.0,12019,0.216816
4,isOrganRemoval,55434,0.783184,1,0.0,12019,0.216816
5,typeOfLabourIllicitActivities,55434,0.550384,1,0.0,24924,0.449616
6,typeOfLabourMiningOrDrilling,55434,0.548977,1,0.0,25002,0.451023
7,typeOfLabourTransportation,55434,0.548977,1,0.0,25002,0.451023
