### Center for Trafficking Data Analysis Preparation
To clean the data and prepare it for analysis, I will begin by loading the data and identifying the target features I seek to predict.

In [157]:
import polars as pl
# Raw CTDC synthetic extract, read with polars' default CSV settings.
DATA_PATH = 'CTDC_global_synthetic_data_v2025.csv'
df = pl.read_csv(DATA_PATH)

# Exploitation indicator columns that the analysis seeks to predict.
y_variables = [
    'isForcedLabour',
    'isSexualExploit',
    'isOtherExploit',
]

My initial filter will restrict the data to entries where the country of exploitation is in North America (USA, Mexico, or Canada). I will also remove the 'type...' features from the dataset.
These features contain more detailed information on the types of labor and sexual exploitation experienced, and are therefore extremely confounding when used as predictors.

In [158]:
# Country-of-exploitation codes kept by the regional filter.
NORTH_AMERICA_CODES = ['USA', 'MEX', 'CAN']

df = (
    df
    .filter(pl.col('CountryOfExploitation').is_in(NORTH_AMERICA_CODES))
    # Remove every column whose name contains 'type' (the detailed
    # exploitation sub-type columns).
    .select(pl.exclude([c for c in df.columns if 'type' in c]))
    .drop('yearOfRegistration')
)
# The filter keeps USA, MEX and CAN, so the label names all three countries
# instead of claiming the count is for the USA only.
print(f"Number of rows in North America (USA, MEX, CAN) before data cleaning: {len(df)}")

Number of rows in USA before data cleaning: 117575


Next, I will remove any rows where all three exploitation indicators are missing (null), i.e. rows that carry no exploitation data at all.

In [159]:
# A row carries exploitation data when at least one target indicator is non-null.
has_exploit = pl.any_horizontal(
    [pl.col(target).is_not_null() for target in y_variables]
)

rows_with_data = df.filter(has_exploit)
num_with = rows_with_data.height
num_without = df.filter(~has_exploit).height
original_rows = num_with + num_without

print(f"Number of rows with exploitation data: {num_with}")
print(f"Number of rows without exploitation data: {num_without}")
print(f"Original number of rows: {original_rows}")

# Continue with only the rows that carry exploitation data.
df = rows_with_data

Number of rows with exploitation data: 91043
Number of rows without exploitation data: 26532
Original number of rows: 117575


Verifying targets are now binary and do not contain nulls:


In [156]:
# Check that every target column holds only 0/1 and no nulls after filtering.
for exploit in y_variables:
    # Sort so the printed values appear in a stable order; unique() alone
    # does not guarantee one.
    unique_values = df[exploit].unique().sort()
    print(f"{exploit} unique values: {unique_values.to_list()}")
    # Enforce the invariant instead of relying on a visual check of the printout.
    assert set(unique_values.to_list()) <= {0, 1}, (
        f"{exploit} contains nulls or non-binary values: {unique_values.to_list()}"
    )

isForcedLabour unique values: [0, 1]
isSexualExploit unique values: [0, 1]
isOtherExploit unique values: [0, 1]


In [138]:
# Indicator columns, grouped by the kind of information they record.
feature_groups = {
    "means_of_control": [
        "meansDebtBondageEarnings",
        "meansThreats",
        "meansAbusePsyPhySex",
        "meansFalsePromises",
        "meansDrugsAlcohol",
        "meansDenyBasicNeeds",
        "meansExcessiveWorkHours",
        "meansWithholdDocs",
    ],
    "recruiter_relation": [
        "recruiterRelationIntimatePartner",
        "recruiterRelationFriend",
        "recruiterRelationFamily",
        "recruiterRelationOther",
    ],
}

# Flatten the groups into one list, keeping group order and in-group order.
binary_cols = [
    name
    for group_members in feature_groups.values()
    for name in group_members
]

# Treat a missing indicator as 0 and store every indicator column as Int64.
df_final = df.with_columns(
    pl.col(binary_cols).fill_null(0).cast(pl.Int64)
)
df_final

gender,ageBroad,citizenship,CountryOfExploitation,traffickMonths,meansDebtBondageEarnings,meansThreats,meansAbusePsyPhySex,meansFalsePromises,meansDrugsAlcohol,meansDenyBasicNeeds,meansExcessiveWorkHours,meansWithholdDocs,isForcedLabour,isSexualExploit,isOtherExploit,recruiterRelationIntimatePartner,recruiterRelationFriend,recruiterRelationFamily,recruiterRelationOther
str,str,str,str,str,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64
"""Man""",,"""MEX""","""USA""",,1,1,1,1,0,1,1,1,1,0,0,0,0,0,1
"""Man""",,"""MEX""","""USA""",,1,1,1,1,0,1,1,1,1,0,0,0,0,0,1
"""Man""",,"""MEX""","""USA""",,1,1,1,1,0,1,1,1,1,0,0,0,0,0,1
"""Man""",,"""MEX""","""USA""",,1,1,1,1,0,1,1,1,1,0,0,0,0,0,1
"""Man""",,"""MEX""","""USA""",,1,1,1,1,0,1,1,1,1,0,0,0,0,0,1
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
,,,"""CAN""",,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
,,,"""CAN""",,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
,,,"""CAN""",,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
,,,"""CAN""",,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0


In [139]:
# Missing gender becomes its own "Unknown" level before one-hot encoding.
gender_filled = df_final.with_columns(pl.col("gender").fill_null("Unknown"))
gender_dummies = gender_filled.select("gender").to_dummies()

# Append the indicator columns, then drop the raw column and `gender_Man`.
# Note: it is the `Man` indicator that is removed here, not the `Unknown` one.
df_final = (
    pl.concat([gender_filled, gender_dummies], how="horizontal")
    .drop(["gender", "gender_Man"])
)

In [140]:
print(df_final['ageBroad'].unique().to_list())

# Age bands in ascending order; each band's code is its 1-based position.
age_mapping = {
    band: code
    for code, band in enumerate(
        [
            "0--8",
            "09--17",
            "18--20",
            "21--23",
            "24--26",
            "27--29",
            "30--38",
            "39--47",
            "48+",
        ],
        start=1,
    )
}
age_mapping["None"] = 0  # Unknown age

# Nulls are first rewritten to the "None" key so that they map to 0, then the
# replaced values are cast to Int64.
age_codes = pl.col('ageBroad').fill_null("None").replace(age_mapping).cast(pl.Int64)
df_final = df_final.with_columns(age_codes.alias('ageBroad'))

print(df_final['ageBroad'].unique().to_list())

['39--47', '48+', '24--26', None, '27--29', '09--17', '0--8', '18--20', '21--23', '30--38']
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]


In [141]:
print(df_final['traffickMonths'].unique().to_list())

# Duration bands in ascending order; each band's code is its 1-based position.
duration_mapping = {
    band: code
    for code, band in enumerate(
        [
            "0--12 (0-1 yr)",
            "13--24 (1-2 yrs)",
            "25+ (2+ yrs)",
        ],
        start=1,
    )
}
duration_mapping["None"] = 0  # Unknown duration

# Nulls are first rewritten to the "None" key so that they map to 0, then the
# replaced values are cast to Int64.
duration_codes = (
    pl.col("traffickMonths")
    .fill_null("None")
    .replace(duration_mapping)
    .cast(pl.Int64)
)
df_final = df_final.with_columns(duration_codes.alias("traffickMonths"))

print(df_final['traffickMonths'].unique().to_list())

['13--24 (1-2 yrs)', '25+ (2+ yrs)', None, '0--12 (0-1 yr)']
[0, 1, 2, 3]


In [142]:
# Compare each record's citizenship with its country of exploitation.
citizenship_in_country = df_final['citizenship'] == df_final['CountryOfExploitation']

# 1 where the two match; every row where the comparison is not true gets 0.
# The two source columns are dropped once the flag exists.
df_final = (
    df_final
    .with_columns(
        isCitizenOfCountry=pl.when(citizenship_in_country).then(1).otherwise(0)
    )
    .drop(['citizenship', 'CountryOfExploitation'])
)

In [143]:
def remove_none_cases(df):
    """Drop rows whose combo code is '000' and remove the helper code column.

    Expects `df` to have a `combo_code` column. Only `combo_code` is dropped;
    the `exploitation_types` label column, if present, stays in the result.
    """
    has_any_exploit = pl.col('combo_code') != '000'
    return df.filter(has_any_exploit).drop('combo_code')

def count_exploit_combinations(df):
    """Label each row with its exploitation-type combination and print a tally.

    Two string columns are added to `df`:
      * `combo_code` - the three target indicators concatenated in the order
        labour, sexual, other (for example '110').
      * `exploitation_types` - the readable label for that code.

    A table with one row per label (labels that never occur get a count of 0),
    sorted by count in descending order, is printed. The frame with both new
    columns is returned.
    """
    # Code -> label lookup. The insertion order also fixes the order of the
    # full label list used to build the tally below.
    code_to_label = {
        '100': 'Labour Only',
        '010': 'Sexual Only',
        '001': 'Other Only',
        '110': 'Labour + Sexual',
        '101': 'Labour + Other',
        '011': 'Sexual + Other',
        '111': 'Labour + Sexual + Other',
        '000': 'None',
    }
    all_labels = list(code_to_label.values())

    indicator_cols = ['isForcedLabour', 'isSexualExploit', 'isOtherExploit']
    combo_code = pl.concat_str(
        [pl.col(name).cast(pl.String) for name in indicator_cols]
    ).alias('combo_code')

    # Attach the code first, then translate it into its label.
    labelled = (
        df
        .with_columns(combo_code)
        .with_columns(
            pl.col('combo_code').replace(code_to_label).alias('exploitation_types')
        )
    )

    # Counts for the labels that actually occur in the data.
    observed = labelled.group_by('exploitation_types').agg(pl.len().alias('count'))

    # Left-join onto the full label list so absent combinations show up with 0.
    tally = (
        pl.DataFrame({'exploitation_types': all_labels})
        .join(observed, on='exploitation_types', how='left')
        .with_columns(pl.col('count').fill_null(0).cast(pl.UInt32))
        .sort('count', descending=True)
    )

    print(tally)
    return labelled

# Label every row with its exploitation combination (this prints the tally),
# then drop the rows whose combination code is '000'.
df_final = remove_none_cases(count_exploit_combinations(df_final))

print(df_final.describe())

shape: (8, 2)
┌─────────────────────────┬───────┐
│ exploitation_types      ┆ count │
│ ---                     ┆ ---   │
│ str                     ┆ u32   │
╞═════════════════════════╪═══════╡
│ Sexual Only             ┆ 67296 │
│ None                    ┆ 26532 │
│ Labour Only             ┆ 18484 │
│ Labour + Sexual         ┆ 5194  │
│ Other Only              ┆ 65    │
│ Labour + Other          ┆ 4     │
│ Sexual + Other          ┆ 0     │
│ Labour + Sexual + Other ┆ 0     │
└─────────────────────────┴───────┘
shape: (9, 23)
┌───────────┬──────────┬───────────┬───────────┬───┬───────────┬───────────┬───────────┬───────────┐
│ statistic ┆ ageBroad ┆ traffickM ┆ meansDebt ┆ … ┆ gender_Un ┆ gender_Wo ┆ isCitizen ┆ exploitat │
│ ---       ┆ ---      ┆ onths     ┆ BondageEa ┆   ┆ known     ┆ man       ┆ OfCountry ┆ ion_types │
│ str       ┆ f64      ┆ ---       ┆ rnings    ┆   ┆ ---       ┆ ---       ┆ ---       ┆ ---       │
│           ┆          ┆ f64       ┆ ---       ┆   ┆ f64       

In [144]:
# Write the prepared frame to final_data.csv in the current working directory.
df_final.write_csv('final_data.csv')