<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Setup-and-Overview" data-toc-modified-id="Setup-and-Overview-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Setup and Overview</a></span></li><li><span><a href="#Reformat-Data-and-Calculate-Required-Parameters" data-toc-modified-id="Reformat-Data-and-Calculate-Required-Parameters-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Reformat Data and Calculate Required Parameters</a></span><ul class="toc-item"><li><span><a href="#Reformat-Dates" data-toc-modified-id="Reformat-Dates-2.1"><span class="toc-item-num">2.1&nbsp;&nbsp;</span>Reformat Dates</a></span></li><li><span><a href="#Calculate-new-Parameters" data-toc-modified-id="Calculate-new-Parameters-2.2"><span class="toc-item-num">2.2&nbsp;&nbsp;</span>Calculate new Parameters</a></span><ul class="toc-item"><li><span><a href="#Age" data-toc-modified-id="Age-2.2.1"><span class="toc-item-num">2.2.1&nbsp;&nbsp;</span>Age</a></span></li><li><span><a href="#Convert-inches-and-lbs-to-centimeters-and-kilograms" data-toc-modified-id="Convert-inches-and-lbs-to-centimeters-and-kilograms-2.2.2"><span class="toc-item-num">2.2.2&nbsp;&nbsp;</span>Convert inches and lbs to centimeters and kilograms</a></span></li><li><span><a href="#Calculate-BMI" data-toc-modified-id="Calculate-BMI-2.2.3"><span class="toc-item-num">2.2.3&nbsp;&nbsp;</span>Calculate BMI</a></span></li><li><span><a href="#Sex" data-toc-modified-id="Sex-2.2.4"><span class="toc-item-num">2.2.4&nbsp;&nbsp;</span>Sex</a></span></li></ul></li></ul></li></ul></div>

# Setup and Overview

In [1]:
import pandas as pd
import numpy as np

In [2]:
# set base path and load attributes

base_path = "/Users/andrintschan/SWITCHdrive/MT/inmates/"

df = pd.read_csv(base_path + "person.csv")
df.head()

Unnamed: 0,id,name,Unnamed: 2,Unnamed: 3,date_of_birth,weight,hair,sex,height,race,...,last_paroled_date,projected_discharge_date,parole_date,electronic_detention_date,discharge_date,parent_institution,offender_status,location,sex_offender_registry_required,alias
0,A00147,MCCUTCHEON,,JOHN,06/14/1949,185.0,Brown,Male,67.0,White,...,,10.06.36,,,,DIXON CORRECTIONAL CENTER,IN CUSTODY,DIXON,True,
1,A00220,WALKER,,ISIAH,03/30/1957,155.0,Black,Male,73.0,Black,...,,,,,,STATEVILLE CORRECTIONAL CENTER,NON-IDOC CUSTODY,ILL/OTH STATE/FED CONCURR,,
2,A00360,BELL,,HOWARD,12/18/1946,167.0,Gray or Partially Gray,Male,69.0,White,...,,TO BE DETERMINED,10.02.17,,,PINCKNEYVILLE CORRECTIONAL CENTER,PAROLE,PAROLE DISTRICT 1,,HOWARD R BELL | DONALD BROADSTONE | RONALD B...
3,A00367,GARVIN,,RAYMOND,01.12.54,245.0,Black,Male,72.0,Black,...,,11/20/2020,,,,WESTERN ILLINOIS CORRECTIONAL CENTER,IN CUSTODY,WESTERN ILLINOIS,,
4,A01054,TIPTON,,DARNELL,03/25/1954,166.0,Salt and Pepper,Male,67.0,Black,...,,08/14/2068,,,,MENARD CORRECTIONAL CENTER,IN CUSTODY,MENARD,True,


In [3]:
# select required columns only

df = df[['id', 'date_of_birth', 'weight', 'height', 'sex', 'hair', 'race', 'admission_date']]
len(df)

61110

In [4]:
# drop all rows with NA values

df.dropna(inplace=True)
len(df)

60713

# Reformat Data and Calculate Required Parameters

## Reformat Dates

Dates need to be reformated as two different formats can be found in the dataset (mm.dd.yy and mm/dd/yyyy).

In [5]:
# reformat date of birth and admission data

# Classify date column by format type
df['format'] = 2
df.loc[df.date_of_birth.str.contains('/'), 'format'] = 1

# Convert to datetime with two different format settings
df['new_date_of_birth'] = pd.to_datetime(df['date_of_birth'])
df.loc[df.format == 1, 'new_date_of_birth'] = pd.to_datetime(df.loc[df.format == 1, 'date_of_birth'], format='%m/%d/%Y')
df.loc[df.format == 2, 'new_date_of_birth'] = pd.to_datetime(df.loc[df.format == 2, 'date_of_birth'], format='%m.%d.%y')


# Classify date column by format type
df['format'] = 2
df.loc[df.admission_date.str.contains('/'), 'format'] = 1

# Convert to datetime with two different format settings
df['new_admission_date'] = pd.to_datetime(df['admission_date'])
df.loc[df.format == 1, 'new_admission_date'] = pd.to_datetime(df.loc[df.format == 1, 'admission_date'], format='%m/%d/%Y')
df.loc[df.format == 2, 'new_admission_date'] = pd.to_datetime(df.loc[df.format == 2, 'admission_date'], format='%m.%d.%y')


# format the datetime objects as strings in the desired output format
df['new_date_of_birth'] = df['new_date_of_birth'].dt.strftime('%m.%d.%Y')
df['new_admission_date'] = df['new_admission_date'].dt.strftime('%m.%d.%Y')

# remove original date column and format column
df = df.drop(columns=['date_of_birth','admission_date', 'format'])

# display the updated dataframe
print(df.head())

       id  weight  height   sex                    hair   race  \
0  A00147   185.0    67.0  Male                   Brown  White   
1  A00220   155.0    73.0  Male                   Black  Black   
2  A00360   167.0    69.0  Male  Gray or Partially Gray  White   
3  A00367   245.0    72.0  Male                   Black  Black   
4  A01054   166.0    67.0  Male         Salt and Pepper  Black   

  new_date_of_birth new_admission_date  
0        06.14.1949         02.16.1983  
1        03.30.1957         05.19.2016  
2        12.18.1946         02.26.1988  
3        01.12.2054         11.09.2017  
4        03.25.1954         12.23.1988  


In [6]:
# subtract 100 for years larger than 2023

# convert the 'date' column to datetime format
df['new_date_of_birth'] = pd.to_datetime(df['new_date_of_birth'], format='%m.%d.%Y')

df['new_date_of_birth'] = df['new_date_of_birth'].apply(lambda x: x.replace(year=x.year-100) if x.year > 2023 else x).dt.strftime('%m.%d.%Y')


# same for admission data

df['new_admission_date'] = pd.to_datetime(df['new_admission_date'], format='%m.%d.%Y')

df['new_admission_date'] = df['new_admission_date'].apply(lambda x: x.replace(year=x.year-100) if x.year > 2023 else x).dt.strftime('%m.%d.%Y')


# display the updated dataframe
df.head()

Unnamed: 0,id,weight,height,sex,hair,race,new_date_of_birth,new_admission_date
0,A00147,185.0,67.0,Male,Brown,White,06.14.1949,02.16.1983
1,A00220,155.0,73.0,Male,Black,Black,03.30.1957,05.19.2016
2,A00360,167.0,69.0,Male,Gray or Partially Gray,White,12.18.1946,02.26.1988
3,A00367,245.0,72.0,Male,Black,Black,01.12.1954,11.09.2017
4,A01054,166.0,67.0,Male,Salt and Pepper,Black,03.25.1954,12.23.1988


## Calculate new Parameters

### Age

In [7]:
# calculate age based on difference between DOB and admission data

df["age"] = df["new_admission_date"].str[-4:].astype(int) - df["new_date_of_birth"].str[-4:].astype(int)
df.head()

Unnamed: 0,id,weight,height,sex,hair,race,new_date_of_birth,new_admission_date,age
0,A00147,185.0,67.0,Male,Brown,White,06.14.1949,02.16.1983,34
1,A00220,155.0,73.0,Male,Black,Black,03.30.1957,05.19.2016,59
2,A00360,167.0,69.0,Male,Gray or Partially Gray,White,12.18.1946,02.26.1988,42
3,A00367,245.0,72.0,Male,Black,Black,01.12.1954,11.09.2017,63
4,A01054,166.0,67.0,Male,Salt and Pepper,Black,03.25.1954,12.23.1988,34


In [8]:
# drop columns that are no longer needed
df = df.drop(["new_admission_date","new_date_of_birth"], axis = 1)
len(df)

60713

In [9]:
# remove entries with age lower than 18
df.drop(index=df[df['age'] < 18].index, inplace=True)
len(df)

60563

### Convert inches and lbs to centimeters and kilograms

In [10]:
# change heigt from inches to centimeters

df["height"] = df["height"]*2.54

# change weight from lbs to kg

df["weight"] = df["weight"]*0.453592

df.head()

Unnamed: 0,id,weight,height,sex,hair,race,age
0,A00147,83.91452,170.18,Male,Brown,White,34
1,A00220,70.30676,185.42,Male,Black,Black,59
2,A00360,75.749864,175.26,Male,Gray or Partially Gray,White,42
3,A00367,111.13004,182.88,Male,Black,Black,63
4,A01054,75.296272,170.18,Male,Salt and Pepper,Black,34


### Calculate BMI

In [11]:
# calculate BMI

df["BMI"] = df["weight"] / (df["height"]/100)**2

df.head()

Unnamed: 0,id,weight,height,sex,hair,race,age,BMI
0,A00147,83.91452,170.18,Male,Brown,White,34,28.974775
1,A00220,70.30676,185.42,Male,Black,Black,59,20.449558
2,A00360,75.749864,175.26,Male,Gray or Partially Gray,White,42,24.661316
3,A00367,111.13004,182.88,Male,Black,Black,63,33.227605
4,A01054,75.296272,170.18,Male,Salt and Pepper,Black,34,25.998988


In oder to be stratify the data based on BMI, the official BMI categories are added.

In [12]:
# define function for BMI classification

def add_bmi_class(df):
    conditions = [
        df['BMI'] < 18.5,
        (df['BMI'] >= 18.5) & (df['BMI'] < 25),
        (df['BMI'] >= 25) & (df['BMI'] < 30),
        (df['BMI'] >= 30) & (df['BMI'] < 35),
        (df['BMI'] >= 35) & (df['BMI'] < 40),
        df['BMI'] >= 40
    ]
    choices = [
        'Underweight',
        'Healthy weight',
        'Overweight',
        'Class 1 Obesity',
        'Class 2 Obesity',
        'Class 3 Obesity'
    ]
    df['bmi_class'] = np.select(conditions, choices)
    return df

In [13]:
# check the number of occurences for each BMI class

df = add_bmi_class(df)
counts = df['bmi_class'].value_counts()
print(counts)
df.head()

Overweight         24885
Healthy weight     18223
Class 1 Obesity    11810
Class 2 Obesity     3687
Class 3 Obesity     1624
Underweight          334
Name: bmi_class, dtype: int64


Unnamed: 0,id,weight,height,sex,hair,race,age,BMI,bmi_class
0,A00147,83.91452,170.18,Male,Brown,White,34,28.974775,Overweight
1,A00220,70.30676,185.42,Male,Black,Black,59,20.449558,Healthy weight
2,A00360,75.749864,175.26,Male,Gray or Partially Gray,White,42,24.661316,Healthy weight
3,A00367,111.13004,182.88,Male,Black,Black,63,33.227605,Class 1 Obesity
4,A01054,75.296272,170.18,Male,Salt and Pepper,Black,34,25.998988,Overweight


In [15]:
len(df)

60563

### Sex

In [16]:
# Replace female and male by 0 and 1

df["sex"] = df["sex"].replace({'Female': 0, 'Male': 1})

In [17]:
# check the distribution among males and females

df = add_bmi_class(df)
counts = df['sex'].value_counts()
print(counts)
df.head()

1    56836
0     3727
Name: sex, dtype: int64


Unnamed: 0,id,weight,height,sex,hair,race,age,BMI,bmi_class
0,A00147,83.91452,170.18,1,Brown,White,34,28.974775,Overweight
1,A00220,70.30676,185.42,1,Black,Black,59,20.449558,Healthy weight
2,A00360,75.749864,175.26,1,Gray or Partially Gray,White,42,24.661316,Healthy weight
3,A00367,111.13004,182.88,1,Black,Black,63,33.227605,Class 1 Obesity
4,A01054,75.296272,170.18,1,Salt and Pepper,Black,34,25.998988,Overweight


In [18]:
# save final df as csv
df.to_csv("labels_inmates_complete.csv", index = False)