In [1]:
!pip install xgboost pandas scikit-learn matplotlib seaborn

# Step 2: Import Required Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder, MinMaxScaler
from sklearn.metrics import mean_absolute_error, accuracy_score, confusion_matrix
import xgboost as xgb

# Step 3: Load the Dataset
from google.colab import files
uploaded = files.upload()

# Get the filename dynamically
filename = list(uploaded.keys())[0]
df = pd.read_csv(filename)



Saving NIJ_s_Recidivism_Challenge_Full_Dataset_20250222.csv to NIJ_s_Recidivism_Challenge_Full_Dataset_20250222 (2).csv


In [2]:
# prompt: write me the column names

df.columns


Index(['ID', 'Gender', 'Race', 'Age_at_Release', 'Residence_PUMA',
       'Gang_Affiliated', 'Supervision_Risk_Score_First',
       'Supervision_Level_First', 'Education_Level', 'Dependents',
       'Prison_Offense', 'Prison_Years', 'Prior_Arrest_Episodes_Felony',
       'Prior_Arrest_Episodes_Misd', 'Prior_Arrest_Episodes_Violent',
       'Prior_Arrest_Episodes_Property', 'Prior_Arrest_Episodes_Drug',
       'Prior_Arrest_Episodes_PPViolationCharges',
       'Prior_Arrest_Episodes_DVCharges', 'Prior_Arrest_Episodes_GunCharges',
       'Prior_Conviction_Episodes_Felony', 'Prior_Conviction_Episodes_Misd',
       'Prior_Conviction_Episodes_Viol', 'Prior_Conviction_Episodes_Prop',
       'Prior_Conviction_Episodes_Drug',
       'Prior_Conviction_Episodes_PPViolationCharges',
       'Prior_Conviction_Episodes_DomesticViolenceCharges',
       'Prior_Conviction_Episodes_GunCharges', 'Prior_Revocations_Parole',
       'Prior_Revocations_Probation', 'Condition_MH_SA', 'Condition_Cog_Ed',
     

In [3]:
# prompt: subset me a dataset with only these columns: "Age_at_Release", 'Supervision_Risk_Score_First', "Education_Level", 'Prison_Offense', Prior_Arrest_Episodes_Felony',
#        'Prior_Arrest_Episodes_Misd', 'Prior_Arrest_Episodes_Violent',
#        'Prior_Arrest_Episodes_Property', 'Prior_Arrest_Episodes_Drug',
#        'Prior_Arrest_Episodes_PPViolationCharges',
#        'Prior_Arrest_Episodes_DVCharges', 'Prior_Arrest_Episodes_GunCharges', 'Percent_Days_Employed', 'Recidivism_Arrest_Year1', 'Recidivism_Arrest_Year2',
#        'Recidivism_Arrest_Year3'

selected_columns = ["Age_at_Release", 'Supervision_Risk_Score_First', "Education_Level", 'Prison_Offense', 'Prior_Arrest_Episodes_Felony',
       'Prior_Arrest_Episodes_Misd', 'Prior_Arrest_Episodes_Violent',
       'Prior_Arrest_Episodes_Property', 'Prior_Arrest_Episodes_Drug',
       'Prior_Arrest_Episodes_PPViolationCharges',
       'Prior_Arrest_Episodes_DVCharges', 'Prior_Arrest_Episodes_GunCharges', 'Percent_Days_Employed', 'Recidivism_Arrest_Year1', 'Recidivism_Arrest_Year2',
       'Recidivism_Arrest_Year3']

subset_df = df[selected_columns]
print(subset_df.head())


  Age_at_Release  Supervision_Risk_Score_First        Education_Level  \
0          43-47                           3.0  At least some college   
1          33-37                           6.0   Less than HS diploma   
2    48 or older                           7.0  At least some college   
3          38-42                           7.0   Less than HS diploma   
4          33-37                           4.0   Less than HS diploma   

    Prison_Offense Prior_Arrest_Episodes_Felony Prior_Arrest_Episodes_Misd  \
0             Drug                            6                  6 or more   
1  Violent/Non-Sex                            7                  6 or more   
2             Drug                            6                  6 or more   
3         Property                            8                  6 or more   
4  Violent/Non-Sex                            4                          4   

  Prior_Arrest_Episodes_Violent Prior_Arrest_Episodes_Property  \
0                         

In [4]:
# prompt: can you show me the unique values in each of these columns in subset df

for col in subset_df.columns:
    print(f"Unique values in column '{col}': {subset_df[col].unique()}")


Unique values in column 'Age_at_Release': ['43-47' '33-37' '48 or older' '38-42' '18-22' '23-27' '28-32']
Unique values in column 'Supervision_Risk_Score_First': [ 3.  6.  7.  4.  5.  2.  1. 10.  9.  8. nan]
Unique values in column 'Education_Level': ['At least some college' 'Less than HS diploma' 'High School Diploma']
Unique values in column 'Prison_Offense': ['Drug' 'Violent/Non-Sex' 'Property' nan 'Other' 'Violent/Sex']
Unique values in column 'Prior_Arrest_Episodes_Felony': ['6' '7' '8' '4' '10 or more' '3' '9' '2' '5' '1' '0']
Unique values in column 'Prior_Arrest_Episodes_Misd': ['6 or more' '4' '0' '1' '3' '5' '2']
Unique values in column 'Prior_Arrest_Episodes_Violent': ['1' '3 or more' '0' '2']
Unique values in column 'Prior_Arrest_Episodes_Property': ['3' '0' '2' '5 or more' '1' '4']
Unique values in column 'Prior_Arrest_Episodes_Drug': ['3' '2' '1' '0' '5 or more' '4']
Unique values in column 'Prior_Arrest_Episodes_PPViolationCharges': ['4' '5 or more' '3' '0' '1' '2']
Uniq

So, until now we have our required columns, just need to clean it to not have any missing values.

In [5]:
# prompt: drop any rows with 'nan', and then again print the unique columns

subset_df = subset_df.dropna()

for col in subset_df.columns:
    print(f"Unique values in column '{col}': {subset_df[col].unique()}")


Unique values in column 'Age_at_Release': ['43-47' '33-37' '48 or older' '38-42' '18-22' '28-32' '23-27']
Unique values in column 'Supervision_Risk_Score_First': [ 3.  6.  7.  4.  5.  1.  2. 10.  9.  8.]
Unique values in column 'Education_Level': ['At least some college' 'Less than HS diploma' 'High School Diploma']
Unique values in column 'Prison_Offense': ['Drug' 'Violent/Non-Sex' 'Property' 'Other' 'Violent/Sex']
Unique values in column 'Prior_Arrest_Episodes_Felony': ['6' '7' '8' '4' '10 or more' '3' '9' '2' '5' '1' '0']
Unique values in column 'Prior_Arrest_Episodes_Misd': ['6 or more' '4' '0' '1' '3' '5' '2']
Unique values in column 'Prior_Arrest_Episodes_Violent': ['1' '3 or more' '0' '2']
Unique values in column 'Prior_Arrest_Episodes_Property': ['3' '0' '2' '1' '5 or more' '4']
Unique values in column 'Prior_Arrest_Episodes_Drug': ['3' '2' '1' '0' '5 or more' '4']
Unique values in column 'Prior_Arrest_Episodes_PPViolationCharges': ['4' '5 or more' '3' '0' '1' '2']
Unique value

Age_at_Release: Encode as ordinal values (e.g., 18-22 → 1, 23-27 → 2, etc.).
Education_Level: Encode as ordinal values (e.g., Less than HS → 1, HS Diploma → 2, College → 3).
Prison_Offense: One-hot encode offense type (Drug, Violent, Property, etc.).

In [6]:
# prompt: Age_at_Release: Encode as ordinal values (e.g., 18-22 → 1, 23-27 → 2, etc.)

# Create a mapping for Age_at_Release
age_mapping = {
    '18-22': 1,
    '23-27': 2,
    '28-32': 3,
    '33-37': 4,
    '38-42': 5,
    '43-47': 6,
    '48 or older': 7
}

# Apply the mapping to the 'Age_at_Release' column
subset_df['Age_at_Release'] = subset_df['Age_at_Release'].map(age_mapping)


In [7]:
# prompt: Education_Level: Encode as ordinal values (e.g., Less than HS → 1, HS Diploma → 2, College → 3)
# Unique values in column 'Education_Level': ['At least some college' 'Less than HS diploma' 'High School Diploma']

# Create a mapping for Education_Level
education_mapping = {
    'Less than HS diploma': 1,
    'High School Diploma': 2,
    'At least some college': 3
}

# Apply the mapping to the 'Education_Level' column
subset_df['Education_Level'] = subset_df['Education_Level'].map(education_mapping)


In [8]:
# prompt: Prison_Offense: One-hot encode offense type (Drug, Violent, Property, etc.)
# Unique values in column 'Prison_Offense': ['Drug' 'Violent/Non-Sex' 'Property' 'Other' 'Violent/Sex']

# Perform one-hot encoding on 'Prison_Offense'
prison_offense_encoded = pd.get_dummies(subset_df['Prison_Offense'], prefix='Prison_Offense')

# Concatenate the one-hot encoded columns with the original DataFrame
subset_df = pd.concat([subset_df, prison_offense_encoded], axis=1)

# Drop the original 'Prison_Offense' column
subset_df = subset_df.drop('Prison_Offense', axis=1)


In [9]:
# prompt: now show the unique values in each columns

for col in subset_df.columns:
    print(f"Unique values in column '{col}': {subset_df[col].unique()}")


Unique values in column 'Age_at_Release': [6 4 7 5 1 3 2]
Unique values in column 'Supervision_Risk_Score_First': [ 3.  6.  7.  4.  5.  1.  2. 10.  9.  8.]
Unique values in column 'Education_Level': [3 1 2]
Unique values in column 'Prior_Arrest_Episodes_Felony': ['6' '7' '8' '4' '10 or more' '3' '9' '2' '5' '1' '0']
Unique values in column 'Prior_Arrest_Episodes_Misd': ['6 or more' '4' '0' '1' '3' '5' '2']
Unique values in column 'Prior_Arrest_Episodes_Violent': ['1' '3 or more' '0' '2']
Unique values in column 'Prior_Arrest_Episodes_Property': ['3' '0' '2' '1' '5 or more' '4']
Unique values in column 'Prior_Arrest_Episodes_Drug': ['3' '2' '1' '0' '5 or more' '4']
Unique values in column 'Prior_Arrest_Episodes_PPViolationCharges': ['4' '5 or more' '3' '0' '1' '2']
Unique values in column 'Prior_Arrest_Episodes_DVCharges': [False  True]
Unique values in column 'Prior_Arrest_Episodes_GunCharges': [False  True]
Unique values in column 'Percent_Days_Employed': [0.48856209 0.42523364 0.    

now we have to handle or more, and delete those, and after that merge all columns with Prior_Arrest into one column by adding those values together

In [10]:
# prompt: Unique values in column 'Prior_Arrest_Episodes_Felony': ['6' '7' '8' '4' '10 or more' '3' '9' '2' '5' '1' '0']
# Unique values in column 'Prior_Arrest_Episodes_Misd': ['6 or more' '4' '0' '1' '3' '5' '2']
# Unique values in column 'Prior_Arrest_Episodes_Violent': ['1' '3 or more' '0' '2']
# Unique values in column 'Prior_Arrest_Episodes_Property': ['3' '0' '2' '1' '5 or more' '4']
# Unique values in column 'Prior_Arrest_Episodes_Drug': ['3' '2' '1' '0' '5 or more' '4']
# Unique values in column 'Prior_Arrest_Episodes_PPViolationCharges': ['4' '5 or more' '3' '0' '1' '2']
# FOr these columns, there are '__ or more'. How can we remove that ' or more' part and only leave with numbers.

# Define a function to clean the 'or more' values
def clean_or_more(value):
    if isinstance(value, str) and ' or more' in value:
        return int(value.split()[0])
    return value

# Apply the cleaning function to the relevant columns
columns_to_clean = ['Prior_Arrest_Episodes_Felony', 'Prior_Arrest_Episodes_Misd',
                    'Prior_Arrest_Episodes_Violent', 'Prior_Arrest_Episodes_Property',
                    'Prior_Arrest_Episodes_Drug', 'Prior_Arrest_Episodes_PPViolationCharges']

for col in columns_to_clean:
    subset_df[col] = subset_df[col].apply(clean_or_more).astype(int)

for col in subset_df.columns:
    print(f"Unique values in column '{col}': {subset_df[col].unique()}")


Unique values in column 'Age_at_Release': [6 4 7 5 1 3 2]
Unique values in column 'Supervision_Risk_Score_First': [ 3.  6.  7.  4.  5.  1.  2. 10.  9.  8.]
Unique values in column 'Education_Level': [3 1 2]
Unique values in column 'Prior_Arrest_Episodes_Felony': [ 6  7  8  4 10  3  9  2  5  1  0]
Unique values in column 'Prior_Arrest_Episodes_Misd': [6 4 0 1 3 5 2]
Unique values in column 'Prior_Arrest_Episodes_Violent': [1 3 0 2]
Unique values in column 'Prior_Arrest_Episodes_Property': [3 0 2 1 5 4]
Unique values in column 'Prior_Arrest_Episodes_Drug': [3 2 1 0 5 4]
Unique values in column 'Prior_Arrest_Episodes_PPViolationCharges': [4 5 3 0 1 2]
Unique values in column 'Prior_Arrest_Episodes_DVCharges': [False  True]
Unique values in column 'Prior_Arrest_Episodes_GunCharges': [False  True]
Unique values in column 'Percent_Days_Employed': [0.48856209 0.42523364 0.         ... 0.43492587 0.57474227 0.18151815]
Unique values in column 'Recidivism_Arrest_Year1': [False  True]
Unique val

In [11]:
# prompt: Can we just add
# Unique values in column 'Prior_Arrest_Episodes_Felony': [ 6  7  8  4 10  3  9  2  5  1  0]
# Unique values in column 'Prior_Arrest_Episodes_Misd': [6 4 0 1 3 5 2]
# Unique values in column 'Prior_Arrest_Episodes_Violent': [1 3 0 2]
# Unique values in column 'Prior_Arrest_Episodes_Property': [3 0 2 1 5 4]
# Unique values in column 'Prior_Arrest_Episodes_Drug': [3 2 1 0 5 4]
# Unique values in column 'Prior_Arrest_Episodes_PPViolationCharges': [4 5 3 0 1 2]
# into one column called Prior Arrest Episodes with value equal to the sum of the corresponding values from all these columns

# Calculate the sum of prior arrest episodes
subset_df['Prior_Arrest_Episodes'] = (subset_df['Prior_Arrest_Episodes_Felony'] +
                                      subset_df['Prior_Arrest_Episodes_Misd'] +
                                      subset_df['Prior_Arrest_Episodes_Violent'] +
                                      subset_df['Prior_Arrest_Episodes_Property'] +
                                      subset_df['Prior_Arrest_Episodes_Drug'] +
                                      subset_df['Prior_Arrest_Episodes_PPViolationCharges'])

# Drop the individual prior arrest episode columns
subset_df = subset_df.drop(columns=['Prior_Arrest_Episodes_Felony',
                                    'Prior_Arrest_Episodes_Misd',
                                    'Prior_Arrest_Episodes_Violent',
                                    'Prior_Arrest_Episodes_Property',
                                    'Prior_Arrest_Episodes_Drug',
                                    'Prior_Arrest_Episodes_PPViolationCharges'])

for col in subset_df.columns:
    print(f"Unique values in column '{col}': {subset_df[col].unique()}")


Unique values in column 'Age_at_Release': [6 4 7 5 1 3 2]
Unique values in column 'Supervision_Risk_Score_First': [ 3.  6.  7.  4.  5.  1.  2. 10.  9.  8.]
Unique values in column 'Education_Level': [3 1 2]
Unique values in column 'Prior_Arrest_Episodes_DVCharges': [False  True]
Unique values in column 'Prior_Arrest_Episodes_GunCharges': [False  True]
Unique values in column 'Percent_Days_Employed': [0.48856209 0.42523364 0.         ... 0.43492587 0.57474227 0.18151815]
Unique values in column 'Recidivism_Arrest_Year1': [False  True]
Unique values in column 'Recidivism_Arrest_Year2': [False  True]
Unique values in column 'Recidivism_Arrest_Year3': [False  True]
Unique values in column 'Prison_Offense_Drug': [ True False]
Unique values in column 'Prison_Offense_Other': [False  True]
Unique values in column 'Prison_Offense_Property': [False  True]
Unique values in column 'Prison_Offense_Violent/Non-Sex': [False  True]
Unique values in column 'Prison_Offense_Violent/Sex': [False  True]
Un

In [12]:
# prompt: Unique values in column 'Prior_Arrest_Episodes_DVCharges': [False  True]
# Unique values in column 'Prior_Arrest_Episodes_GunCharges': [False  True]
# Merge this also into this: 'Prior_Arrest_Episodes_Drug_or_Gun_Charges' and have value of 0 if both False, or 1 if any of them true, and drop those two columns

# Create 'Prior_Arrest_Episodes_Drug_or_Gun_Charges' column
subset_df['Prior_Arrest_Episodes_Drug_or_Gun_Charges'] = ((subset_df['Prior_Arrest_Episodes_DVCharges'] == True) |
                                                        (subset_df['Prior_Arrest_Episodes_GunCharges'] == True)).astype(int)

# Drop the original columns
subset_df = subset_df.drop(columns=['Prior_Arrest_Episodes_DVCharges', 'Prior_Arrest_Episodes_GunCharges'])

for col in subset_df.columns:
    print(f"Unique values in column '{col}': {subset_df[col].unique()}")


Unique values in column 'Age_at_Release': [6 4 7 5 1 3 2]
Unique values in column 'Supervision_Risk_Score_First': [ 3.  6.  7.  4.  5.  1.  2. 10.  9.  8.]
Unique values in column 'Education_Level': [3 1 2]
Unique values in column 'Percent_Days_Employed': [0.48856209 0.42523364 0.         ... 0.43492587 0.57474227 0.18151815]
Unique values in column 'Recidivism_Arrest_Year1': [False  True]
Unique values in column 'Recidivism_Arrest_Year2': [False  True]
Unique values in column 'Recidivism_Arrest_Year3': [False  True]
Unique values in column 'Prison_Offense_Drug': [ True False]
Unique values in column 'Prison_Offense_Other': [False  True]
Unique values in column 'Prison_Offense_Property': [False  True]
Unique values in column 'Prison_Offense_Violent/Non-Sex': [False  True]
Unique values in column 'Prison_Offense_Violent/Sex': [False  True]
Unique values in column 'Prior_Arrest_Episodes': [23 24 17  8 27 31 16 18 20 25 26 19 30 22 14 29 21  3 28  4 13 33 15 12
 32  6 34 11  5 10  2  9  7

In [13]:
# prompt: Similarly,
# Unique values in column 'Recidivism_Arrest_Year1': [False  True]
# Unique values in column 'Recidivism_Arrest_Year2': [False  True]
# Unique values in column 'Recidivism_Arrest_Year3': [False  True]
# Can we also add "Recidivsim_Arrest_Last_3_Years" where if all three False, we can put 0 else 1, and then drop rest of the columns

# Create 'Recidivism_Arrest_Last_3_Years' column
subset_df['Recidivism_Arrest_Last_3_Years'] = ((subset_df['Recidivism_Arrest_Year1'] == True) |
                                             (subset_df['Recidivism_Arrest_Year2'] == True) |
                                             (subset_df['Recidivism_Arrest_Year3'] == True)).astype(int)

# Drop the original recidivism columns
subset_df = subset_df.drop(columns=['Recidivism_Arrest_Year1', 'Recidivism_Arrest_Year2', 'Recidivism_Arrest_Year3'])

for col in subset_df.columns:
    print(f"Unique values in column '{col}': {subset_df[col].unique()}")


Unique values in column 'Age_at_Release': [6 4 7 5 1 3 2]
Unique values in column 'Supervision_Risk_Score_First': [ 3.  6.  7.  4.  5.  1.  2. 10.  9.  8.]
Unique values in column 'Education_Level': [3 1 2]
Unique values in column 'Percent_Days_Employed': [0.48856209 0.42523364 0.         ... 0.43492587 0.57474227 0.18151815]
Unique values in column 'Prison_Offense_Drug': [ True False]
Unique values in column 'Prison_Offense_Other': [False  True]
Unique values in column 'Prison_Offense_Property': [False  True]
Unique values in column 'Prison_Offense_Violent/Non-Sex': [False  True]
Unique values in column 'Prison_Offense_Violent/Sex': [False  True]
Unique values in column 'Prior_Arrest_Episodes': [23 24 17  8 27 31 16 18 20 25 26 19 30 22 14 29 21  3 28  4 13 33 15 12
 32  6 34 11  5 10  2  9  7  1  0]
Unique values in column 'Prior_Arrest_Episodes_Drug_or_Gun_Charges': [0 1]
Unique values in column 'Recidivism_Arrest_Last_3_Years': [0 1]


In [14]:
# prompt: Unique values in column 'Prison_Offense_Drug': [ True False]
# Unique values in column 'Prison_Offense_Other': [False  True]
# Unique values in column 'Prison_Offense_Property': [False  True]
# Unique values in column 'Prison_Offense_Violent/Non-Sex': [False  True]
# Unique values in column 'Prison_Offense_Violent/Sex': [False  True]
# here, check if one of them is True, is there also any other columns true out of these selected columns. Generally, I am looking for combined unique values across these columns

# Define the columns to check
offense_columns = ['Prison_Offense_Drug', 'Prison_Offense_Other', 'Prison_Offense_Property',
                   'Prison_Offense_Violent/Non-Sex', 'Prison_Offense_Violent/Sex']

# Create a new column indicating if any of the offense columns are True
subset_df['Any_Offense'] = subset_df[offense_columns].any(axis=1).astype(int)

# Create a new column to store the combined offense type
subset_df['Combined_Offense'] = ""

# Iterate through the rows of the DataFrame
for index, row in subset_df.iterrows():
    combined_offense = ""
    # Check each of the offense columns
    for col in offense_columns:
        if row[col]:
            if combined_offense:
                combined_offense += "/" + col.replace("Prison_Offense_", "")
            else:
                combined_offense = col.replace("Prison_Offense_", "")
    subset_df.loc[index, 'Combined_Offense'] = combined_offense

# Drop individual offense columns
subset_df = subset_df.drop(columns=offense_columns)


for col in subset_df.columns:
    print(f"Unique values in column '{col}': {subset_df[col].unique()}")


Unique values in column 'Age_at_Release': [6 4 7 5 1 3 2]
Unique values in column 'Supervision_Risk_Score_First': [ 3.  6.  7.  4.  5.  1.  2. 10.  9.  8.]
Unique values in column 'Education_Level': [3 1 2]
Unique values in column 'Percent_Days_Employed': [0.48856209 0.42523364 0.         ... 0.43492587 0.57474227 0.18151815]
Unique values in column 'Prior_Arrest_Episodes': [23 24 17  8 27 31 16 18 20 25 26 19 30 22 14 29 21  3 28  4 13 33 15 12
 32  6 34 11  5 10  2  9  7  1  0]
Unique values in column 'Prior_Arrest_Episodes_Drug_or_Gun_Charges': [0 1]
Unique values in column 'Recidivism_Arrest_Last_3_Years': [0 1]
Unique values in column 'Any_Offense': [1]
Unique values in column 'Combined_Offense': ['Drug' 'Violent/Non-Sex' 'Property' 'Other' 'Violent/Sex']


In [15]:
# prompt: Binning Prior Arrest Episodes
# We will group "Prior_Arrest_Episodes" into three categories:
# 0-3 arrests → Assign 0 (Low prior offenses)
# 4-7 arrests → Assign 1 (Moderate prior offenses)
# 8+ arrests → Assign 2 (High prior offenses)

# Binning Prior_Arrest_Episodes
def bin_prior_arrests(arrests):
    if 0 <= arrests <= 3:
        return 0
    elif 4 <= arrests <= 7:
        return 1
    else:
        return 2

subset_df['Prior_Arrest_Episodes'] = subset_df['Prior_Arrest_Episodes'].apply(bin_prior_arrests)

for col in subset_df.columns:
    print(f"Unique values in column '{col}': {subset_df[col].unique()}")


Unique values in column 'Age_at_Release': [6 4 7 5 1 3 2]
Unique values in column 'Supervision_Risk_Score_First': [ 3.  6.  7.  4.  5.  1.  2. 10.  9.  8.]
Unique values in column 'Education_Level': [3 1 2]
Unique values in column 'Percent_Days_Employed': [0.48856209 0.42523364 0.         ... 0.43492587 0.57474227 0.18151815]
Unique values in column 'Prior_Arrest_Episodes': [2 0 1]
Unique values in column 'Prior_Arrest_Episodes_Drug_or_Gun_Charges': [0 1]
Unique values in column 'Recidivism_Arrest_Last_3_Years': [0 1]
Unique values in column 'Any_Offense': [1]
Unique values in column 'Combined_Offense': ['Drug' 'Violent/Non-Sex' 'Property' 'Other' 'Violent/Sex']


In [16]:
# prompt: Binning Employment Percentage
# We will group "Percent_Days_Employed" into four categories:
# 0-25% employment → Assign 0 (Very low employment)
# 25-50% employment → Assign 1 (Low employment)
# 50-75% employment → Assign 2 (Moderate employment)
# 75%+ employment → Assign 3 (High employment)

# Binning Percent_Days_Employed
def bin_employment(percentage):
    if 0 <= percentage <= 25:
        return 0
    elif 25 < percentage <= 50:
        return 1
    elif 50 < percentage <= 75:
        return 2
    else:
        return 3

subset_df['Percent_Days_Employed'] = subset_df['Percent_Days_Employed'].apply(bin_employment)

for col in subset_df.columns:
    print(f"Unique values in column '{col}': {subset_df[col].unique()}")


Unique values in column 'Age_at_Release': [6 4 7 5 1 3 2]
Unique values in column 'Supervision_Risk_Score_First': [ 3.  6.  7.  4.  5.  1.  2. 10.  9.  8.]
Unique values in column 'Education_Level': [3 1 2]
Unique values in column 'Percent_Days_Employed': [0]
Unique values in column 'Prior_Arrest_Episodes': [2 0 1]
Unique values in column 'Prior_Arrest_Episodes_Drug_or_Gun_Charges': [0 1]
Unique values in column 'Recidivism_Arrest_Last_3_Years': [0 1]
Unique values in column 'Any_Offense': [1]
Unique values in column 'Combined_Offense': ['Drug' 'Violent/Non-Sex' 'Property' 'Other' 'Violent/Sex']


In [17]:
# Define mapping for Combined_Offense
combined_offense_mapping = {
    'Drug': 0,
    'Violent/Non-Sex': 1,
    'Property': 2,
    'Other': 3,
    'Violent/Sex': 4
}

# Apply the mapping to encode 'Combined_Offense' as a single numerical column
subset_df['Combined_Offense'] = subset_df['Combined_Offense'].map(combined_offense_mapping)

# Check unique values after encoding
print(subset_df['Combined_Offense'].unique())


[0 1 2 3 4]


In [19]:
for col in subset_df.columns:
    print(f"Unique values in column '{col}': {subset_df[col].unique()}")

Unique values in column 'Age_at_Release': [6 4 7 5 1 3 2]
Unique values in column 'Supervision_Risk_Score_First': [ 3.  6.  7.  4.  5.  1.  2. 10.  9.  8.]
Unique values in column 'Education_Level': [3 1 2]
Unique values in column 'Percent_Days_Employed': [0]
Unique values in column 'Prior_Arrest_Episodes': [2 0 1]
Unique values in column 'Prior_Arrest_Episodes_Drug_or_Gun_Charges': [0 1]
Unique values in column 'Recidivism_Arrest_Last_3_Years': [0 1]
Unique values in column 'Any_Offense': [1]
Unique values in column 'Combined_Offense': [0 1 2 3 4]


In [20]:
# Now we are ready for modelling

In [21]:
# prompt: split into 80-20 and then using  'Supervision_Risk_Score_First': [ 3.  6.  7.  4.  5.  1.  2. 10.  9.  8.] column as target variable make an XGBoost model

# Split data into training and testing sets (80/20 split)
X = subset_df.drop('Supervision_Risk_Score_First', axis=1)
y = subset_df['Supervision_Risk_Score_First']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train the XGBoost model
model = xgb.XGBRegressor(objective='reg:squarederror', random_state=42)  # Use reg:squarederror for regression
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Evaluate the model (example: Mean Absolute Error)
mae = mean_absolute_error(y_test, y_pred)
print(f"Mean Absolute Error: {mae}")


Mean Absolute Error: 1.5976746336263976


In [22]:
import pickle
# Save the trained XGBoost model
with open("xgboost_model.pkl", "wb") as file:
    pickle.dump(model, file)
print("Model saved successfully as xgboost_model.pkl")


Model saved successfully as xgboost_model.pkl


In [23]:
# prompt: give me top 5 rows of subset_df

print(subset_df.head())


   Age_at_Release  Supervision_Risk_Score_First  Education_Level  \
0               6                           3.0                3   
1               4                           6.0                1   
2               7                           7.0                3   
3               5                           7.0                1   
4               4                           4.0                1   

   Percent_Days_Employed  Prior_Arrest_Episodes  \
0                      0                      2   
1                      0                      2   
2                      0                      2   
3                      0                      2   
4                      0                      2   

   Prior_Arrest_Episodes_Drug_or_Gun_Charges  Recidivism_Arrest_Last_3_Years  \
0                                          0                               0   
1                                          1                               1   
2                                          1