<a href="https://colab.research.google.com/github/patrickstr17/predicting_criminals/blob/main/Recidivism.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

##Libraries



In [None]:
# Importing models
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.ensemble import StackingClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import VotingClassifier
from xgboost import XGBClassifier
import lightgbm as lgb
import re
from lightgbm import LGBMClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

##Ingest Data

In [None]:
# Load the dataset
file_path = '/content/3-Year_Recidivism_for_Offenders_Released_from_Prison_in_Iowa_elaborated.csv'
data = pd.read_csv(file_path)


##EDA

In [None]:
# Display the first few rows and the column information
data.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26020 entries, 0 to 26019
Data columns (total 12 columns):
 #   Column                                     Non-Null Count  Dtype 
---  ------                                     --------------  ----- 
 0   Fiscal Year Released                       26020 non-null  int64 
 1   Recidivism Reporting Year                  26020 non-null  int64 
 2   Race - Ethnicity                           25990 non-null  object
 3   Age At Release                             26017 non-null  object
 4   Convicting Offense Classification          26020 non-null  object
 5   Convicting Offense Type                    26020 non-null  object
 6   Convicting Offense Subtype                 26020 non-null  object
 7   Main Supervising District                  16439 non-null  object
 8   Release Type                               24258 non-null  object
 9   Release type: Paroled to Detainder united  24258 non-null  object
 10  Part of Target Population         

(   Fiscal Year Released  Recidivism Reporting Year      Race - Ethnicity  \
 0                  2010                       2013  White - Non-Hispanic   
 1                  2010                       2013  White - Non-Hispanic   
 2                  2010                       2013  White - Non-Hispanic   
 3                  2010                       2013  White - Non-Hispanic   
 4                  2010                       2013  Black - Non-Hispanic   
 
   Age At Release  Convicting Offense Classification Convicting Offense Type  \
 0        Under 25                          D Felony                 Violent   
 1    55 and Older                          D Felony            Public Order   
 2           25-34                          D Felony                Property   
 3    55 and Older                          C Felony                    Drug   
 4           25-34                          D Felony                    Drug   
 
   Convicting Offense Subtype Main Supervising Distric

In [None]:
data.info()

In [1]:
data.describe

NameError: name 'data' is not defined

In [None]:
data.shape

In [None]:
data.columns

In [None]:
# Handling missing values
# For categorical variables, we can use the most frequent value imputation
imputer = SimpleImputer(strategy='most_frequent')
data_imputed = pd.DataFrame(imputer.fit_transform(data), columns=data.columns)

# Encoding categorical variables
label_encoders = {}
for column in data_imputed.select_dtypes(include=['object']).columns:
    le = LabelEncoder()
    data_imputed[column] = le.fit_transform(data_imputed[column])
    label_encoders[column] = le

# Display the transformed data
data_imputed.head()

Unnamed: 0,Fiscal Year Released,Recidivism Reporting Year,Race - Ethnicity,Age At Release,Convicting Offense Classification,Convicting Offense Type,Convicting Offense Subtype,Main Supervising District,Release Type,Release type: Paroled to Detainder united,Part of Target Population,Recidivism - Return to Prison numeric
0,0,0,10,4,4,4,3,3,2,1,1,1
1,0,0,10,3,4,3,10,6,2,1,1,1
2,0,0,10,0,4,2,4,4,2,1,1,1
3,0,0,10,3,3,0,23,7,2,1,1,1
4,0,0,6,0,4,0,23,2,2,1,1,1


##Data Prep

In [None]:
# Define features and target
X = data_imputed.drop('Recidivism - Return to Prison numeric', axis=1)
y = data_imputed['Recidivism - Return to Prison numeric']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)


##Modeling

### Model exploration and selection

In [None]:
# Initialize the models
models = {
    'Decision Tree': DecisionTreeClassifier(),
    'Random Forest': RandomForestClassifier(),
    'Gradient Boosting': GradientBoostingClassifier(),
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'Support Vector Machine': SVC()
}

# Train the models and evaluate them
model_scores = {}
for name, model in models.items():
    model.fit(X_train, y_train)
    model_scores[name] = model.score(X_test, y_test)



In [None]:
model_scores

{'Decision Tree': 0.6245196003074558,
 'Random Forest': 0.6297719702792723,
 'Gradient Boosting': 0.6789648987957981,
 'Logistic Regression': 0.668460158852165,
 'Support Vector Machine': 0.6693569049449142}

In [None]:
# Initialize a dictionary to hold confusion matrices for each model
confusion_matrices = {}

# Calculate confusion matrices for each model
for name, model in models.items():
    y_pred = model.predict(X_test)
    confusion_matrices[name] = confusion_matrix(y_test, y_pred)

confusion_matrices

{'Decision Tree': array([[4118, 1107],
        [1824,  757]]),
 'Random Forest': array([[4112, 1113],
        [1777,  804]]),
 'Gradient Boosting': array([[5023,  202],
        [2304,  277]]),
 'Logistic Regression': array([[5149,   76],
        [2512,   69]]),
 'Support Vector Machine': array([[5225,    0],
        [2581,    0]])}

In [None]:
# Initialize a dictionary to hold classification reports for each model
classification_reports = {}

# Calculate classification reports for each model
for name, model in models.items():
    y_pred = model.predict(X_test)
    classification_reports[name] = classification_report(y_test, y_pred, output_dict=True)

classification_reports

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


{'Decision Tree': {'0': {'precision': 0.6930326489397509,
   'recall': 0.788133971291866,
   'f1-score': 0.7375302229784186,
   'support': 5225.0},
  '1': {'precision': 0.4061158798283262,
   'recall': 0.2932971716388997,
   'f1-score': 0.340607424071991,
   'support': 2581.0},
  'accuracy': 0.6245196003074558,
  'macro avg': {'precision': 0.5495742643840386,
   'recall': 0.5407155714653828,
   'f1-score': 0.5390688235252048,
   'support': 7806.0},
  'weighted avg': {'precision': 0.5981656003775441,
   'recall': 0.6245196003074558,
   'f1-score': 0.6062904402500701,
   'support': 7806.0}},
 'Random Forest': {'0': {'precision': 0.6982509763966718,
   'recall': 0.7869856459330143,
   'f1-score': 0.7399676084218103,
   'support': 5225.0},
  '1': {'precision': 0.4194053208137715,
   'recall': 0.3115071677644324,
   'f1-score': 0.3574922187638951,
   'support': 2581.0},
  'accuracy': 0.6297719702792723,
  'macro avg': {'precision': 0.5588281486052217,
   'recall': 0.5492464068487234,
   'f1

**Decision Tree**
Precision: 69.32% (Class 0), 40.67% (Class 1)
Recall: 78.83% (Class 0), 29.37% (Class 1)
F1-score: 73.77% (Class 0), 34.11% (Class 1)
Accuracy: 62.48%
**Random Forest**
Precision: 69.65% (Class 0), 41.35% (Class 1)
Recall: 78.39% (Class 0), 30.84% (Class 1)
F1-score: 73.76% (Class 0), 35.33% (Class 1)
Accuracy: 62.67%
**Gradient Boosting**
Precision: 68.55% (Class 0), 57.83% (Class 1)
Recall: 96.13% (Class 0), 10.73% (Class 1)
F1-score: 80.04% (Class 0), 18.10% (Class 1)
Accuracy: 67.90%
**Logistic Regression**
Precision: 67.21% (Class 0), 47.59% (Class 1)
Recall: 98.55% (Class 0), 2.67% (Class 1)
F1-score: 79.92% (Class 0), 5.06% (Class 1)
Accuracy: 66.85%
**Support Vector Machine**
Precision: 66.94% (Class 0), 0.00% (Class 1)
Recall: 100.00% (Class 0), 0.00% (Class 1)
F1-score: 80.19% (Class 0), 0.00% (Class 1)
Accuracy: 66.94%

We are keeping Gradient Boosting and Random Forest

###Gradient Boosting

In [None]:
##GRADIENT BOOSTING
#Hyperparameter tuning
# Define parameter grid for Gradient Boosting
param_grid_gb = {
    'n_estimators': [50, 100, 150],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 5, 7]
}

# Initialize Gradient Boosting classifier
gb = GradientBoostingClassifier()

# Set up GridSearchCV
grid_search_gb = GridSearchCV(estimator=gb, param_grid=param_grid_gb, cv=3, scoring='accuracy', n_jobs=-1)

# Fit the model
grid_search_gb.fit(X_train, y_train)

# Best parameters and score
best_params_gb = grid_search_gb.best_params_
best_score_gb = grid_search_gb.best_score_

best_params_gb, best_score_gb

({'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150},
 0.6740416815596036)

In [None]:
# Make predictions on the test set
y_pred_gb = grid_search_gb.best_estimator_.predict(X_test)

# Calculate metrics
accuracy = accuracy_score(y_test, y_pred_gb)
precision = precision_score(y_test, y_pred_gb)
recall = recall_score(y_test, y_pred_gb)
f1 = f1_score(y_test, y_pred_gb)
conf_matrix = confusion_matrix(y_test, y_pred_gb)

# Display the results
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")
print(f"Confusion Matrix:\n{conf_matrix}")

Accuracy: 0.6796
Precision: 0.5697
Recall: 0.1267
F1 Score: 0.2073
Confusion Matrix:
[[4978  247]
 [2254  327]]


Accuracy is relatively decent at approximately 68%, but this metric alone can be misleading, especially with class imbalance.
Precision for the positive class (class 1) is low, indicating that a significant proportion of the predictions made for the positive class are incorrect.
Recall for the positive class is very low, suggesting that the model is missing a large number of actual positive cases.
The F1 Score, which is the harmonic mean of precision and recall, is also low, reflecting poor performance in predicting the positive class.

In [None]:
#Class Weight Adjustment in Gradient Boosting
# Train with class weight
gb = GradientBoostingClassifier(n_estimators=150, learning_rate=0.1, max_depth=3, random_state=42)
gb.fit(X_train, y_train, sample_weight=[2 if i == 1 else 1 for i in y_train])

# Predictions
y_pred = gb.predict(X_test)

# Evaluation
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.76      0.60      0.67      5225
           1       0.43      0.62      0.51      2581

    accuracy                           0.61      7806
   macro avg       0.60      0.61      0.59      7806
weighted avg       0.65      0.61      0.62      7806



Shows a balanced performance with slightly better recall for Class 1 (0.56) compared to the Gradient Boosting model.
Precision for Class 1 is slightly higher at 0.44, and the overall F1-score for both classes is relatively balanced.

###Random Forest

In [None]:
##RANDOM FOREST
# Random Forest parameter grid
rf_param_grid = {
    'n_estimators': [100, 200, 300],
    'max_features': ['sqrt', 'log2'],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# GridSearchCV for Random Forest
rf_grid_search = GridSearchCV(RandomForestClassifier(), rf_param_grid, cv=5, n_jobs=-1, scoring='accuracy')
rf_grid_search.fit(X_train, y_train)

# Best parameters and best score
rf_best_params = rf_grid_search.best_params_
rf_best_score = rf_grid_search.best_score_

rf_best_params, rf_best_score

({'max_depth': 10,
  'max_features': 'sqrt',
  'min_samples_leaf': 4,
  'min_samples_split': 5,
  'n_estimators': 300},
 0.6736574381627226)

In [None]:

# Initialize the Random Forest with the best parameters
best_rf = RandomForestClassifier(
    max_depth=rf_best_params['max_depth'],
    max_features=rf_best_params['max_features'],
    min_samples_leaf=rf_best_params['min_samples_leaf'],
    min_samples_split=rf_best_params['min_samples_split'],
    n_estimators=rf_best_params['n_estimators'],
    random_state=42
)

# Train the model
best_rf.fit(X_train, y_train)

# Make predictions
y_pred_rf = best_rf.predict(X_test)

# Evaluate the model
print("Classification Report:")
print(classification_report(y_test, y_pred_rf))

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_rf))

Classification Report:
              precision    recall  f1-score   support

           0       0.68      0.96      0.80      5225
           1       0.54      0.09      0.16      2581

    accuracy                           0.67      7806
   macro avg       0.61      0.53      0.48      7806
weighted avg       0.64      0.67      0.59      7806

Confusion Matrix:
[[5021  204]
 [2339  242]]


High Precision for Class 0: The model performs well in predicting non-recidivism cases (Class 0), with high precision and recall, indicating that most of the predictions for this class are correct.
Low Recall for Class 1: The model struggles with recall for the recidivism class (Class 1), meaning it fails to identify a large number of actual recidivism cases.
Class Imbalance: The low recall and F1-score for Class 1 suggest that the model might be biased towards the majority class (Class 0). This is a common issue in imbalanced datasets where the model might not learn enough about the minority class.

In [None]:
# Apply SMOTE to the training data
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)

# Initialize the Random Forest with the best parameters found earlier
best_rf = RandomForestClassifier(
    max_depth=10,
    max_features='sqrt',
    min_samples_leaf=4,
    min_samples_split=5,
    n_estimators=300,
    random_state=42
)

# Train the model on the resampled data
best_rf.fit(X_train_res, y_train_res)

# Make predictions on the test set
y_pred_rf = best_rf.predict(X_test)

# Evaluate the model
print("Classification Report:")
print(classification_report(y_test, y_pred_rf))

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_rf))

Classification Report:
              precision    recall  f1-score   support

           0       0.75      0.65      0.69      5225
           1       0.44      0.56      0.49      2581

    accuracy                           0.62      7806
   macro avg       0.59      0.60      0.59      7806
weighted avg       0.64      0.62      0.63      7806

Confusion Matrix:
[[3386 1839]
 [1148 1433]]


Improved recall for Class 1 (0.62), indicating better identification of the minority class.
However, precision for Class 1 is relatively low (0.43), which means there are still a significant number of false positives.

###Combining Methods

In [None]:
## COMBINING METHODS
# Initialize the Gradient Boosting and Random Forest models with the best parameters
gb = GradientBoostingClassifier(
    n_estimators=150, learning_rate=0.1, max_depth=3, random_state=42,
    subsample=1.0, min_samples_split=2, min_samples_leaf=1
)
rf = RandomForestClassifier(
    max_depth=10, max_features='sqrt', min_samples_leaf=4,
    min_samples_split=5, n_estimators=300, random_state=42
)

# Create a Voting Classifier with the Gradient Boosting and Random Forest models
voting_clf = VotingClassifier(
    estimators=[('gb', gb), ('rf', rf)],
    voting='soft'  # 'soft' uses predicted probabilities, 'hard' uses predicted class labels
)

# Fit the Voting Classifier on the training data
voting_clf.fit(X_train, y_train)

# Predict on the test data
y_pred_voting = voting_clf.predict(X_test)

# Evaluate the model
print("Classification Report:")
print(classification_report(y_test, y_pred_voting))

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_voting))

Classification Report:
              precision    recall  f1-score   support

           0       0.69      0.96      0.80      5225
           1       0.57      0.11      0.18      2581

    accuracy                           0.68      7806
   macro avg       0.63      0.53      0.49      7806
weighted avg       0.65      0.68      0.59      7806

Confusion Matrix:
[[5016  209]
 [2305  276]]


In [None]:
# Predict probabilities instead of labels
y_proba = voting_clf.predict_proba(X_test)

# Set a lower threshold for predicting class 1 (recidivism)
threshold = 0.3  # Adjust this value as needed
y_pred_threshold = (y_proba[:, 1] >= threshold).astype(int)

# Evaluate the new predictions
print("Classification Report with Threshold Adjustment:")
print(classification_report(y_test, y_pred_threshold))

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_threshold))

Classification Report with Threshold Adjustment:
              precision    recall  f1-score   support

           0       0.78      0.47      0.59      5225
           1       0.41      0.73      0.52      2581

    accuracy                           0.56      7806
   macro avg       0.60      0.60      0.56      7806
weighted avg       0.66      0.56      0.57      7806

Confusion Matrix:
[[2465 2760]
 [ 684 1897]]


Improved Recall for Class 1: The recall for the recidivism class (Class 1) has improved significantly to 0.73, meaning the model is now identifying a larger proportion of true positive cases.
Decrease in Precision for Class 1: The precision for Class 1 has dropped to 0.41, indicating more false positives. This trade-off is common when aiming to increase recall, especially in imbalanced datasets.
Impact on Class 0: The precision for Class 0 remains high at 0.78, but the recall has decreased to 0.47, reflecting a higher number of false negatives.

###XGBoost

In [None]:
##XGBOOST
# Initialize the XGBoost classifier with a focus on class imbalance
xgb = XGBClassifier(scale_pos_weight=(y_train == 0).sum() / (y_train == 1).sum(), random_state=42)

# Train the model
xgb.fit(X_train, y_train)

# Predict on the test data
y_pred_xgb = xgb.predict(X_test)

# Evaluate the model
print("Classification Report for XGBoost:")
print(classification_report(y_test, y_pred_xgb))

print("Confusion Matrix for XGBoost:")
print(confusion_matrix(y_test, y_pred_xgb))

Classification Report for XGBoost:
              precision    recall  f1-score   support

           0       0.75      0.62      0.68      5225
           1       0.43      0.59      0.50      2581

    accuracy                           0.61      7806
   macro avg       0.59      0.61      0.59      7806
weighted avg       0.65      0.61      0.62      7806

Confusion Matrix for XGBoost:
[[3251 1974]
 [1063 1518]]


Recall Improvement for Class 1: The recall for the recidivism class (Class 1) is 0.59, indicating that the model is identifying a substantial portion of true positive cases.
Balanced Precision and Recall: The model shows a more balanced performance across both classes compared to previous models, with reasonable precision and recall for Class 1.
Moderate Accuracy: Overall accuracy stands at 0.61, with a weighted average F1-score of 0.62, reflecting a balanced handling of both classes but with room for improvement, particularly in reducing false positives for Class 1.

In [None]:
#Adjusting Class Weights in XGBoost
# Adjust scale_pos_weight to handle class imbalance
xgb = XGBClassifier(scale_pos_weight=(y_train == 0).sum() / (y_train == 1).sum(), random_state=42)

# Train the model
xgb.fit(X_train, y_train)

# Predict probabilities
y_proba_xgb = xgb.predict_proba(X_test)

# Adjust threshold for class 1
threshold = 0.3  # Lower threshold for higher recall
y_pred_xgb_threshold = (y_proba_xgb[:, 1] >= threshold).astype(int)

# Evaluate the model with adjusted threshold
print("Classification Report with Threshold Adjustment for XGBoost:")
print(classification_report(y_test, y_pred_xgb_threshold))

print("Confusion Matrix with Threshold Adjustment for XGBoost:")
print(confusion_matrix(y_test, y_pred_xgb_threshold))

Classification Report with Threshold Adjustment for XGBoost:
              precision    recall  f1-score   support

           0       0.82      0.23      0.35      5225
           1       0.37      0.90      0.52      2581

    accuracy                           0.45      7806
   macro avg       0.59      0.56      0.44      7806
weighted avg       0.67      0.45      0.41      7806

Confusion Matrix with Threshold Adjustment for XGBoost:
[[1180 4045]
 [ 254 2327]]


Significant Increase in Recall for Class 1: The recall for the recidivism class (Class 1) has substantially increased to 0.90, indicating that the model is identifying a large proportion of true positive cases.
Drop in Precision for Class 1: Precision for Class 1 has decreased to 0.37, reflecting an increase in false positives, which is a typical trade-off when focusing on recall.
Impact on Class 0: The recall for Class 0 (non-recidivism) has dropped significantly to 0.23, resulting in many false negatives. This is reflected in the confusion matrix, where a large number of Class 0 cases are misclassified as Class 1.

###Light GBM

In [None]:
##Light GBM
# Sanitize feature names
X_train.columns = [re.sub('[^A-Za-z0-9_]+', '', col) for col in X_train.columns]
X_test.columns = [re.sub('[^A-Za-z0-9_]+', '', col) for col in X_test.columns]

# Initialize the LightGBM classifier with class weight adjustment
lgb_clf = lgb.LGBMClassifier(class_weight='balanced', random_state=42)

# Train the model
lgb_clf.fit(X_train, y_train)

# Predict on the test data
y_pred_lgb = lgb_clf.predict(X_test)

# Evaluate the model
print("Classification Report for LightGBM:")
print(classification_report(y_test, y_pred_lgb))

print("Confusion Matrix for LightGBM:")
print(confusion_matrix(y_test, y_pred_lgb))

[LightGBM] [Info] Number of positive: 6100, number of negative: 12114
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001705 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 93
[LightGBM] [Info] Number of data points in the train set: 18214, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000
Classification Report for LightGBM:
              precision    recall  f1-score   support

           0       0.77      0.60      0.67      5225
           1       0.44      0.63      0.51      2581

    accuracy                           0.61      7806
   macro avg       0.60      0.61      0.59      7806
weighted avg       0.66      0.61      0.62      7806

Confusion Matrix for LightGBM:
[[3141 2084]
 [ 964 1617]]


Improved Recall for Class 1: The recall for the recidivism class (Class 1) has improved to 0.63, indicating that the model is successfully identifying a larger proportion of true positive cases.
Balanced Precision and Recall: The precision and recall for both classes are relatively balanced, though the precision for Class 1 is still lower, suggesting there are a significant number of false positives.
Overall Performance: The model demonstrates a reasonable balance between precision and recall, with a macro average F1-score of 0.59.

In [None]:
#class weight adjustment
# Adjust class weights
class_weights = {0: 1, 1: 2}  # Example weights, adjust as needed
lgb_clf = lgb.LGBMClassifier(class_weight=class_weights, random_state=42)

# Train the model
lgb_clf.fit(X_train, y_train)

# Predict on the test data
y_pred_lgb = lgb_clf.predict(X_test)

# Evaluate the model
print("Classification Report for LightGBM with Class Weight Adjustment:")
print(classification_report(y_test, y_pred_lgb))

print("Confusion Matrix for LightGBM with Class Weight Adjustment:")
print(confusion_matrix(y_test, y_pred_lgb))

[LightGBM] [Info] Number of positive: 6100, number of negative: 12114
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001582 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 93
[LightGBM] [Info] Number of data points in the train set: 18214, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.501769 -> initscore=0.007074
[LightGBM] [Info] Start training from score 0.007074
Classification Report for LightGBM with Class Weight Adjustment:
              precision    recall  f1-score   support

           0       0.77      0.60      0.67      5225
           1       0.44      0.63      0.52      2581

    accuracy                           0.61      7806
   macro avg       0.60      0.62      0.60      7806
weighted avg       0.66      0.61      0.62      7806

Confusion Matrix for LightGBM with Class Weight Adjustm

Improved Recall for Class 1: The recall for the recidivism class (Class 1) remains strong at 0.63, indicating that the model effectively identifies true positive cases.
Balanced Metrics: The precision and recall for both classes are relatively balanced. However, the precision for Class 1 is still somewhat low at 0.44, suggesting that false positives are present.
Overall Performance: The model's overall performance, with an accuracy of 0.61 and macro average metrics around 0.60, shows a reasonable balance between the two classes

###Staking Ensemble

In [None]:
##Implementing Stacking
# Define base models
base_models = [
    ('rf', RandomForestClassifier(n_estimators=100, random_state=42)),
    ('gb', GradientBoostingClassifier(n_estimators=100, random_state=42)),
    ('xgb', XGBClassifier(n_estimators=100, random_state=42)),
    ('lgb', LGBMClassifier(n_estimators=100, random_state=42))
]

# Define meta-model
meta_model = LogisticRegression()

# Create stacking ensemble
stacking_clf = StackingClassifier(estimators=base_models, final_estimator=meta_model, cv=5, n_jobs=-1)

# Train the stacking ensemble
stacking_clf.fit(X_train, y_train)

# Predict on the test set
y_pred_stacking = stacking_clf.predict(X_test)

# Evaluate the stacking ensemble
print("Classification Report for Stacking Ensemble:")
print(classification_report(y_test, y_pred_stacking))

print("Confusion Matrix for Stacking Ensemble:")
print(confusion_matrix(y_test, y_pred_stacking))

Classification Report for Stacking Ensemble:
              precision    recall  f1-score   support

           0       0.69      0.93      0.80      5225
           1       0.55      0.17      0.26      2581

    accuracy                           0.68      7806
   macro avg       0.62      0.55      0.53      7806
weighted avg       0.65      0.68      0.62      7806

Confusion Matrix for Stacking Ensemble:
[[4859  366]
 [2135  446]]


High Recall for Class 0: The model performs very well in identifying non-recidivism cases (Class 0), with a recall of 0.93, meaning most non-recidivism cases are correctly classified.
Low Recall for Class 1: The recall for the recidivism class (Class 1) is low at 0.17, indicating that the model is not identifying a sufficient number of true positive recidivism cases.
Overall Performance: The ensemble model has an accuracy of 0.68 and shows a relatively high weighted average precision and recall for Class 0 but struggles with Class 1.

In [None]:
# Define base models with class weight adjustment
base_models = [
    ('rf', RandomForestClassifier(n_estimators=100, class_weight={0: 1, 1: 2}, random_state=42)),
    ('gb', GradientBoostingClassifier(n_estimators=100, random_state=42)),
    ('xgb', XGBClassifier(n_estimators=100, scale_pos_weight=(y_train == 0).sum() / (y_train == 1).sum(), random_state=42)),
    ('lgb', LGBMClassifier(n_estimators=100, class_weight={0: 1, 1: 2}, random_state=42))
]

# Define meta-model
meta_model = LogisticRegression()

# Create stacking ensemble with adjusted class weights
stacking_clf = StackingClassifier(estimators=base_models, final_estimator=meta_model, cv=5, n_jobs=-1)

# Train the stacking ensemble
stacking_clf.fit(X_train, y_train)

# Predict on the test set
y_pred_stacking = stacking_clf.predict(X_test)

# Evaluate the stacking ensemble
print("Classification Report for Stacking Ensemble with Class Weight Adjustment:")
print(classification_report(y_test, y_pred_stacking))

print("Confusion Matrix for Stacking Ensemble with Class Weight Adjustment:")
print(confusion_matrix(y_test, y_pred_stacking))

Classification Report for Stacking Ensemble with Class Weight Adjustment:
              precision    recall  f1-score   support

           0       0.69      0.93      0.80      5225
           1       0.55      0.17      0.26      2581

    accuracy                           0.68      7806
   macro avg       0.62      0.55      0.53      7806
weighted avg       0.65      0.68      0.62      7806

Confusion Matrix for Stacking Ensemble with Class Weight Adjustment:
[[4873  352]
 [2152  429]]


High Recall for Class 0: The model continues to perform very well in identifying non-recidivism cases, with a recall of 0.93.
Low Recall for Class 1: Despite the class weight adjustment, the recall for recidivism cases remains low at 0.17, indicating that the model still struggles to identify a sufficient number of true positive cases in Class 1.
Precision for Class 1: The precision for Class 1 is slightly better than previous runs, but the overall improvement is limited, which suggests that further adjustments might be necessary.

**Summary of Key Models and Metrics:**

**Gradient Boosting with Class Weight Adjustment:**
Recall for Class 1: 0.63
Precision for Class 1: 0.44
Overall Accuracy: 0.61

**Random Forest with SMOTE:**
Recall for Class 1: 0.56
Precision for Class 1: 0.44
Overall Accuracy: 0.62

**XGBoost with Threshold Adjustment:**
Recall for Class 1: 0.90 (with a lower threshold)
Precision for Class 1: 0.37
Overall Accuracy: 0.45

**LightGBM with Class Weight Adjustment:**
Recall for Class 1: 0.63
Precision for Class 1: 0.44
Overall Accuracy: 0.61

**Stacking Ensemble with Class Weight Adjustment:**
Recall for Class 1: 0.17
Precision for Class 1: 0.55
Overall Accuracy: 0.68

XGBoost with Threshold Adjustment had the highest recall for Class 1 (0.90), which is critical for identifying as many recidivism cases as possible. However, this came with a trade-off in precision and overall accuracy.
Gradient Boosting and LightGBM with Class Weight Adjustment provided a more balanced approach with decent recall and precision for Class 1, along with better overall accuracy.
Stacking Ensemble showed the highest accuracy overall but struggled with recall for Class 1.

The choice depends on the specific priorities of the project:

Maximize Recall: Choose XGBoost with Threshold Adjustment.
Balance Recall and Precision: Choose Gradient Boosting or LightGBM with Class Weight Adjustment.

###Neural Networks

In [None]:
##Neural Network (MLP)
# Apply SMOTE to the training data
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Define the MLPClassifier
mlp_clf = MLPClassifier(hidden_layer_sizes=(100, 50), max_iter=500, random_state=42)

# Train the neural network model
mlp_clf.fit(X_train_resampled, y_train_resampled)

# Predict on the test data
y_pred_mlp = mlp_clf.predict(X_test)

# Evaluate the neural network model
print("Classification Report for MLP Classifier with SMOTE:")
print(classification_report(y_test, y_pred_mlp))

print("Confusion Matrix for MLP Classifier with SMOTE:")
print(confusion_matrix(y_test, y_pred_mlp))

Classification Report for MLP Classifier with SMOTE:
              precision    recall  f1-score   support

           0       0.72      0.69      0.70      5225
           1       0.41      0.45      0.43      2581

    accuracy                           0.61      7806
   macro avg       0.56      0.57      0.57      7806
weighted avg       0.62      0.61      0.61      7806

Confusion Matrix for MLP Classifier with SMOTE:
[[3593 1632]
 [1431 1150]]


Balanced Performance: The model shows a relatively balanced performance between Class 0 and Class 1, with both classes having moderate precision and recall.
Improvement in Recall for Class 1: The recall for the recidivism class (Class 1) is 0.45, indicating that the model is capturing more true positives than previous runs without SMOTE.
Precision and Recall Trade-off: The precision for Class 1 is 0.41, which indicates a number of false positives. This trade-off is common when using techniques like SMOTE to increase recall.

In [None]:
# Define the parameter grid
param_grid = {
    'hidden_layer_sizes': [(100,), (100, 50), (50, 50, 50)],
    'activation': ['relu', 'tanh'],
    'solver': ['adam', 'sgd'],
    'alpha': [0.0001, 0.001, 0.01],
    'learning_rate_init': [0.001, 0.01, 0.1],
    'max_iter': [200, 500]
}

# Initialize the MLPClassifier
mlp_clf = MLPClassifier(random_state=42)

# GridSearchCV for hyperparameter tuning
grid_search = GridSearchCV(estimator=mlp_clf, param_grid=param_grid, cv=3, n_jobs=-1, scoring='accuracy')
grid_search.fit(X_train_resampled, y_train_resampled)

# Best parameters and best score
best_params = grid_search.best_params_
best_score = grid_search.best_score_

print("Best Parameters:", best_params)
print("Best Score:", best_score)

# Train the best MLP model
best_mlp_clf = grid_search.best_estimator_
best_mlp_clf.fit(X_train_resampled, y_train_resampled)

# Predict on the test data
y_pred_best_mlp = best_mlp_clf.predict(X_test)

# Evaluate the model
print("Classification Report for Best MLP Classifier:")
print(classification_report(y_test, y_pred_best_mlp))

print("Confusion Matrix for Best MLP Classifier:")
print(confusion_matrix(y_test, y_pred_best_mlp))



Best Parameters: {'activation': 'tanh', 'alpha': 0.001, 'hidden_layer_sizes': (100, 50), 'learning_rate_init': 0.001, 'max_iter': 500, 'solver': 'adam'}
Best Score: 0.6419432062076936
Classification Report for Best MLP Classifier:
              precision    recall  f1-score   support

           0       0.72      0.63      0.67      5225
           1       0.40      0.49      0.44      2581

    accuracy                           0.59      7806
   macro avg       0.56      0.56      0.56      7806
weighted avg       0.61      0.59      0.60      7806

Confusion Matrix for Best MLP Classifier:
[[3303 1922]
 [1310 1271]]




Best Hyperparameters:
Activation: 'tanh'
Alpha (Regularization): 0.001
Hidden Layer Sizes: (100, 50)
Learning Rate Init: 0.001
Max Iterations: 500
Solver: 'adam'

Recall for Class 1: The recall for Class 1 (recidivism) improved to 0.49, indicating that the model captures more true positive cases than before.
Precision and Recall Balance: The precision for Class 1 is 0.40, showing a trade-off between capturing more true positives and increasing false positives.
Overall Performance: The model shows balanced metrics across both classes, with an accuracy of 0.59.

In [None]:
# Define a pipeline to standardize the data and train the model
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('mlp', MLPClassifier(
        hidden_layer_sizes=(128, 64, 32),  # Deeper network with more layers
        activation='relu',                 # ReLU activation function
        solver='adam',                     # Adam optimizer
        alpha=0.001,                       # Regularization term
        learning_rate_init=0.001,          # Initial learning rate
        max_iter=500,                      # Maximum number of iterations
        random_state=42,
        early_stopping=True,               # Use early stopping to prevent overfitting
        validation_fraction=0.1,           # Fraction of training data for validation
        n_iter_no_change=10                # Early stopping patience
    ))
])

# Train the neural network model
pipeline.fit(X_train_resampled, y_train_resampled)

# Predict on the test data
y_pred_deep_mlp = pipeline.predict(X_test)

# Evaluate the neural network model
print("Classification Report for Deep MLP Classifier:")
print(classification_report(y_test, y_pred_deep_mlp))

print("Confusion Matrix for Deep MLP Classifier:")
print(confusion_matrix(y_test, y_pred_deep_mlp))

Classification Report for Deep MLP Classifier:
              precision    recall  f1-score   support

           0       0.75      0.51      0.61      5225
           1       0.40      0.66      0.50      2581

    accuracy                           0.56      7806
   macro avg       0.58      0.59      0.55      7806
weighted avg       0.64      0.56      0.57      7806

Confusion Matrix for Deep MLP Classifier:
[[2654 2571]
 [ 867 1714]]


Improved Recall for Class 1: The recall for Class 1 (recidivism) improved significantly to 0.66, indicating that the deeper network is more effective at identifying true positive cases.
Trade-off in Precision: The precision for Class 1 is 0.40, indicating that there are still a number of false positives. This trade-off is common when focusing on improving recall.
Moderate Performance for Class 0: The recall for Class 0 is 0.51, suggesting that the model misses a considerable number of non-recidivism cases.