# ML
## Loading Data

In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# Load and preprocess
block_df = pd.read_csv('../data/Census_Block_Groups_20250419.csv')
crime_df = pd.read_csv('../data/Crime_Reports_20250419.csv')  # Replace with your actual file path
crime_df = crime_df.dropna(subset=['Census Block Group']).copy()


## ML Model Implementation

### Random Forest

In [24]:
from sklearn.model_selection import train_test_split
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier

# Your existing preprocessing code (keep this)

le_offense = LabelEncoder()
le_block = LabelEncoder()
crime_df['OffenseCode'] = le_offense.fit_transform(crime_df['Highest Offense Description'])
crime_df['BlockGroupEncoded'] = le_block.fit_transform(crime_df['Census Block Group'].astype(str))

# --- NEW: Split data before aggregating ---
# Split at individual crime level (before grouping)
train_df, test_df = train_test_split(
    crime_df, 
    test_size=0.2, 
    random_state=42,
    stratify=crime_df['OffenseCode']  # Preserves class balance
)

# Aggregate training data only (to avoid data leakage)
grouped_train = train_df.groupby(['BlockGroupEncoded', 'Location Type']).agg(
    {'OffenseCode': lambda x: x.mode()[0]}
).reset_index()

# Prepare features (same as before)
X_train = pd.get_dummies(grouped_train[['BlockGroupEncoded', 'Location Type']])
y_train = grouped_train['OffenseCode']

# Train model (replace with your chosen classifier)
model = RandomForestClassifier(n_estimators=50, random_state=42)
model.fit(X_train, y_train)

# --- Evaluate on test data ---
# Aggregate test data same way
grouped_test = test_df.groupby(['BlockGroupEncoded', 'Location Type']).agg(
    {'OffenseCode': lambda x: x.mode()[0]}
).reset_index()

X_test = pd.get_dummies(grouped_test[['BlockGroupEncoded', 'Location Type']])
y_test = grouped_test['OffenseCode']

# Generate predictions
y_pred = model.predict(X_test)

# Classification report
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred, target_names=le_offense.classes_))

                                precision    recall  f1-score   support

ASSAULT W/INJURY-FAM/DATE VIOL       0.09      0.09      0.09       204
           ASSAULT WITH INJURY       0.27      0.27      0.27       498
                    AUTO THEFT       0.32      0.30      0.31       268
        BURGLARY NON RESIDENCE       0.39      0.39      0.39      1043
         BURGLARY OF RESIDENCE       0.34      0.20      0.25        99
           BURGLARY OF VEHICLE       0.62      0.63      0.63      1642
             CRIMINAL MISCHIEF       0.20      0.15      0.17       719
             CRIMINAL TRESPASS       0.32      0.24      0.28       364
   CUSTODY ARREST TRAFFIC WARR       0.36      0.24      0.29       107
           DISTURBANCE - OTHER       0.11      0.08      0.09       263
                           DWI       0.69      0.85      0.76       317
            FAMILY DISTURBANCE       0.76      0.70      0.73       615
                    HARASSMENT       0.34      0.22      0.27  

In [25]:
# Get predictions for all block groups
predictions = []
all_block_groups = crime_df['Census Block Group'].unique()

for block_group in all_block_groups:
    try:
        # Create proper one-hot encoded row
        block_enc = le_block.transform([str(block_group)])[0]
        
        # Get all location types for this block group
        location_types = crime_df[crime_df['Census Block Group'] == block_group]['Location Type'].unique()
        
        # Default to most common location if none found
        location = location_types[0] if len(location_types) > 0 else crime_df['Location Type'].mode()[0]
        
        # Create proper one-hot encoded input
        input_data = pd.DataFrame({
            'BlockGroupEncoded': [block_enc],
            'Location Type': [location]
        })
        input_data = pd.get_dummies(input_data).reindex(columns=X.columns, fill_value=0)
        
        # Get predictions
        probas = model.predict_proba(input_data)[0]
        top3 = probas.argsort()[-3:][::-1]
        
        predictions.append({
            'Census Block Group': block_group,
            'Top1_Offense': le_offense.inverse_transform([top3[0]])[0],
            'Top1_Probability': round(probas[top3[0]], 3),
            'Top2_Offense': le_offense.inverse_transform([top3[1]])[0],
            'Top2_Probability': round(probas[top3[1]], 3),
            'Top3_Offense': le_offense.inverse_transform([top3[2]])[0],
            'Top3_Probability': round(probas[top3[2]], 3)
        })
        
    except Exception as e:
        print(f"Error predicting for block group {block_group}: {str(e)}")
        continue

# Create final dataframe
result_df = pd.DataFrame(predictions)

In [26]:
result_df.head()


Unnamed: 0,Census Block Group,Top1_Offense,Top1_Probability,Top2_Offense,Top2_Probability,Top3_Offense,Top3_Probability
0,4530341000.0,FAMILY DISTURBANCE,1.0,WARRANT ARREST NON TRAFFIC,0.0,CUSTODY ARREST TRAFFIC WARR,0.0
1,4530024000.0,THEFT,0.72,BURGLARY NON RESIDENCE,0.26,HARASSMENT,0.02
2,4530011000.0,THEFT,1.0,WARRANT ARREST NON TRAFFIC,0.0,CUSTODY ARREST TRAFFIC WARR,0.0
3,4530011000.0,PUBLIC INTOXICATION,0.48,BURGLARY OF VEHICLE,0.42,DWI,0.1
4,4530325000.0,DWI,0.98,BURGLARY OF VEHICLE,0.02,WARRANT ARREST NON TRAFFIC,0.0


In [27]:
result_df.to_csv('../results/crime_rf_predictions.csv', index=False)

## LGBM

In [29]:
from sklearn.model_selection import train_test_split
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from lightgbm import LGBMClassifier

# Your existing preprocessing code (keep this)

le_offense = LabelEncoder()
le_block = LabelEncoder()
crime_df['OffenseCode'] = le_offense.fit_transform(crime_df['Highest Offense Description'])
crime_df['BlockGroupEncoded'] = le_block.fit_transform(crime_df['Census Block Group'].astype(str))

# --- NEW: Split data before aggregating ---
# Split at individual crime level (before grouping)
train_df, test_df = train_test_split(
    crime_df, 
    test_size=0.2, 
    random_state=42,
    stratify=crime_df['OffenseCode']  # Preserves class balance
)

# Aggregate training data only (to avoid data leakage)
grouped_train = train_df.groupby(['BlockGroupEncoded', 'Location Type']).agg(
    {'OffenseCode': lambda x: x.mode()[0]}
).reset_index()

# Prepare features (same as before)
X_train = pd.get_dummies(grouped_train[['BlockGroupEncoded', 'Location Type']])
y_train = grouped_train['OffenseCode']

# Train model (replace with your chosen classifier)
model = LGBMClassifier(random_state=42)
model.fit(X_train, y_train)

# --- Evaluate on test data ---
# Aggregate test data same way
grouped_test = test_df.groupby(['BlockGroupEncoded', 'Location Type']).agg(
    {'OffenseCode': lambda x: x.mode()[0]}
).reset_index()

X_test = pd.get_dummies(grouped_test[['BlockGroupEncoded', 'Location Type']])
y_test = grouped_test['OffenseCode']

# Generate predictions
y_pred = model.predict(X_test)

# Classification report
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred, target_names=le_offense.classes_))

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000117 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 341
[LightGBM] [Info] Number of data points in the train set: 13967, number of used features: 44
[LightGBM] [Info] Start training from score -3.550491
[LightGBM] [Info] Start training from score -2.813435
[LightGBM] [Info] Start training from score -3.471408
[LightGBM] [Info] Start training from score -2.219304
[LightGBM] [Info] Start training from score -4.639178
[LightGBM] [Info] Start training from score -1.852796
[LightGBM] [Info] Start training from score -2.662015
[LightGBM] [Info] Start training from score -3.455408
[LightGBM] [Info] Start training from score -4.799521
[LightGBM] [Info] Start training from score -3.580873
[LightGBM] [Info] Start training from score -3.497081
[LightGBM] [Info] Start training from score -2.947307


In [31]:
# Get predictions for all block groups
predictions = []
all_block_groups = crime_df['Census Block Group'].unique()

for block_group in all_block_groups:
    try:
        # Create proper one-hot encoded row
        block_enc = le_block.transform([str(block_group)])[0]
        
        # Get all location types for this block group
        location_types = crime_df[crime_df['Census Block Group'] == block_group]['Location Type'].unique()
        
        # Default to most common location if none found
        location = location_types[0] if len(location_types) > 0 else crime_df['Location Type'].mode()[0]
        
        # Create proper one-hot encoded input
        input_data = pd.DataFrame({
            'BlockGroupEncoded': [block_enc],
            'Location Type': [location]
        })
        input_data = pd.get_dummies(input_data).reindex(columns=X.columns, fill_value=0)
        
        # Get predictions
        probas = model.predict_proba(input_data)[0]
        top3 = probas.argsort()[-3:][::-1]
        
        predictions.append({
            'Census Block Group': block_group,
            'Top1_Offense': le_offense.inverse_transform([top3[0]])[0],
            'Top1_Probability': round(probas[top3[0]], 3),
            'Top2_Offense': le_offense.inverse_transform([top3[1]])[0],
            'Top2_Probability': round(probas[top3[1]], 3),
            'Top3_Offense': le_offense.inverse_transform([top3[2]])[0],
            'Top3_Probability': round(probas[top3[2]], 3)
        })
        
    except Exception as e:
        print(f"Error predicting for block group {block_group}: {str(e)}")
        continue

# Create final dataframe
result_df = pd.DataFrame(predictions)
result_df.head()

Unnamed: 0,Census Block Group,Top1_Offense,Top1_Probability,Top2_Offense,Top2_Probability,Top3_Offense,Top3_Probability
0,4530341000.0,FAMILY DISTURBANCE,0.645,ASSAULT W/INJURY-FAM/DATE VIOL,0.102,THEFT,0.085
1,4530024000.0,THEFT,0.623,BURGLARY NON RESIDENCE,0.241,BURGLARY OF VEHICLE,0.06
2,4530011000.0,THEFT,0.715,ASSAULT WITH INJURY,0.098,PUBLIC INTOXICATION,0.052
3,4530011000.0,PUBLIC INTOXICATION,0.307,DWI,0.295,BURGLARY OF VEHICLE,0.291
4,4530325000.0,DWI,0.78,BURGLARY OF VEHICLE,0.179,CRIMINAL MISCHIEF,0.015


In [32]:
result_df.to_csv('../results/crime_lgbm_predictions.csv', index=False)

## XGBoost

In [33]:
from sklearn.model_selection import train_test_split
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier

# Your existing preprocessing code (keep this)

le_offense = LabelEncoder()
le_block = LabelEncoder()
crime_df['OffenseCode'] = le_offense.fit_transform(crime_df['Highest Offense Description'])
crime_df['BlockGroupEncoded'] = le_block.fit_transform(crime_df['Census Block Group'].astype(str))

# --- NEW: Split data before aggregating ---
# Split at individual crime level (before grouping)
train_df, test_df = train_test_split(
    crime_df, 
    test_size=0.2, 
    random_state=42,
    stratify=crime_df['OffenseCode']  # Preserves class balance
)

# Aggregate training data only (to avoid data leakage)
grouped_train = train_df.groupby(['BlockGroupEncoded', 'Location Type']).agg(
    {'OffenseCode': lambda x: x.mode()[0]}
).reset_index()

# Prepare features (same as before)
X_train = pd.get_dummies(grouped_train[['BlockGroupEncoded', 'Location Type']])
y_train = grouped_train['OffenseCode']

# Train model (replace with your chosen classifier)
model = XGBClassifier(random_state=42)
model.fit(X_train, y_train)

# --- Evaluate on test data ---
# Aggregate test data same way
grouped_test = test_df.groupby(['BlockGroupEncoded', 'Location Type']).agg(
    {'OffenseCode': lambda x: x.mode()[0]}
).reset_index()

X_test = pd.get_dummies(grouped_test[['BlockGroupEncoded', 'Location Type']])
y_test = grouped_test['OffenseCode']

# Generate predictions
y_pred = model.predict(X_test)

# Classification report
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred, target_names=le_offense.classes_))

                                precision    recall  f1-score   support

ASSAULT W/INJURY-FAM/DATE VIOL       0.14      0.06      0.08       204
           ASSAULT WITH INJURY       0.24      0.28      0.26       498
                    AUTO THEFT       0.46      0.22      0.30       268
        BURGLARY NON RESIDENCE       0.37      0.30      0.33      1043
         BURGLARY OF RESIDENCE       0.39      0.13      0.20        99
           BURGLARY OF VEHICLE       0.62      0.56      0.59      1642
             CRIMINAL MISCHIEF       0.17      0.05      0.07       719
             CRIMINAL TRESPASS       0.31      0.09      0.14       364
   CUSTODY ARREST TRAFFIC WARR       0.42      0.10      0.17       107
           DISTURBANCE - OTHER       0.17      0.03      0.05       263
                           DWI       0.51      0.81      0.63       317
            FAMILY DISTURBANCE       0.70      0.69      0.70       615
                    HARASSMENT       0.86      0.19      0.31  

In [34]:
# Get predictions for all block groups
predictions = []
all_block_groups = crime_df['Census Block Group'].unique()

for block_group in all_block_groups:
    try:
        # Create proper one-hot encoded row
        block_enc = le_block.transform([str(block_group)])[0]
        
        # Get all location types for this block group
        location_types = crime_df[crime_df['Census Block Group'] == block_group]['Location Type'].unique()
        
        # Default to most common location if none found
        location = location_types[0] if len(location_types) > 0 else crime_df['Location Type'].mode()[0]
        
        # Create proper one-hot encoded input
        input_data = pd.DataFrame({
            'BlockGroupEncoded': [block_enc],
            'Location Type': [location]
        })
        input_data = pd.get_dummies(input_data).reindex(columns=X.columns, fill_value=0)
        
        # Get predictions
        probas = model.predict_proba(input_data)[0]
        top3 = probas.argsort()[-3:][::-1]
        
        predictions.append({
            'Census Block Group': block_group,
            'Top1_Offense': le_offense.inverse_transform([top3[0]])[0],
            'Top1_Probability': round(probas[top3[0]], 3),
            'Top2_Offense': le_offense.inverse_transform([top3[1]])[0],
            'Top2_Probability': round(probas[top3[1]], 3),
            'Top3_Offense': le_offense.inverse_transform([top3[2]])[0],
            'Top3_Probability': round(probas[top3[2]], 3)
        })
        
    except Exception as e:
        print(f"Error predicting for block group {block_group}: {str(e)}")
        continue

# Create final dataframe
result_df = pd.DataFrame(predictions)
result_df.head()

Unnamed: 0,Census Block Group,Top1_Offense,Top1_Probability,Top2_Offense,Top2_Probability,Top3_Offense,Top3_Probability
0,4530341000.0,FAMILY DISTURBANCE,0.732,CRIMINAL MISCHIEF,0.072,BURGLARY OF VEHICLE,0.065
1,4530024000.0,THEFT,0.698,BURGLARY NON RESIDENCE,0.161,CRIMINAL MISCHIEF,0.053
2,4530011000.0,THEFT,0.73,ASSAULT WITH INJURY,0.067,PUBLIC INTOXICATION,0.048
3,4530011000.0,DWI,0.682,BURGLARY OF VEHICLE,0.144,PUBLIC INTOXICATION,0.095
4,4530325000.0,DWI,0.814,BURGLARY OF VEHICLE,0.156,CRIMINAL MISCHIEF,0.011


In [35]:
result_df.to_csv('../results/crime_xgboost_predictions.csv', index=False)

## Logistic Regression

In [36]:
from sklearn.model_selection import train_test_split
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression

# Your existing preprocessing code (keep this)

le_offense = LabelEncoder()
le_block = LabelEncoder()
crime_df['OffenseCode'] = le_offense.fit_transform(crime_df['Highest Offense Description'])
crime_df['BlockGroupEncoded'] = le_block.fit_transform(crime_df['Census Block Group'].astype(str))

# --- NEW: Split data before aggregating ---
# Split at individual crime level (before grouping)
train_df, test_df = train_test_split(
    crime_df, 
    test_size=0.2, 
    random_state=42,
    stratify=crime_df['OffenseCode']  # Preserves class balance
)

# Aggregate training data only (to avoid data leakage)
grouped_train = train_df.groupby(['BlockGroupEncoded', 'Location Type']).agg(
    {'OffenseCode': lambda x: x.mode()[0]}
).reset_index()

# Prepare features (same as before)
X_train = pd.get_dummies(grouped_train[['BlockGroupEncoded', 'Location Type']])
y_train = grouped_train['OffenseCode']

# Train model (replace with your chosen classifier)
model = LogisticRegression(max_iter=1000, random_state=42)
model.fit(X_train, y_train)

# --- Evaluate on test data ---
# Aggregate test data same way
grouped_test = test_df.groupby(['BlockGroupEncoded', 'Location Type']).agg(
    {'OffenseCode': lambda x: x.mode()[0]}
).reset_index()

X_test = pd.get_dummies(grouped_test[['BlockGroupEncoded', 'Location Type']])
y_test = grouped_test['OffenseCode']

# Generate predictions
y_pred = model.predict(X_test)

# Classification report
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred, target_names=le_offense.classes_))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


                                precision    recall  f1-score   support

ASSAULT W/INJURY-FAM/DATE VIOL       0.22      0.03      0.06       204
           ASSAULT WITH INJURY       0.21      0.38      0.27       498
                    AUTO THEFT       0.57      0.24      0.33       268
        BURGLARY NON RESIDENCE       0.36      0.35      0.35      1043
         BURGLARY OF RESIDENCE       0.00      0.00      0.00        99
           BURGLARY OF VEHICLE       0.48      0.46      0.47      1642
             CRIMINAL MISCHIEF       0.14      0.01      0.01       719
             CRIMINAL TRESPASS       0.19      0.03      0.06       364
   CUSTODY ARREST TRAFFIC WARR       0.00      0.00      0.00       107
           DISTURBANCE - OTHER       0.30      0.02      0.04       263
                           DWI       0.44      0.96      0.60       317
            FAMILY DISTURBANCE       0.61      0.71      0.65       615
                    HARASSMENT       0.96      0.18      0.30  

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [37]:
# Get predictions for all block groups
predictions = []
all_block_groups = crime_df['Census Block Group'].unique()

for block_group in all_block_groups:
    try:
        # Create proper one-hot encoded row
        block_enc = le_block.transform([str(block_group)])[0]
        
        # Get all location types for this block group
        location_types = crime_df[crime_df['Census Block Group'] == block_group]['Location Type'].unique()
        
        # Default to most common location if none found
        location = location_types[0] if len(location_types) > 0 else crime_df['Location Type'].mode()[0]
        
        # Create proper one-hot encoded input
        input_data = pd.DataFrame({
            'BlockGroupEncoded': [block_enc],
            'Location Type': [location]
        })
        input_data = pd.get_dummies(input_data).reindex(columns=X.columns, fill_value=0)
        
        # Get predictions
        probas = model.predict_proba(input_data)[0]
        top3 = probas.argsort()[-3:][::-1]
        
        predictions.append({
            'Census Block Group': block_group,
            'Top1_Offense': le_offense.inverse_transform([top3[0]])[0],
            'Top1_Probability': round(probas[top3[0]], 3),
            'Top2_Offense': le_offense.inverse_transform([top3[1]])[0],
            'Top2_Probability': round(probas[top3[1]], 3),
            'Top3_Offense': le_offense.inverse_transform([top3[2]])[0],
            'Top3_Probability': round(probas[top3[2]], 3)
        })
        
    except Exception as e:
        print(f"Error predicting for block group {block_group}: {str(e)}")
        continue

# Create final dataframe
result_df = pd.DataFrame(predictions)
result_df.head()

Unnamed: 0,Census Block Group,Top1_Offense,Top1_Probability,Top2_Offense,Top2_Probability,Top3_Offense,Top3_Probability
0,4530341000.0,FAMILY DISTURBANCE,0.623,BURGLARY OF VEHICLE,0.154,ASSAULT W/INJURY-FAM/DATE VIOL,0.056
1,4530024000.0,THEFT,0.682,BURGLARY NON RESIDENCE,0.121,CRIMINAL MISCHIEF,0.067
2,4530011000.0,ASSAULT WITH INJURY,0.367,THEFT,0.234,BURGLARY OF VEHICLE,0.098
3,4530011000.0,DWI,0.462,BURGLARY OF VEHICLE,0.386,CRIMINAL MISCHIEF,0.026
4,4530325000.0,DWI,0.56,BURGLARY OF VEHICLE,0.259,CUSTODY ARREST TRAFFIC WARR,0.041


In [38]:
result_df.to_csv('../results/crime_log_reg_predictions.csv', index=False)