In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline



In [None]:
# ✅ Load the train / validation / test splits from their CSV files
train_path = "/content/trainset.csv"
val_path = "/content/valset.csv"
test_path = "/content/testset.csv"

# Read the three files in path order; the unpacking order matches it.
trainset, valset, testset = (
    pd.read_csv(csv_path) for csv_path in (train_path, val_path, test_path)
)



In [None]:
# Preview the first rows of the training split to check the loaded columns
# (cell_id, date, crime).
trainset.head()


Unnamed: 0,cell_id,date,crime
0,-140129148,2019-01-02,0
1,-140129148,2019-01-03,0
2,-140129148,2019-01-04,0
3,-140129148,2019-01-05,0
4,-140129148,2019-01-06,0


In [None]:
# Summarize row count, per-column non-null counts, dtypes and memory usage.
# Note that cell_id and date are both loaded with object dtype at this point.
trainset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1376780 entries, 0 to 1376779
Data columns (total 3 columns):
 #   Column   Non-Null Count    Dtype 
---  ------   --------------    ----- 
 0   cell_id  1376780 non-null  object
 1   date     1376780 non-null  object
 2   crime    1376780 non-null  int64 
dtypes: int64(1), object(2)
memory usage: 31.5+ MB


In [None]:
# ⏱ Parse the date column and derive calendar features on every split.
# Each frame is updated in place, so later cells see the new columns.
for df in (trainset, valset, testset):
    parsed_dates = pd.to_datetime(df['date'])
    df['date'] = parsed_dates
    df['dayofweek'] = parsed_dates.dt.dayofweek  # Monday=0 ... Sunday=6
    df['month'] = parsed_dates.dt.month          # 1 (January) ... 12 (December)

# 🎯 Features and label
features = ['cell_id', 'dayofweek', 'month']
target = 'crime'

# The validation and test splits are used in full.
X_val, y_val = valset[features], valset[target]
X_test, y_test = testset[features], testset[target]

# The model is fitted on a fixed-seed random subsample of the training split
# rather than on all of it (200,000 rows is ~15% of the 1,376,780 rows reported
# by trainset.info()) -- presumably to keep fitting time manageable.
# The requested size is capped at the number of available rows because
# DataFrame.sample raises ValueError when n > len(frame) without replacement.
TRAIN_SAMPLE_SIZE = 200000
train_sample = trainset.sample(
    n=min(TRAIN_SAMPLE_SIZE, len(trainset)),
    random_state=42,
)
X_train_sample = train_sample[features]
y_train_sample = train_sample[target]


In [None]:
# 🧼 Preprocessing
# - cell_id is categorical: one-hot encode it. handle_unknown='ignore' makes a
#   cell id that was not seen during fit encode as all zeros instead of raising.
# - dayofweek / month are numeric: standardize them.
preprocessor = ColumnTransformer([
    ('cell', OneHotEncoder(handle_unknown='ignore'), ['cell_id']),
    ('date', StandardScaler(), ['dayofweek', 'month'])
])

# 🌲 Random Forest Pipeline
# - class_weight='balanced' weights classes inversely to their frequency.
# - random_state pins the forest so reruns are reproducible.
# - n_jobs=-1 grows the trees on all available CPU cores; for a fixed
#   random_state this only affects fit/predict speed, not which trees are grown.
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', RandomForestClassifier(
        n_estimators=100,
        class_weight='balanced',
        random_state=42,
        n_jobs=-1,
    ))
])


In [None]:
# 🧠 Fit the preprocessing + random-forest pipeline on the sampled training rows
pipeline.fit(X=X_train_sample, y=y_train_sample)




In [None]:
# ✅ Predict probabilities for val and test
# predict_proba returns one column per label in pipeline.classes_. Look up the
# column that belongs to label 1 instead of hard-coding position 1, which only
# corresponds to label 1 when label 0 is also among the fitted classes.
# Assumes the positive ("crime occurred") label is 1 -- TODO confirm that the
# crime column is binary 0/1 rather than a count.
positive_col = list(pipeline.classes_).index(1)
valset['predicted_proba'] = pipeline.predict_proba(X_val)[:, positive_col]
testset['predicted_proba'] = pipeline.predict_proba(X_test)[:, positive_col]
