# 📌 Phase 3: Model Training for Naxalite Hideout Prediction

In this notebook, we'll prepare the data and train a classification model to predict likely Naxalite hideout locations using the engineered features.

In [1]:
# ✅ Step 1: Load Feature-Enhanced Dataset
import pandas as pd

df = pd.read_csv("../Data/naxal_hideouts_features.csv")
df['label'] = 1  # Mark real hideouts as positive samples
df.head()


Unnamed: 0,name,latitude,longitude,date_range,elevation,distance_to_village,label
0,Vashudev rao@takkapali satish,19.12562,81.14555,2025-06-01_to_2025-06-10,479.0,32.243497,1
1,Ranita@Jaymati,19.38255,80.88472,2025-06-01_to_2025-06-10,475.0,7.747186,1
2,Ranita@Jaymati,19.37888,80.96871,2025-06-01_to_2025-06-10,788.0,3.752889,1
3,Ranita@Jaymati,19.42753,80.86589,2025-06-01_to_2025-06-10,365.0,12.317445,1
4,Vashudev rao@takkapali satish,19.1492,81.16334,2025-06-11_to_2025-06-20,553.0,31.57748,1


## ✅ Step 2: Generate Negative Samples
These are random nearby points that we assume are **not** hideouts.

In [2]:
import numpy as np

def generate_negative_samples(df, n_samples=100):
    min_lat, max_lat = df['latitude'].min(), df['latitude'].max()
    min_lon, max_lon = df['longitude'].min(), df['longitude'].max()

    samples = []
    for _ in range(n_samples):
        lat = np.random.uniform(min_lat, max_lat)
        lon = np.random.uniform(min_lon, max_lon)
        samples.append([lat, lon])
    return pd.DataFrame(samples, columns=['latitude', 'longitude'])

neg_df = generate_negative_samples(df, n_samples=len(df))
neg_df['elevation'] = df['elevation'].sample(frac=1).values  # Optional: sample elevation
neg_df['distance_to_village'] = df['distance_to_village'].sample(frac=1).values
neg_df['label'] = 0
neg_df.head()

Unnamed: 0,latitude,longitude,elevation,distance_to_village,label
0,19.506445,81.091492,465.0,5.923071,0
1,19.166057,80.752589,455.0,50.167818,0
2,19.371264,81.088659,620.0,12.317445,0
3,19.230677,80.865574,365.0,51.708635,0
4,19.393032,80.727253,479.0,51.553516,0


## ✅ Step 3: Combine and Shuffle the Dataset

In [3]:
combined_df = pd.concat([df, neg_df], ignore_index=True).sample(frac=1).reset_index(drop=True)
combined_df.head()

Unnamed: 0,name,latitude,longitude,date_range,elevation,distance_to_village,label
0,,19.761459,81.102674,,671.0,8.204942,0
1,,19.510531,80.719485,,380.0,50.036673,0
2,"Bhaskar@ rajmohan,rajman",19.80027,80.99176,2025-06-30_to_2025-07-08,671.0,50.036673,1
3,Ranita@Jaymati,19.375,80.87648,2025-06-30_to_2025-07-08,455.0,8.204942,1
4,Vashudev rao@takkapali satish,19.12562,81.14555,2025-06-01_to_2025-06-10,479.0,32.243497,1


## ✅ Step 4: Train-Test Split

In [4]:
from sklearn.model_selection import train_test_split

features = ['latitude', 'longitude', 'elevation', 'distance_to_village']
X = combined_df[features]
y = combined_df['label']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## ✅ Step 5: Train Random Forest Model

In [5]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

model = RandomForestClassifier()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

Accuracy: 0.8333333333333334
[[4 0]
 [1 1]]
              precision    recall  f1-score   support

           0       0.80      1.00      0.89         4
           1       1.00      0.50      0.67         2

    accuracy                           0.83         6
   macro avg       0.90      0.75      0.78         6
weighted avg       0.87      0.83      0.81         6



## ✅ Step 6: Save the Trained Model

In [6]:
import joblib

joblib.dump(model, "../Models/rf_model.pkl")
print("✅ Model saved to ../Models/rf_model.pkl")

✅ Model saved to ../Models/rf_model.pkl
