In [5]:
pip install imbalanced-learn

Collecting imbalanced-learn
  Downloading imbalanced_learn-0.13.0-py3-none-any.whl.metadata (8.8 kB)
Collecting sklearn-compat<1,>=0.1 (from imbalanced-learn)
  Downloading sklearn_compat-0.1.3-py3-none-any.whl.metadata (18 kB)
Downloading imbalanced_learn-0.13.0-py3-none-any.whl (238 kB)
   ---------------------------------------- 0.0/238.4 kB ? eta -:--:--
   --- ----------------------------------- 20.5/238.4 kB 640.0 kB/s eta 0:00:01
   ------------- -------------------------- 81.9/238.4 kB 1.1 MB/s eta 0:00:01
   ---------------------------------------- 238.4/238.4 kB 2.1 MB/s eta 0:00:00
Downloading sklearn_compat-0.1.3-py3-none-any.whl (18 kB)
Installing collected packages: sklearn-compat, imbalanced-learn
Successfully installed imbalanced-learn-0.13.0 sklearn-compat-0.1.3
Note: you may need to restart the kernel to use updated packages.


In [9]:
import pandas as pd
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier
from imblearn.over_sampling import RandomOverSampler
from sklearn.model_selection import GridSearchCV

# Load dataset
df = pd.read_csv("hyderabad_zone_prediction_large.csv")

# Encoding categorical variables
label_encoders = {}
categorical_columns = ["Land Use", "Soil Type", "Hazard Risk"]

for col in categorical_columns:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

# Normalizing numerical features
scaler = StandardScaler()
df[["Latitude", "Longitude", "Population Density"]] = scaler.fit_transform(df[["Latitude", "Longitude", "Population Density"]])

# Features and target
X = df.drop(columns=["Predicted Zone", "Location"])  # Dropping 'Location' as it's correlated with Lat/Long
y = df["Predicted Zone"]

y_encoder = LabelEncoder()
y = y_encoder.fit_transform(y)

# Check class distribution
print("Class distribution before resampling:", pd.Series(y).value_counts())

# Handle class imbalance with RandomOverSampler
ros = RandomOverSampler(random_state=42)
X_resampled, y_resampled = ros.fit_resample(X, y)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

# Hyperparameter tuning for XGBoost
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2],
    'scale_pos_weight': [1, 2, 5]  # Helps with imbalanced data
}

xgb = XGBClassifier(eval_metric='mlogloss', random_state=42)
grid_search = GridSearchCV(xgb, param_grid, cv=StratifiedKFold(n_splits=3, shuffle=True, random_state=42), scoring='accuracy')
grid_search.fit(X_train, y_train)

# Best model
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

print("Optimized Accuracy:", accuracy_score(y_test, y_pred))

# Function to predict zone
def predict_zone(latitude, longitude, land_use, soil_type, hazard_risk, population_density):
    input_data = pd.DataFrame({
        "Latitude": [latitude],
        "Longitude": [longitude],
        "Land Use": [label_encoders["Land Use"].transform([land_use])[0] if land_use in label_encoders["Land Use"].classes_ else 0],
        "Soil Type": [label_encoders["Soil Type"].transform([soil_type])[0] if soil_type in label_encoders["Soil Type"].classes_ else 0],
        "Hazard Risk": [label_encoders["Hazard Risk"].transform([hazard_risk])[0] if hazard_risk in label_encoders["Hazard Risk"].classes_ else 0],
        "Population Density": [population_density]
    })
    
    input_data[["Latitude", "Longitude", "Population Density"]] = scaler.transform(input_data[["Latitude", "Longitude", "Population Density"]])
    prediction = best_model.predict(input_data)
    return y_encoder.inverse_transform(prediction)[0]

# Example prediction
predicted_zone = predict_zone(17.432, 78.446, "Residential", "Red Sandy", "Low", 15000)
print("Predicted Zone:", predicted_zone)


Class distribution before resampling: 3    7
7    5
2    5
4    4
0    4
5    3
6    3
1    1
Name: count, dtype: int64


Parameters: { "scale_pos_weight" } are not used.

Parameters: { "scale_pos_weight" } are not used.

Parameters: { "scale_pos_weight" } are not used.

Parameters: { "scale_pos_weight" } are not used.

Parameters: { "scale_pos_weight" } are not used.

Parameters: { "scale_pos_weight" } are not used.

Parameters: { "scale_pos_weight" } are not used.

Parameters: { "scale_pos_weight" } are not used.

Parameters: { "scale_pos_weight" } are not used.

Parameters: { "scale_pos_weight" } are not used.

Parameters: { "scale_pos_weight" } are not used.

Parameters: { "scale_pos_weight" } are not used.

Parameters: { "scale_pos_weight" } are not used.

Parameters: { "scale_pos_weight" } are not used.

Parameters: { "scale_pos_weight" } are not used.

Parameters: { "scale_pos_weight" } are not used.

Parameters: { "scale_pos_weight" } are not used.

Parameters: { "scale_pos_weight" } are not used.

Parameters: { "scale_pos_weight" } are not used.

Parameters: { "scale_pos_weight" } are not used.



Optimized Accuracy: 0.6666666666666666
Predicted Zone: Commercial


In [10]:
import joblib

# Save the trained model
joblib.dump(best_model, "xgb_zone_prediction_model3.pkl")

# Save the scaler
joblib.dump(scaler, "scaler3.pkl")

# Save label encoders
joblib.dump(label_encoders, "label_encoders3.pkl")

# Save target label encoder
joblib.dump(y_encoder, "y_encoder3.pkl")

print("Model and encoders saved successfully!")


Model and encoders saved successfully!
