

#Import All Required Libraries



In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score, RepeatedStratifiedKFold
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, precision_score, recall_score
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import warnings


# Install a global "ignore" filter so no warning of any category is shown
# for the rest of the session.
# NOTE(review): presumably meant to keep cell output tidy; it also hides
# warnings that may matter (e.g. convergence notices) — consider narrowing it.
warnings.filterwarnings('ignore')

#Importing Datasets

In [2]:
# Read the two raw CSV files into separate frames. Both paths are relative,
# so the files must sit in the current working directory.
df1, df2 = (pd.read_csv(csv_name) for csv_name in ('forest1.csv', 'forest2.csv'))

FileNotFoundError: [Errno 2] No such file or directory: 'forest1.csv'

#Identifying the datatypes of different columns


In [None]:
# Preview the first rows of the first dataset.
df1.head()

In [None]:
# Column dtypes and non-null counts for the first dataset.
df1.info()

In [None]:
# Preview the first rows of the second dataset.
df2.head()

In [None]:
# Column dtypes and non-null counts for the second dataset.
df2.info()

#Creating a single dataset by assigning Regions

In [None]:
# Tag every row of df1 with region code 1 so the row's origin is still
# identifiable once the two frames are concatenated.
df1['Region'] = 1
df1.head()

In [None]:
# Tag every row of df2 with region code 2 so the row's origin is still
# identifiable once the two frames are concatenated.
df2['Region'] = 2
df2.head()

In [None]:
# Stack the two regional frames row-wise into one dataset.
# ignore_index=True discards each frame's own index and renumbers from 0.
df = pd.concat([df1, df2], ignore_index=True)
df

In [None]:
# Column dtypes and non-null counts for the combined dataset.
df.info()

**OVERALL ANALYSIS**

The dataset contains **15 columns** and **244 rows**, representing meteorological and environmental observations across different regions and time periods. The data types are distributed as follows:

Integer columns (7): day, month, year, Temperature, RH, Ws, and Region — these capture date information and basic weather parameters such as temperature, relative humidity, wind speed, and the region code.

Float columns (5): Rain, FFMC, DMC, ISI, and BUI — these represent continuous meteorological indices used in fire weather modeling.

Object columns (3): DC, FWI, and Classes — among these, DC and FWI are likely numerical values stored as text that should be converted to float, while Classes indicates the categorical fire occurrence label.

Notably, the Classes column has one missing entry (243 non-null), which may require imputation or removal during preprocessing.

#Converting to proper datatype

In [None]:
# DC and FWI are stored as text; parse them as numbers. With errors='coerce'
# any entry that cannot be parsed becomes NaN instead of raising.
for text_col in ('DC', 'FWI'):
    df[text_col] = pd.to_numeric(df[text_col], errors='coerce')

In [None]:
# Re-check dtypes and non-null counts after the numeric conversion.
df.info()

#Descriptive Statistics of the Columns


In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) per numeric column.
df.describe()

#Identifying & Handling Missing Values

In [None]:
# Per-column number of missing cells and the share of rows they represent.
missing_counts = df.isna().sum()
missing_percent = missing_counts.div(len(df)).mul(100)

# Put both measures side by side, with the worst-affected columns first.
missing_df = (
    pd.concat(
        [missing_counts, missing_percent],
        axis=1,
        keys=['Missing Count', 'Missing Percentage'],
    )
    .sort_values(by='Missing Percentage', ascending=False)
)

missing_df

**Since only one value is missing in each of 3 features, we can safely drop the affected rows without affecting the data much**

In [None]:
# Remove, in place, every row that has at least one missing value,
# then confirm that no missing cells remain in any column.
df.dropna(inplace=True)
df.isna().sum()

#Identifying & Handling Duplicate Values

In [None]:
# Count rows that are exact repeats of an earlier row (all columns equal).
duplicate_count = df.duplicated().sum()
print(f"Number of duplicate rows: {duplicate_count}")

# Drop the repeats in place; the first occurrence of each row is kept.
df.drop_duplicates(inplace=True)

# Show the (rows, columns) shape after cleaning.
df.shape

#Identifying column inconsistencies

In [None]:
# List the raw column names to spot stray whitespace or inconsistent naming.
df.columns

In [None]:
# Strip leading/trailing whitespace from every column name, then re-check.
df.columns=df.columns.str.strip()
df.columns

In [None]:
# For each column, print its ten most frequent values (missing values are
# counted too) to reveal inconsistent or unexpected entries.
for column_name, column_values in df.items():
    print(f"\n--- {column_name} ---")
    print(column_values.value_counts(dropna=False).head(10))


In [None]:
# Trim whitespace around the labels, then encode them as
# 1 (fire) / 0 (not fire). A label outside this mapping becomes NaN.
label_codes = {'fire': 1, 'not fire': 0}
df['Classes'] = df['Classes'].str.strip().map(label_codes)
df['Classes'].value_counts()

#Identifying and handling Outliers

In [None]:
def count_iqr_outliers(series):
    """Count the values of *series* that fall outside the 1.5 * IQR fences.

    A value is an outlier when it is strictly below ``Q1 - 1.5 * IQR`` or
    strictly above ``Q3 + 1.5 * IQR``. Missing values compare as False on
    both sides and are therefore never counted.

    Args:
        series: Numeric pandas Series to inspect.

    Returns:
        The number of outlying values.
    """
    first_quartile = series.quantile(0.25)
    third_quartile = series.quantile(0.75)
    whisker = 1.5 * (third_quartile - first_quartile)

    too_low = series.lt(first_quartile - whisker)
    too_high = series.gt(third_quartile + whisker)
    return (too_low | too_high).sum()

# Apply the IQR rule to every numeric column and tabulate the counts,
# one row per column.
numeric_part = df.select_dtypes(include=['number'])
outlier_counts = {
    column_name: count_iqr_outliers(numeric_part[column_name])
    for column_name in numeric_part.columns
}
pd.Series(outlier_counts, name='Outlier Count').to_frame()

In [None]:
# One box plot per numeric column, laid out on a 5x3 grid of subplots;
# patch_artist=True draws the boxes as filled patches.
df.plot.box(subplots=True, layout=(5, 3), figsize=(20, 15), patch_artist=True)
plt.tight_layout()
plt.show()

**The IQR method flags several values as outliers, but we are not yet sure whether they should be removed. Because the problem itself is about detecting rare events (forest fires happen rarely), we decided to leave the outliers flagged by the IQR method untouched.**

#Checking the Distribution of the Dataset

In [None]:
# Histogram of every numeric column (50 bins each, blue bar edges),
# drawn as a grid of subplots on a single figure.
df.hist(bins=50, figsize=(20, 15), ec='b')
# plt.title would label only the last subplot drawn; suptitle puts the
# heading on the figure as a whole.
plt.suptitle("Distribution of Dataset", fontsize=13)
# Keep the top 3% of the figure free so the heading does not overlap the
# first row of subplots.
plt.tight_layout(rect=(0, 0, 1, 0.97))
plt.show()


**day**: This is uniformly distributed. The day of the month by itself is usually a weak predictor of forest fires (day 5 isn't inherently more dangerous than day 25), so it can be dropped.

**month**: This is a categorical feature with four distinct values (June, July, August, September). This is a strong and important feature as it captures the seasonal progression of fire danger (e.g., August is typically hotter and drier than June).

**year**: This is a constant feature (all 2012). It has zero variance and provides no predictive information. It must be dropped.

**Temperature**: This feature is normally distributed, centered around 32-35°C. This is a stable predictor for most models.

**Rain**: This is highly right-skewed and zero-inflated. The vast majority of entries are 0.0 (no rain), which is typical for a fire season. The presence (Rain > 0) vs. absence (Rain = 0) of rain is likely a more powerful predictor than the specific amount.

**FFMC (Fine Fuel Moisture Code)**: This is strongly left-skewed. Most values are very high (80-95), indicating that surface fuels (grass, leaves) are very dry and highly combustible on most days.

**DMC (Duff Moisture Code)**: This is right-skewed. Most values are low, meaning the deeper duff layer is often moist. The long tail of high values represents extended dry periods where this deep fuel dries out, leading to more intense fires.

**DC (Drought Code)**: This is highly right-skewed with a very long tail. Most values are low (0-50), indicating that severe, long-term drought is uncommon. The long tail (past 200) represents rare but critical periods of extreme drought, which would make fires much more intense and difficult to control.

**ISI (Initial Spread Index)**: This is right-skewed. Similar to wind speed, most days have a low potential for fire spread, but a few days have a very high potential, which is critical to predict.

**BUI (Buildup Index)**: This is right-skewed and looks similar to DMC. It represents the total fuel available for combustion and shows that while this is low on most days, it can build up significantly during dry spells.

**FWI (Fire Weather Index)**: This is highly right-skewed. As the final composite index representing total fire intensity, this distribution shows that most days have a low-to-moderate fire risk (values 0-10). The long tail represents the few critical days with very high fire danger.

In [None]:
# Pairwise correlations between the numeric columns, rounded to two decimals.
correlations = df.corr(numeric_only=True).round(2)

# Annotated heatmap of the matrix on a diverging palette.
plt.figure(figsize=(8, 6))
sns.heatmap(correlations, annot=True, fmt='.2f', cmap='coolwarm')
plt.title("Correlation Heatmap", fontsize=14)
plt.tight_layout()
plt.show()


**Correlation with Target (Classes)**
**High Positive Correlation:** FFMC (0.77), ISI (0.74), FWI (0.72), DMC (0.59), and DC (0.51) all have a strong-to-moderate positive relationship. As these values go up, the chance of fire goes up.

**Moderate/Low Negative Correlation:** RH (-0.43) and Rain (-0.38) have a moderate-to-low negative relationship. As these values go up (more humidity or rain), the chance of fire goes down.

**Very Low / No Correlation:** Temperature (0.20), day (0.20), Region (0.16), Ws (0.07), month (0.02), year (-0.00).


**Multicollinearity (Correlation Between Predictors)**

**Extremely High Correlation (Redundant):**

**BUI and DMC (0.98):** Nearly identical, so one of the two is redundant and can be dropped.

**BUI and DC (0.94):** Nearly identical.

**FWI and ISI (0.92):** FWI is heavily dependent on ISI.

**DC and DMC (0.88):** Extremely high correlation.

**FWI and BUI (0.86):** Extremely high correlation.

**ISI and FFMC (0.74):** Very high correlation.

**FWI and DC (0.74):** Very high correlation.

**High-to-Moderate Correlation:**

**FWI and FFMC (0.69)**

**DMC and ISI (0.68)**

**FFMC and RH (-0.64)**

**DMC and FFMC (0.60)**

In [None]:
# Rows: (Region, month) pairs; columns: one per Classes value; cells: row counts.
fire_counts_monthly = (
    df.groupby(['Region', 'month', 'Classes'])
    .size()
    .unstack(fill_value=0)
)

# Draw the same grouped bar chart once per region.
for region in (1, 2):
    fire_counts_monthly.loc[region].plot(kind='bar', figsize=(10, 6))
    plt.title(f'Fire and Not Fire Counts by Month in Region {region}')
    plt.xlabel('Month')
    plt.ylabel('Count')
    plt.xticks(rotation=0)
    plt.legend(title='Classes')
    plt.show()

**Months 7 and 8 seem to have a higher chance of fire**

In [None]:
# Rows: regions; columns: one per Classes value; cells: row counts.
fire_counts = df.groupby(['Region', 'Classes']).size().unstack(fill_value=0)

# Grouped bar chart comparing the class counts of the two regions.
ax = fire_counts.plot(kind='bar', figsize=(8, 6))
ax.set_title('Fire and Not Fire Counts by Region')
ax.set_xlabel('Region')
ax.set_ylabel('Count')
plt.xticks(rotation=0)
ax.legend(title='Classes')
plt.show()

**Region 2 has more fires than Region 1**

In [None]:
# Number of rows per Classes value (1 = fire, 0 = not fire).
class_counts = df['Classes'].value_counts()

# Pie chart of the class balance, each slice labelled with its percentage.
fig, ax = plt.subplots(figsize=(6, 6))
ax.pie(class_counts, labels=class_counts.index, autopct='%1.1f%%')
ax.set_title('Proportion of Fire and Not Fire Classes')
plt.show()

In [None]:
# Remove, in place, the columns ruled out in the analysis above: 'day'
# (weak predictor), 'year' (constant) and the fire-weather indices
# FWI, BUI, DC, ISI and DMC (strongly correlated with one another).
df.drop(['day','FWI','BUI','DC','ISI','DMC','year'], axis=1, inplace=True)

#Splitting dataset

In [None]:
# Separate the predictors from the binary target.
X = df.drop('Classes', axis=1)
y = df['Classes']

# Hold out 20% of the rows for validation. stratify=y keeps the
# fire / not-fire ratio the same in both splits, which matters on a dataset
# this small and matches the stratified folds used for cross-validation.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

#Model Training

In [None]:
# Candidate classifiers, all at library defaults apart from a fixed seed
# (and probability estimates switched on for the SVM).
models = {
    "Logistic Regression": LogisticRegression(random_state=42),
    "K-Nearest Neighbors": KNeighborsClassifier(),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Random Forest": RandomForestClassifier(random_state=42),
    "SVM Classifier": SVC(probability=True, random_state=42),
    "XGBoost": XGBClassifier(random_state=42),
}

results = {}

for name, model in models.items():
    # The scaler lives inside the pipeline, so it is fitted on the
    # training split only.
    pipeline = Pipeline(steps=[
        ('preprocessor', StandardScaler()),
        ('classifier', model),
    ])
    pipeline.fit(X_train, y_train)

    y_pred = pipeline.predict(X_val)
    # Column 1 is the predicted probability of class 1 (fire).
    y_pred_proba = pipeline.predict_proba(X_val)[:, 1]

    # Score the model on the validation split.
    results[name] = {
        'Accuracy': accuracy_score(y_val, y_pred),
        'Precision': precision_score(y_val, y_pred),
        'Recall': recall_score(y_val, y_pred),
        'F1-Score': f1_score(y_val, y_pred),
        'ROC-AUC': roc_auc_score(y_val, y_pred_proba),
    }

    # Per-model report: names padded to a common width, blank line after.
    print(f"{name}")
    for metric, value in results[name].items():
        print(f"{metric:<9}: {value:.4f}")
    print()

# One row per model, best F1-score first.
results_df = (
    pd.DataFrame.from_dict(results, orient='index')
    .sort_values(by='F1-Score', ascending=False)
)
print("\nModel Performance Comparison")
print(results_df)


In [None]:
# Use the full dataset here: cross-validation does its own splitting.
X = df.drop(columns='Classes')
y = df['Classes']

models = {
    "Logistic Regression": LogisticRegression(random_state=42),
    "K-Nearest Neighbors": KNeighborsClassifier(),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Random Forest": RandomForestClassifier(random_state=42),
    "SVM Classifier": SVC(probability=True, random_state=42),
    "XGBoost": XGBClassifier(random_state=42, eval_metric='logloss'), # Added eval_metric
}

results = {}

# 10 stratified folds repeated 3 times gives 30 F1 scores per model.
cv_strategy = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=42)

print("Running Cross-Validation...")

for name, model in models.items():
    # The scaler is part of the pipeline, so it is re-fitted on the
    # training portion of every fold.
    pipeline = Pipeline(steps=[
        ('preprocessor', StandardScaler()),
        ('classifier', model),
    ])

    scores = cross_val_score(pipeline, X, y, cv=cv_strategy, scoring='f1', n_jobs=-1)
    mean_f1, std_f1 = scores.mean(), scores.std()

    results[name] = {'F1-Mean': mean_f1, 'F1-StdDev': std_f1}

    print(f"{name}: Mean F1-Score = {mean_f1:.4f} (StdDev = {std_f1:.4f})")

# One row per model, best mean F1 first.
results_df = (
    pd.DataFrame.from_dict(results, orient='index')
    .sort_values(by='F1-Mean', ascending=False)
)
print("\nModel Performance Comparison (Cross-Validated)")
print(results_df)

In [None]:
# Fit a scaler + Random Forest pipeline on the full dataset so the
# importances reflect every available row.
final_pipeline = Pipeline([
    ('preprocessor', StandardScaler()),
    ('classifier', RandomForestClassifier(random_state=42)),
]).fit(X, y)

# Scaling keeps the column order, so X's column names line up with the
# importances reported by the fitted forest.
feature_names = X.columns
fitted_forest = final_pipeline.named_steps['classifier']
importances = fitted_forest.feature_importances_

# Rank the features, most important first.
importance_df = pd.DataFrame(
    {'Feature': feature_names, 'Importance': importances}
).sort_values(by='Importance', ascending=False)

print(importance_df)

plt.figure(figsize=(10, 6))
sns.barplot(data=importance_df, x='Importance', y='Feature')
plt.title('Feature Importance (Random Forest)')
plt.show()

In [None]:
# Fit a scaler + XGBoost pipeline on the full dataset so the importances
# reflect every available row. eval_metric is given explicitly to match the
# XGBoost configuration used in the cross-validation cell.
final_pipeline = Pipeline(steps=[
    ('preprocessor', StandardScaler()),
    ('classifier', XGBClassifier(random_state=42, eval_metric='logloss'))
])

final_pipeline.fit(X, y)

# Scaling keeps the column order, so X's column names line up with the
# importances reported by the fitted booster.
feature_names = X.columns

importances = final_pipeline.named_steps['classifier'].feature_importances_

# Rank the features, most important first.
importance_df = pd.DataFrame({
    'Feature': feature_names,
    'Importance': importances
}).sort_values(by='Importance', ascending=False)

print(importance_df)

plt.figure(figsize=(10, 6))
sns.barplot(x='Importance', y='Feature', data=importance_df)
# The chart shows the XGBoost model fitted above, not the Random Forest.
plt.title('Feature Importance (XGBoost)')
plt.show()