In [None]:
import pandas as pd

# Load dataset
file_path = "cloudburst_data.csv"  # Path to the original dataset
df = pd.read_csv(file_path)

# Columns kept for modelling; the last two are the prediction targets
required_columns = [
    "Rainfall", "Humidity9am", "Humidity3pm", "Pressure9am", "Pressure3pm",
    "Cloud9am", "Cloud3pm", "WindGustSpeed", "Temperature9am", "Temperature3pm",
    "WindDirection", "CloudBurstToday", "CloudBurstTomorrow"
]
target_columns = ["CloudBurstToday", "CloudBurstTomorrow"]

# Check which required columns exist in the dataset.
# A missing target would otherwise only surface further down as an opaque
# KeyError from DataFrame.drop, so fail here with a clear message instead.
missing_columns = [col for col in required_columns if col not in df.columns]
missing_targets = [col for col in target_columns if col in missing_columns]
if missing_targets:
    raise KeyError(f"Target column(s) missing from {file_path}: {missing_targets}")
if missing_columns:
    # Missing feature columns are still skipped, but no longer silently.
    print(f"Warning: columns not found in {file_path} and skipped: {missing_columns}")

# Explicit copy so the column assignments below act on an independent frame
# rather than on a selection of the raw data.
df = df[[col for col in required_columns if col in df.columns]].copy()

# Convert 'Yes' and 'No' to 1 and 0 in CloudBurstToday & CloudBurstTomorrow.
# Both targets share one mapping so the two saved datasets use the same label
# encoding. Any value other than exactly 'Yes'/'No' becomes NaN and is removed
# together with the other missing values below.
yes_no_to_int = {"Yes": 1, "No": 0}
for col in target_columns:
    df[col] = df[col].map(yes_no_to_int)

# Remove missing values (NaN), then duplicate rows
df_cleaned = df.dropna().drop_duplicates()

# With NaNs gone the targets can be stored as real integers; without this the
# dtype written to CSV (1 vs 1.0) would depend on whether NaNs were present.
df_cleaned = df_cleaned.astype({col: "int64" for col in target_columns})

# NOTE: no outlier filtering is applied. An IQR-based filter (dropping rows
# outside [Q1 - 1.5 * IQR, Q3 + 1.5 * IQR] on the non-object columns) was
# previously sketched here but left disabled.

# Separate data for today's and tomorrow's prediction
today_data = df_cleaned.drop(columns=["CloudBurstTomorrow"])  # Today's target only
tomorrow_data = df_cleaned.drop(columns=["CloudBurstToday"])  # Tomorrow's target only

# Save both datasets separately
today_data.to_csv("today_data.csv", index=False)
tomorrow_data.to_csv("tomorrow_data.csv", index=False)

print("✅ Data Cleaning Complete! \n- Today’s data saved as 'today_data.csv' \n- Tomorrow’s data saved as 'tomorrow_data.csv'")


✅ Data Cleaning Complete! 
- Today’s data saved as 'today_data.csv' 
- Tomorrow’s data saved as 'tomorrow_data.csv'


In [16]:
import pandas as pd
import joblib
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Read the cleaned per-target datasets produced by the cleaning step.
# WindDirection is discarded when present; errors="ignore" makes the drop a
# no-op when the column is absent.
df_today = pd.read_csv("today_data.csv").drop(
    columns=["WindDirection"], errors="ignore"
)
df_tomorrow = pd.read_csv("tomorrow_data.csv").drop(
    columns=["WindDirection"], errors="ignore"
)

# Preprocess function
def preprocess_data(df, target_col, test_size=0.2, random_state=42):
    """Split a DataFrame into train/test feature and target sets.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the feature columns plus the target column.
    target_col : str
        Name of the column to predict; every other column is used as a feature.
    test_size : float or int, default 0.2
        Forwarded to ``train_test_split``; the default keeps the previous
        hard-coded 80/20 split.
    random_state : int or None, default 42
        Forwarded to ``train_test_split``; the default keeps the previous
        hard-coded seed so existing splits are reproduced unchanged.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``.

    Raises
    ------
    KeyError
        If ``target_col`` is not a column of ``df``.
    """
    # Same exception type DataFrame.drop would raise, but with a clearer message.
    if target_col not in df.columns:
        raise KeyError(f"Target column {target_col!r} not found in DataFrame")

    X = df.drop(columns=[target_col])  # Features
    y = df[target_col]  # Target
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )
    return X_train, X_test, y_train, y_test

# Build the train/test partitions for each prediction target
(
    X_train_today,
    X_test_today,
    y_train_today,
    y_test_today,
) = preprocess_data(df_today, "CloudBurstToday")
(
    X_train_tomorrow,
    X_test_tomorrow,
    y_train_tomorrow,
    y_test_tomorrow,
) = preprocess_data(df_tomorrow, "CloudBurstTomorrow")

# Fit one decision tree per target (fit() returns the fitted estimator)
# NOTE(review): the held-out X_test_*/y_test_* splits are not used in this
# cell, so no accuracy is reported here.
model_today = DecisionTreeClassifier(random_state=42).fit(X_train_today, y_train_today)
model_tomorrow = DecisionTreeClassifier(random_state=42).fit(X_train_tomorrow, y_train_tomorrow)

# Persist the fitted models, then the feature-name lists they were trained on
for artifact, path in (
    (model_today, "cloudburst_today_model.pkl"),
    (model_tomorrow, "cloudburst_tomorrow_model.pkl"),
    (X_train_today.columns.to_list(), "feature_names_today.pkl"),
    (X_train_tomorrow.columns.to_list(), "feature_names_tomorrow.pkl"),
):
    joblib.dump(artifact, path)

print("\n✅ Models trained & saved successfully!")



✅ Models trained & saved successfully!
