In [ ]:
!pip install pycaret[full]

# Import libraries


In [ ]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from google.colab import drive
# Mount Google Drive at /content/drive so the dataset path used below resolves.
# Relies on the google.colab package imported above, i.e. a Colab runtime.
drive.mount('/content/drive')

# Load data


In [ ]:
# Read the weather history CSV from the mounted Drive folder into a DataFrame.
df = pd.read_csv(
    filepath_or_buffer='/content/drive/MyDrive/dataset/weatherHistory.csv',
)


# Drop unnecessary columns


In [ ]:
# Columns excluded from modelling. Labels that are absent from the frame are
# skipped by errors='ignore', so no separate membership filter is needed.
# NOTE(review): 'Loud Cover' looks like a misspelling of 'Cloud Cover' — confirm
# it matches the CSV header; a name that does not match is silently ignored.
drop_cols = ['Formatted Date', 'Loud Cover', 'Precip Type', 'Daily Summary']
df.drop(columns=drop_cols, inplace=True, errors='ignore')


# Handle missing values


In [ ]:
# Replace missing values in each numeric column with that column's mean.
# Only columns present in column_means (the numeric ones) are filled.
column_means = df.mean(numeric_only=True)
df.fillna(value=column_means, inplace=True)


# Group similar weather summaries to reduce label noise


In [ ]:
def simplify_summary(summary):
    """Collapse a free-text weather summary into a coarse category.

    The summary is lower-cased and checked against an ordered list of
    keywords; the first keyword found decides the category, so a summary
    mentioning several conditions is classified by the earliest entry in
    the list (e.g. 'cloudy' wins over 'rain').

    Args:
        summary: The raw summary text. Non-string values (such as NaN or
            None from a column with missing entries) are accepted.

    Returns:
        One of 'cloudy', 'clear', 'rain', 'foggy', 'drizzle', 'snow', or
        'other' when no keyword matches or the input is not a string.
    """
    # Missing values reach this function as float NaN / None when applied
    # to a DataFrame column; they have no .lower(), so bucket them as 'other'.
    if not isinstance(summary, str):
        return 'other'

    text = summary.lower()

    # (keyword, category) pairs in priority order. Note 'fog' maps to 'foggy'.
    # NOTE(review): summaries containing none of these keywords (for example
    # 'Overcast') end up in 'other' — confirm that is intended.
    keyword_categories = (
        ('cloudy', 'cloudy'),
        ('clear', 'clear'),
        ('rain', 'rain'),
        ('fog', 'foggy'),
        ('drizzle', 'drizzle'),
        ('snow', 'snow'),
    )
    for keyword, category in keyword_categories:
        if keyword in text:
            return category
    return 'other'

# Derive the coarse weather category for every row from its raw summary text.
df['Simple_Summary'] = df['Summary'].map(simplify_summary)


# Label encode target


In [ ]:
# Learn the category -> integer mapping, then store the encoded target.
# `le` is kept so the integer codes can be mapped back to names later.
le = LabelEncoder()
le.fit(df['Simple_Summary'])
df['Summary_encoded'] = le.transform(df['Simple_Summary'])


# Drop the original summary columns


In [ ]:
# The text columns are no longer needed once the encoded target exists.
df.drop(['Summary', 'Simple_Summary'], axis='columns', inplace=True)


# Drop classes with fewer than 50 samples


In [ ]:
# Count rows per encoded class and keep only classes with at least 50 rows.
class_counts = df['Summary_encoded'].value_counts()
valid_classes = class_counts[class_counts >= 50].index
# .copy() gives df_filtered its own data, so adding columns to it afterwards
# is unambiguous and does not trigger pandas' SettingWithCopyWarning.
df_filtered = df[df['Summary_encoded'].isin(valid_classes)].copy()


# Feature engineering


In [ ]:
# Add the derived features with assign(), which returns a new frame that
# df_filtered is rebound to. Writing new columns in place into a frame that
# came from a row filter can raise SettingWithCopyWarning.
df_filtered = df_filtered.assign(**{
    # Gap between the apparent ("feels like") and measured temperature.
    'ApparentTempDiff': (
        df_filtered['Apparent Temperature (C)'] - df_filtered['Temperature (C)']
    ),
    # Interaction term between humidity and pressure.
    'Humidity*Pressure': (
        df_filtered['Humidity'] * df_filtered['Pressure (millibars)']
    ),
})


# PyCaret Classification


In [ ]:
from pycaret.classification import *

# Configure the PyCaret classification experiment on the filtered frame.
# The options are collected first so they are easy to scan and tweak.
setup_options = dict(
    target='Summary_encoded',       # column to predict
    session_id=123,                 # fixed id for repeatable runs
    fix_imbalance=True,             # rebalance the classes ...
    fix_imbalance_method='smote',   # ... using SMOTE
    verbose=False,
)
clf = setup(data=df_filtered, **setup_options)


# Compare models and get the best one


In [ ]:
# Run PyCaret's model comparison with its default settings and keep what it
# returns (presumably the top-ranked model by PyCaret's default metric —
# confirm against the PyCaret docs).
best_model = compare_models()


# Tune the best model


In [ ]:
# Hyperparameter-tune the selected model using tune_model's defaults.
tuned_model = tune_model(best_model)


# Evaluate tuned_model directly:


In [ ]:
# Show PyCaret's evaluation output for the tuned model; the return value is
# not used (presumably an interactive plot selector in a notebook — confirm).
evaluate_model(tuned_model)

# Predict on the full filtered dataset (the same data that was passed to setup)


In [ ]:
# Score the tuned model on every row of df_filtered.
# NOTE(review): df_filtered is the same frame that was passed to setup(), so
# these predictions include rows the model was trained on, and metrics
# computed from them will likely be optimistic. Calling
# predict_model(tuned_model) without `data` should score PyCaret's hold-out
# split instead — confirm, and adjust the metrics cell to match if changed.
predictions = predict_model(tuned_model, data=df_filtered)


In [ ]:
# Draw a reproducible 30% random sample of the filtered rows.
# NOTE(review): df_sampled is not referenced by any later cell shown here —
# confirm whether it is still needed.
df_sampled = df_filtered.sample(frac=0.3, random_state=123)


# Performance metrics


In [ ]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# Ground-truth labels and the labels predicted for the same rows.
y_true = df_filtered['Summary_encoded']
y_pred = predictions['prediction_label']

# Precision, recall and F1 are averaged across classes weighted by support.
weighted = dict(average='weighted')
acc = accuracy_score(y_true, y_pred)
prec = precision_score(y_true, y_pred, **weighted)
rec = recall_score(y_true, y_pred, **weighted)
f1 = f1_score(y_true, y_pred, **weighted)

# Print each score to four decimal places under a common heading.
print(" Performance Metrics:")
for metric_name, metric_value in (
    ("Accuracy", acc),
    ("Precision", prec),
    ("Recall", rec),
    ("F1 Score", f1),
):
    print(f"{metric_name}: {metric_value:.4f}")


# Confusion Matrix


In [ ]:
from sklearn.utils.multiclass import unique_labels

# Pin the class order explicitly so each row/column of the matrix can be
# labelled. Without tick labels the heatmap shows matrix positions
# (0, 1, 2, ...), which need not equal the encoded class ids once rare
# classes have been filtered out, and never show the class names.
class_ids = unique_labels(y_true, y_pred)
cm = confusion_matrix(y_true, y_pred, labels=class_ids)

# Map the encoded ids back to the category names learned by the encoder.
# Assumes the predicted labels are the same integer codes `le` produced.
class_names = le.inverse_transform(class_ids)

plt.figure(figsize=(10, 6))
sns.heatmap(
    cm,
    annot=False,
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
)
plt.title('Confusion Matrix Heatmap')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()

In [ ]:
import json

# Replace with your actual notebook filename
notebook_path = "your_notebook.ipynb"

# Load the notebook as plain JSON.
with open(notebook_path, "r", encoding="utf-8") as f:
    nb = json.load(f)

# Clean broken widget metadata: drop a cell's "widgets" entry when it is a
# dict that lacks the "state" key.
# NOTE(review): only per-cell metadata is inspected; a notebook-level
# nb["metadata"]["widgets"] entry, if present, is left untouched — confirm
# whether that one needs the same treatment.
for cell in nb.get("cells", []):
    metadata = cell.get("metadata", {})
    # Cells without a "widgets" entry have nothing to clean. Skipping them
    # avoids a KeyError from deleting a key that does not exist.
    if "widgets" not in metadata:
        continue
    widgets = metadata["widgets"]
    if isinstance(widgets, dict) and "state" not in widgets:
        del metadata["widgets"]

# Write the cleaned notebook back over the original file.
with open(notebook_path, "w", encoding="utf-8") as f:
    json.dump(nb, f, indent=2)
