In [21]:
import pandas as pd
import warnings

# Ignore all warnings
warnings.filterwarnings('ignore')


data = pd.read_csv('data/train.csv')

# Display the first few rows of the dataset
data.head()

Unnamed: 0,id,day,pressure,maxtemp,temparature,mintemp,dewpoint,humidity,cloud,sunshine,winddirection,windspeed,rainfall
0,0,1,1017.4,21.2,20.6,19.9,19.4,87.0,88.0,1.1,60.0,17.2,1
1,1,2,1019.5,16.2,16.9,15.8,15.4,95.0,91.0,0.0,50.0,21.9,1
2,2,3,1024.1,19.4,16.1,14.6,9.3,75.0,47.0,8.3,70.0,18.1,1
3,3,4,1013.4,18.1,17.8,16.9,16.8,95.0,95.0,0.0,60.0,35.6,1
4,4,5,1021.8,21.3,18.4,15.2,9.6,52.0,45.0,3.6,40.0,24.8,0


In [22]:
# Display summary statistics for numerical columns
print("\nSummary Statistics:")
print(data.describe())


Summary Statistics:
                id          day     pressure      maxtemp  temparature  \
count  2190.000000  2190.000000  2190.000000  2190.000000  2190.000000   
mean   1094.500000   179.948402  1013.602146    26.365799    23.953059   
std     632.342866   105.203592     5.655366     5.654330     5.222410   
min       0.000000     1.000000   999.000000    10.400000     7.400000   
25%     547.250000    89.000000  1008.600000    21.300000    19.300000   
50%    1094.500000   178.500000  1013.000000    27.800000    25.500000   
75%    1641.750000   270.000000  1017.775000    31.200000    28.400000   
max    2189.000000   365.000000  1034.600000    36.000000    31.500000   

           mintemp     dewpoint     humidity        cloud     sunshine  \
count  2190.000000  2190.000000  2190.000000  2190.000000  2190.000000   
mean     22.170091    20.454566    82.036530    75.721918     3.744429   
std       5.059120     5.288406     7.800654    18.026498     3.626327   
min       4.0000

In [23]:
# Check for missing values
print("\nMissing Values:")
print(data.isnull().sum())


Missing Values:
id               0
day              0
pressure         0
maxtemp          0
temparature      0
mintemp          0
dewpoint         0
humidity         0
cloud            0
sunshine         0
winddirection    0
windspeed        0
rainfall         0
dtype: int64


In [24]:

# Display the first few rows of the dataset
print("\nFirst Few Rows:")
print(data.head())



First Few Rows:
   id  day  pressure  maxtemp  temparature  mintemp  dewpoint  humidity  \
0   0    1    1017.4     21.2         20.6     19.9      19.4      87.0   
1   1    2    1019.5     16.2         16.9     15.8      15.4      95.0   
2   2    3    1024.1     19.4         16.1     14.6       9.3      75.0   
3   3    4    1013.4     18.1         17.8     16.9      16.8      95.0   
4   4    5    1021.8     21.3         18.4     15.2       9.6      52.0   

   cloud  sunshine  winddirection  windspeed  rainfall  
0   88.0       1.1           60.0       17.2         1  
1   91.0       0.0           50.0       21.9         1  
2   47.0       8.3           70.0       18.1         1  
3   95.0       0.0           60.0       35.6         1  
4   45.0       3.6           40.0       24.8         0  


In [25]:
# Display the column names
print("\nColumn Names:")
print(data.columns.tolist())


Column Names:
['id', 'day', 'pressure', 'maxtemp', 'temparature', 'mintemp', 'dewpoint', 'humidity', 'cloud', 'sunshine', 'winddirection', 'windspeed', 'rainfall']


In [26]:
# Drop columns 'id' and 'day', and set 'rainfall' as the label column
X = data.drop(columns=['id', 'day', 'rainfall'])
y = data['rainfall']


In [27]:
from sklearn.model_selection import train_test_split

# Split data into train (70%) and temp (30%)
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

# Split temp into validation (15%) and test (15%)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp)

# Check for balance in the three sets
print("Train set class distribution:")
print(y_train.value_counts(normalize=True))

print("\nValidation set class distribution:")
print(y_val.value_counts(normalize=True))

print("\nTest set class distribution:")
print(y_test.value_counts(normalize=True))

Train set class distribution:
rainfall
1    0.753425
0    0.246575
Name: proportion, dtype: float64

Validation set class distribution:
rainfall
1    0.753049
0    0.246951
Name: proportion, dtype: float64

Test set class distribution:
rainfall
1    0.753799
0    0.246201
Name: proportion, dtype: float64


In [28]:
from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import classification_report
from imblearn.combine import SMOTEENN
from catboost import CatBoostClassifier

# Update models with advanced algorithms
# Handle class imbalance
# Combination of over- and under-sampling using SMOTEENN


smoteenn = SMOTEENN(random_state=42)
X_train_resampled, y_train_resampled = smoteenn.fit_resample(X_train, y_train)

# Define models
models = {
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Random Forest": RandomForestClassifier(random_state=42),
    "Gradient Boosting": GradientBoostingClassifier(random_state=42),
    "XGBoost": XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss'),
    "CatBoost": CatBoostClassifier(random_state=42, verbose=0)
}

# Train and validate each model
for name, model in models.items():
    model.fit(X_train_resampled, y_train_resampled)
    y_val_pred = model.predict(X_val)
    print(f"Model: {name}")
    print(classification_report(y_val, y_val_pred))
    print("-" * 50)




Model: Decision Tree
              precision    recall  f1-score   support

           0       0.54      0.68      0.60        81
           1       0.89      0.81      0.85       247

    accuracy                           0.78       328
   macro avg       0.72      0.75      0.73       328
weighted avg       0.80      0.78      0.79       328

--------------------------------------------------
Model: Random Forest
              precision    recall  f1-score   support

           0       0.62      0.69      0.65        81
           1       0.89      0.86      0.88       247

    accuracy                           0.82       328
   macro avg       0.76      0.78      0.77       328
weighted avg       0.83      0.82      0.82       328

--------------------------------------------------
Model: Gradient Boosting
              precision    recall  f1-score   support

           0       0.62      0.69      0.65        81
           1       0.89      0.86      0.88       247

    accuracy 

In [30]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import classification_report

# Step 1: Select the top 3 models (already stored in `top_models`)
top_models = [
    ("Random Forest", models["Random Forest"]),
    ("Gradient Boosting", models["Gradient Boosting"]),
    ("CatBoost", models["CatBoost"]),
]

# Step 2: Perform hyperparameter tuning for the top 3 models
tuned_models = {}
param_grids = {
    "Random Forest": {
        "n_estimators": [100, 200],
        "max_depth": [None, 10, 20],
        "min_samples_split": [2, 5],
    },
    "Gradient Boosting": {
        "n_estimators": [100, 200],
        "learning_rate": [0.1, 0.01],
        "max_depth": [3, 5],
    },
    "CatBoost": {
        "iterations": [100, 200],
        "learning_rate": [0.1, 0.01],
        "depth": [6, 8, 10],
    },
}

for name, model in top_models:
    print(f"Tuning {name}...")
    grid_search = GridSearchCV(model, param_grids[name], scoring="f1_weighted", cv=3, n_jobs=-1)
    grid_search.fit(X_train_resampled, y_train_resampled)
    tuned_models[name] = grid_search.best_estimator_
    print(f"Best parameters for {name}: {grid_search.best_params_}")

# Step 3: Create an ensemble model using the tuned models

ensemble_model = VotingClassifier(
    estimators=top_models,
    voting="soft"
)
ensemble_model.fit(X_train_resampled, y_train_resampled)

# Step 4: Test the ensemble model on the test set
y_test_pred = ensemble_model.predict(X_test)
print("\nEnsemble Model Performance on Test Set:")
print(classification_report(y_test, y_test_pred))

Tuning Random Forest...
Best parameters for Random Forest: {'max_depth': None, 'min_samples_split': 2, 'n_estimators': 100}
Tuning Gradient Boosting...
Best parameters for Gradient Boosting: {'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 200}
Tuning CatBoost...
Best parameters for CatBoost: {'depth': 6, 'iterations': 200, 'learning_rate': 0.1}

Ensemble Model Performance on Test Set:
              precision    recall  f1-score   support

           0       0.65      0.86      0.74        81
           1       0.95      0.85      0.90       248

    accuracy                           0.85       329
   macro avg       0.80      0.86      0.82       329
weighted avg       0.88      0.85      0.86       329

