In [27]:


# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Machine learning
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

# Ignore warnings for clean output
import warnings
warnings.filterwarnings("ignore")

print("Libraries imported successfully!")


Libraries imported successfully!


In [28]:
# Step 2: Load and inspect the data
# Update the file paths if necessary
train_path = 'train.csv'
test_path = 'test.csv'

# Load datasets
train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)

# Display basic info about training data
print("Training Data Info:")
print(train_df.info())
print("\nTraining Data Head:")
print(train_df.head())

# Display basic info about test data
print("\nTest Data Info:")
print(test_df.info())
print("\nTest Data Head:")
print(test_df.head())


Training Data Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2190 entries, 0 to 2189
Data columns (total 13 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   id             2190 non-null   int64  
 1   day            2190 non-null   int64  
 2   pressure       2190 non-null   float64
 3   maxtemp        2190 non-null   float64
 4   temparature    2190 non-null   float64
 5   mintemp        2190 non-null   float64
 6   dewpoint       2190 non-null   float64
 7   humidity       2190 non-null   float64
 8   cloud          2190 non-null   float64
 9   sunshine       2190 non-null   float64
 10  winddirection  2190 non-null   float64
 11  windspeed      2190 non-null   float64
 12  rainfall       2190 non-null   int64  
dtypes: float64(10), int64(3)
memory usage: 222.6 KB
None

Training Data Head:
   id  day  pressure  maxtemp  temparature  mintemp  dewpoint  humidity  \
0   0    1    1017.4     21.2         20.6     19.9      19

In [29]:
# Step 3: Data Preprocessing

# Save and drop the id columns
train_ids = train_df['id']
test_ids = test_df['id']
train_df = train_df.drop('id', axis=1)
test_df = test_df.drop('id', axis=1)

# Impute missing value in test set for 'winddirection' with the median from training set
median_winddirection = train_df['winddirection'].median()
test_df['winddirection'] = test_df['winddirection'].fillna(median_winddirection)

# Separate target from features in training data
X = train_df.drop('rainfall', axis=1)
y = train_df['rainfall']

# List of features (all columns except target)
features = X.columns.tolist()
print("Features being used:", features)

# Optional: Scaling (StandardScaler)
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

# Fit scaler on training features and transform both train and test
X_scaled = scaler.fit_transform(X)
test_scaled = scaler.transform(test_df)

# Convert scaled arrays back to DataFrame for ease of use
X_scaled = pd.DataFrame(X_scaled, columns=features)
test_scaled = pd.DataFrame(test_scaled, columns=features)

print("Data preprocessing completed!")


Features being used: ['day', 'pressure', 'maxtemp', 'temparature', 'mintemp', 'dewpoint', 'humidity', 'cloud', 'sunshine', 'winddirection', 'windspeed']
Data preprocessing completed!


In [30]:
# Step 4: Baseline Model Training with XGBoost

# Define the XGBClassifier with some basic parameters
xgb_model = XGBClassifier(
    objective='binary:logistic',
    eval_metric='auc',
    use_label_encoder=False,
    random_state=42
)

# Set up cross-validation using StratifiedKFold
from sklearn.model_selection import StratifiedKFold, cross_val_score
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Evaluate model using cross-validation with ROC AUC score
cv_scores = cross_val_score(xgb_model, X_scaled, y, cv=skf, scoring='roc_auc')

print("XGBoost ROC AUC Scores for each fold:", cv_scores)
print("Mean ROC AUC Score:", np.mean(cv_scores))


XGBoost ROC AUC Scores for each fold: [0.90945567 0.83964646 0.85531706 0.87864759 0.86930415]
Mean ROC AUC Score: 0.8704741863075197


In [32]:
# Step 5: Hyperparameter tuning for XGBoost (with n_jobs=1 to avoid serialization issues)

from sklearn.model_selection import RandomizedSearchCV

# Define the parameter grid for tuning
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 4, 5, 6],
    'learning_rate': [0.01, 0.05, 0.1],
    'gamma': [0, 0.1, 0.5, 1],
    'subsample': [0.7, 0.8, 0.9, 1.0],
    'colsample_bytree': [0.7, 0.8, 0.9, 1.0]
}

# Initialize a new XGBClassifier
xgb_model = XGBClassifier(
    objective='binary:logistic',
    eval_metric='auc',
    use_label_encoder=False,
    random_state=42
)

# Setup RandomizedSearchCV with n_jobs=1 to avoid parallel processing issues
random_search = RandomizedSearchCV(
    estimator=xgb_model,
    param_distributions=param_grid,
    n_iter=20,          # number of parameter settings sampled
    scoring='roc_auc',
    cv=skf,
    verbose=1,
    random_state=42,
    n_jobs=1
)

# Run hyperparameter tuning
random_search.fit(X_scaled, y)

# Output the best parameters and best score
print("Best parameters found:", random_search.best_params_)
print("Best ROC AUC Score:", random_search.best_score_)


Fitting 5 folds for each of 20 candidates, totalling 100 fits
Best parameters found: {'subsample': 0.8, 'n_estimators': 300, 'max_depth': 4, 'learning_rate': 0.01, 'gamma': 0, 'colsample_bytree': 0.7}
Best ROC AUC Score: 0.8925420875420876


In [34]:
# Step 6: Train Final Model and Create Submission File

# Initialize the final XGBoost model with best parameters
final_xgb = XGBClassifier(
    objective='binary:logistic',
    eval_metric='auc',
    use_label_encoder=False,
    random_state=42,
    subsample=0.8,
    n_estimators=300,
    max_depth=4,
    learning_rate=0.01,
    gamma=0,
    colsample_bytree=0.7
)

# Optional: Evaluate final model with cross-validation
cv_scores_final = cross_val_score(final_xgb, X_scaled, y, cv=skf, scoring='roc_auc')
print("Final model CV ROC AUC Scores:", cv_scores_final)
print("Mean ROC AUC Score:", np.mean(cv_scores_final))

# Train final model on the full training dataset
final_xgb.fit(X_scaled, y)

# Predict probabilities for the positive class (rainfall) on the test set
test_preds = final_xgb.predict_proba(test_scaled)[:, 1]

# Create submission DataFrame
submission = pd.DataFrame({
    'id': test_ids,
    'rainfall': test_preds
})

# Save submission to a CSV file (update the path as needed)
submission_path = 'submission.csv'
submission.to_csv(submission_path, index=False)
print("Submission file saved successfully at:", submission_path)


Final model CV ROC AUC Scores: [0.92547699 0.85600449 0.87269921 0.91043771 0.89809203]
Mean ROC AUC Score: 0.8925420875420876
Submission file saved successfully at: submission.csv


In [35]:
# Step 7: Hyperparameter Tuning and Training for LightGBM

from lightgbm import LGBMClassifier
from sklearn.model_selection import RandomizedSearchCV

# Define a parameter grid for LightGBM
param_grid_lgb = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 4, 5, 6],
    'learning_rate': [0.01, 0.05, 0.1],
    'num_leaves': [15, 31, 63],
    'subsample': [0.7, 0.8, 0.9, 1.0],
    'colsample_bytree': [0.7, 0.8, 0.9, 1.0]
}

# Initialize the LightGBM classifier
lgb_model = LGBMClassifier(random_state=42)

# Setup RandomizedSearchCV for LightGBM (using n_jobs=1 to avoid multiprocessing issues)
random_search_lgb = RandomizedSearchCV(
    estimator=lgb_model,
    param_distributions=param_grid_lgb,
    n_iter=20,          # number of parameter settings sampled
    scoring='roc_auc',
    cv=skf,
    verbose=1,
    random_state=42,
    n_jobs=1
)

# Run hyperparameter tuning for LightGBM
random_search_lgb.fit(X_scaled, y)

# Output the best parameters and best score for LightGBM
print("Best LightGBM parameters found:", random_search_lgb.best_params_)
print("Best ROC AUC Score for LightGBM:", random_search_lgb.best_score_)


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
[LightGBM] [Info] Number of positive: 1320, number of negative: 432
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000217 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1558
[LightGBM] [Info] Number of data points in the train set: 1752, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.753425 -> initscore=1.116961
[LightGBM] [Info] Start training from score 1.116961
[LightGBM] [Info] Number of positive: 1320, number of negative: 432
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000219 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1562
[LightGBM] [Info] Number of data points in the train set: 1752, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.753425 -> initscore=1.116961
[LightGBM] [