In [1]:
import warnings

import numpy as np
import pandas as pd

# Silence every warning category for the rest of the session so that library
# notices do not clutter cell output.
warnings.filterwarnings(action='ignore')


# Load the training set (path is relative to the notebook's working directory).
data = pd.read_csv(filepath_or_buffer='data/train.csv')

# Preview the leading five rows as this cell's rich output.
data.head(n=5)



In [None]:
def preprocess_data(df):
    """
    Impute missing numeric values and derive weather features.

    Parameters:
    -----------
    df : pandas DataFrame
        The input data. Must contain the columns 'day', 'maxtemp', 'mintemp',
        'temparature', 'dewpoint', 'humidity', 'pressure', 'cloud',
        'windspeed' and 'winddirection'. The input frame is not modified.

    Returns:
    --------
    processed_df : pandas DataFrame
        A copy of ``df`` with numeric gaps filled by the column means of
        ``df`` itself and the engineered feature columns appended. Rows are
        sorted by 'day'; the original index labels are kept, so callers that
        pair the result with another object must align on the index (or call
        ``.sort_index()``) rather than rely on row position.
    """
    # Make a copy to avoid modifying the original
    processed_df = df.copy()

    # Handle missing values: fill numeric gaps with each column's own mean.
    # numeric_only=True keeps this working when the frame carries
    # non-numeric columns (pandas >= 2.0 raises otherwise).
    column_means = processed_df.mean(numeric_only=True)
    processed_df = processed_df.fillna(column_means)

    # ---- FEATURE ENGINEERING ----

    # 1. Convert day of year to cyclical month feature.
    # Day 1 is mapped to 1 January 1900 (a non-leap year), which is the same
    # calendar that strptime('%j') assumes when no year is given.
    day_offset = pd.to_timedelta(processed_df['day'] - 1, unit='D')
    processed_df['month'] = (day_offset + pd.Timestamp('1900-01-01')).dt.month.astype('int64')
    processed_df['month_sin'] = np.sin(2 * np.pi * processed_df['month'] / 12)
    processed_df['month_cos'] = np.cos(2 * np.pi * processed_df['month'] / 12)

    # 2. Add season indicator (meteorological seasons):
    # 0 = winter (Dec-Feb), 1 = spring (Mar-May), 2 = summer (Jun-Aug), 3 = fall (Sep-Nov).
    season_codes = (processed_df['month'] % 12) // 3
    # Encoding against a fixed category list guarantees that season_0..season_3
    # always exist, in that order, even when a season is absent from the data.
    processed_df['season'] = season_codes.astype(pd.CategoricalDtype(categories=[0, 1, 2, 3]))
    processed_df = pd.get_dummies(processed_df, columns=['season'], prefix='season')

    # 3. Temperature-related features
    processed_df['temp_range'] = processed_df['maxtemp'] - processed_df['mintemp']
    processed_df['temp_deviation'] = processed_df['temparature'] - ((processed_df['maxtemp'] + processed_df['mintemp']) / 2)

    # 4. Humidity and pressure interactions (avoid division by zero)
    processed_df['humidity_pressure_ratio'] = processed_df['humidity'] / processed_df['pressure'].replace(0, 0.001)
    processed_df['dewpoint_diff'] = processed_df['temparature'] - processed_df['dewpoint']

    # 5. Wind features
    processed_df['is_windy'] = (processed_df['windspeed'] > 30).astype(int)
    # The small offset on the wind term is kept from the original formula.
    wind_term = processed_df['windspeed'] ** 0.16 + 0.001
    processed_df['wind_chill'] = (
        13.12
        + 0.6215 * processed_df['temparature']
        - 11.37 * wind_term
        + 0.3965 * processed_df['temparature'] * wind_term
    )
    # Decompose the wind vector so that 0 and 360 degrees are treated as neighbours.
    processed_df['wind_direction_rad'] = np.radians(processed_df['winddirection'])
    processed_df['wind_x'] = processed_df['windspeed'] * np.cos(processed_df['wind_direction_rad'])
    processed_df['wind_y'] = processed_df['windspeed'] * np.sin(processed_df['wind_direction_rad'])

    # 6. Create cloud-humidity interaction
    processed_df['cloud_humidity_product'] = processed_df['cloud'] * processed_df['humidity'] / 100

    # Sort by day. This reorders the rows but keeps the original index labels.
    processed_df = processed_df.sort_values('day')

    # 7. Day of year cyclical features
    processed_df['day_sin'] = np.sin(2 * np.pi * processed_df['day'] / 365)
    processed_df['day_cos'] = np.cos(2 * np.pi * processed_df['day'] / 365)

    # 8. Add interaction terms for common weather patterns
    processed_df['temp_humidity'] = processed_df['temparature'] * processed_df['humidity']
    processed_df['wind_temp'] = processed_df['windspeed'] * processed_df['temparature']

    return processed_df

In [None]:
# Build the feature matrix: engineer features, then drop the identifier,
# the raw day counter and the target column.
processed_data = preprocess_data(data)
X = processed_data.drop(columns=['id', 'day', 'rainfall'])
# preprocess_data returns rows sorted by 'day' while keeping the original
# index labels. train_test_split slices by position, so the labels must be
# put in the same row order as X; selecting by X.index does that and keeps
# the untouched label values from the raw data.
y = data.loc[X.index, 'rainfall']


In [5]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Report the size of every partition.
for label, part in (
    ("X_train", X_train),
    ("X_test", X_test),
    ("y_train", y_train),
    ("y_test", y_test),
):
    print(f"{label} shape: {part.shape}")



In [None]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from imblearn.combine import SMOTEENN
from sklearn.metrics import roc_auc_score

# Rebalance the classes with SMOTEENN (combined over- and under-sampling).
# Only the training split is resampled; X_test / y_test are left as they are.
smoteenn = SMOTEENN(random_state=42)
X_train_resampled, y_train_resampled = smoteenn.fit_resample(X_train, y_train)

# Candidate classifiers, keyed by the name used in the printed report.
models = dict([
    ("Random Forest", RandomForestClassifier(random_state=42)),
    ("Gradient Boosting", GradientBoostingClassifier(random_state=42)),
])

# Fit each candidate on the resampled training data and score it on the
# held-out split using the predicted probability of the positive class.
for name, model in models.items():
    model.fit(X_train_resampled, y_train_resampled)
    y_test_pred_proba = model.predict_proba(X_test)[:, 1]
    auc_score = roc_auc_score(y_test, y_test_pred_proba)
    print(f"Model: {name}", f"AUC-ROC Score: {auc_score:.4f}", "-" * 50, sep="\n")




In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import classification_report


# Perform hyperparameter tuning for every candidate model.
# Each entry of param_grids must use the same key as the matching entry in `models`.
tuned_models = {}
param_grids = {
    "Random Forest": {
        "n_estimators": [100, 200],
        "max_depth": [None, 10, 20],
        "min_samples_split": [2, 5],
    },
    "Gradient Boosting": {
        "n_estimators": [100, 200],
        "learning_rate": [0.1, 0.01],
        "max_depth": [3, 5],
    },
}

# The search is scored with ROC-AUC so that model selection optimizes the
# same metric the notebook reports on the held-out split.
# NOTE(review): cross-validation runs on the already-resampled training data,
# so the CV scores are likely optimistic; use them only to rank parameter sets.
for name, model in list(models.items()):
    print(f"Tuning {name}...")
    grid_search = GridSearchCV(model, param_grids[name], scoring="roc_auc", cv=3, n_jobs=-1)
    grid_search.fit(X_train_resampled, y_train_resampled)
    # best_estimator_ is refit on the full resampled training set by GridSearchCV.
    tuned_models[name] = grid_search.best_estimator_
    print(f"Best parameters for {name}: {grid_search.best_params_}")





In [61]:
# Combine the tuned estimators into one soft-voting ensemble and fit it on
# the resampled training data.
ensemble_model = VotingClassifier(
    estimators=[(label, estimator) for label, estimator in tuned_models.items()],
    voting="soft",
)
ensemble_model.fit(X_train_resampled, y_train_resampled)

# Hard class predictions for the held-out split.
y_test_pred = ensemble_model.predict(X_test)

# Positive-class probabilities for the held-out split, scored with ROC-AUC.
y_test_pred_proba = ensemble_model.predict_proba(X_test)[:, 1]
auc_score = roc_auc_score(y_test, y_test_pred_proba)
print(f"\nEnsemble Model AUC-ROC Score on Test Set: {auc_score:.4f}")



In [None]:
# Read the test data (the import cell at the top of the notebook must have been run).
test_data = pd.read_csv('data/test.csv')

# Fail early with a clear message if the identifier or the day counter is absent;
# 'day' is required by preprocess_data and 'id' by the submission file.
if 'id' not in test_data.columns or 'day' not in test_data.columns:
    raise ValueError("The columns 'id' and 'day' are missing from the test dataset.")

# Apply the same imputation and feature engineering that the training data
# went through; the ensemble was fitted on the engineered columns, not on the
# raw ones. preprocess_data returns rows sorted by 'day', so sort_index()
# puts them back in file order to line up with the ids below.
processed_test = preprocess_data(test_data).sort_index()

# Extract the 'id' column for the submission file
test_ids = test_data['id']

# Keep exactly the feature columns used for training, in the same order.
# A missing feature raises a KeyError here rather than silently producing NaN.
X_test_final = processed_test[list(X_train.columns)]

# Predict the probability of rainfall using the ensemble model
rainfall_probabilities = ensemble_model.predict_proba(X_test_final)[:, 1]

# Create the submission dataframe
submission = pd.DataFrame({
    'id': test_ids,
    'rainfall': rainfall_probabilities
})

# Write the submission to a CSV file
submission.to_csv('submission.csv', index=False)

print("Submission file 'submission.csv' has been created.")

