In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score

# Read the full dataset from the artifacts directory
daily_data = pd.read_csv('../artifacts/daily_data.csv')

# Rows that already carry a condition_text are used for training; rows where
# it is missing are the ones whose label has to be predicted
has_label = daily_data['condition_text'].notna()
train_data = daily_data[has_label]
test_data = daily_data[~has_label]

# Stack labelled rows first, then unlabelled rows, so that every
# transformation below is applied to both sets in the same way
combined_data = pd.concat([train_data, test_data])

# Replace city_id with integer codes learned over all rows
label_encoder = LabelEncoder()
city_codes = label_encoder.fit_transform(combined_data['city_id'])
combined_data['city_id'] = city_codes

# Convert sunrise and sunset to total minutes from midnight
def convert_to_minutes(time_str):
    """Convert a clock-time string to minutes elapsed since midnight.

    Accepts 12-hour strings such as ``"06:45 AM"`` and, when no AM/PM period
    follows the clock part, plain 24-hour ``"HH:MM"`` strings. The period is
    matched case-insensitively and leading, trailing or repeated whitespace
    is ignored.

    Args:
        time_str: The time as text. ``None``, a float NaN, or a blank string
            is treated as a missing value.

    Returns:
        ``hours * 60 + minutes`` as an int, or ``float('nan')`` when
        *time_str* is missing, so that a later imputation step can fill it.

    Raises:
        ValueError: If the clock part is not two integers separated by ``:``.
    """
    # Missing cells show up as None or as a float NaN (the only float that is
    # not equal to itself); pass them through instead of failing on a string
    # method.
    if time_str is None or (isinstance(time_str, float) and time_str != time_str):
        return float('nan')

    # split() with no argument strips the ends and collapses runs of spaces.
    tokens = time_str.split()
    if not tokens:
        return float('nan')

    hours, minutes = map(int, tokens[0].split(':'))

    # No second token means the value is already on a 24-hour clock.
    period = tokens[1].upper() if len(tokens) > 1 else ''
    if period == 'PM' and hours != 12:
        hours += 12
    elif period == 'AM' and hours == 12:
        # 12:xx AM is the first hour of the day
        hours = 0
    return hours * 60 + minutes

# Turn the sunrise/sunset clock strings into minutes since midnight
for time_col in ('sunrise', 'sunset'):
    combined_data[time_col] = combined_data[time_col].apply(convert_to_minutes)

# Derived features.
# NOTE: despite its name, temp_range is sunset minus sunrise, i.e. the number
# of minutes between the two.
combined_data['temp_range'] = combined_data['sunset'].sub(combined_data['sunrise'])
combined_data['wind_product'] = combined_data['wind_kph'].mul(combined_data['gust_kph'])

# Separate the rows again: labelled rows for training, unlabelled for prediction
is_labelled = combined_data['condition_text'].notna()
train_data = combined_data[is_labelled].copy()
test_data = combined_data[~is_labelled].copy()

# Everything except the row identifier and the target is a model feature
non_feature_cols = ['day_id', 'condition_text']
train_features = train_data.drop(columns=non_feature_cols)
test_features = test_data.drop(columns=non_feature_cols)

# Fill gaps with per-column medians learned from the training rows only
imputer = SimpleImputer(strategy='median')
train_data_imputed = pd.DataFrame(
    imputer.fit_transform(train_features),
    columns=train_features.columns,
)
test_data_imputed = pd.DataFrame(
    imputer.transform(test_features),
    columns=test_features.columns,
)

# Standardise the features, again fitting on the training rows only
scaler = StandardScaler()
train_data_scaled = pd.DataFrame(
    scaler.fit_transform(train_data_imputed),
    columns=train_data_imputed.columns,
)
test_data_scaled = pd.DataFrame(
    scaler.transform(test_data_imputed),
    columns=test_data_imputed.columns,
)

# Map the condition_text labels to integer class codes
target_encoder = LabelEncoder()
encoded_target = target_encoder.fit_transform(train_data['condition_text'])
train_data['condition_text'] = encoded_target

# Final training matrix and target vector
X_train = train_data_scaled
y_train = train_data['condition_text']

# Base estimators to be tuned by grid search.
# Random forests and subsampled boosting draw random samples while training;
# a fixed seed keeps the selected hyperparameters and the resulting
# predictions the same from one run to the next.
RANDOM_STATE = 42
rf_model = RandomForestClassifier(random_state=RANDOM_STATE)
gb_model = GradientBoostingClassifier(random_state=RANDOM_STATE)
xgb_model = XGBClassifier(random_state=RANDOM_STATE)

# Hyperparameter grid for the random forest.
# 'auto' is intentionally absent from max_features: for classifiers it was
# only an alias of 'sqrt', it was deprecated in scikit-learn 1.1 and removed
# in 1.3, where every candidate using it fails to fit and is scored as NaN.
param_grid_rf = {
    'n_estimators': [100, 200, 300],
    'max_features': ['sqrt', 'log2'],
    'max_depth': [5, 7, 9],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}

# Hyperparameter grid for scikit-learn gradient boosting
param_grid_gb = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [3, 4, 5],
    'subsample': [0.8, 0.9, 1.0],
}

# Hyperparameter grid for XGBoost (same as above plus column subsampling)
param_grid_xgb = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [3, 4, 5],
    'subsample': [0.8, 0.9, 1.0],
    'colsample_bytree': [0.8, 0.9, 1.0],
}

# All three searches share the same settings: 5-fold cross-validation on
# every available core, refitting the best candidate on the full training set
search_options = dict(refit=True, verbose=1, n_jobs=-1, cv=5)
grid_search_rf = GridSearchCV(rf_model, param_grid_rf, **search_options)
grid_search_gb = GridSearchCV(gb_model, param_grid_gb, **search_options)
grid_search_xgb = GridSearchCV(xgb_model, param_grid_xgb, **search_options)

# Run the searches one after another
for search in (grid_search_rf, grid_search_gb, grid_search_xgb):
    search.fit(X_train, y_train)

# Report the winning hyperparameters of each search
print(f"Best Parameters RF: {grid_search_rf.best_params_}")
print(f"Best Parameters GB: {grid_search_gb.best_params_}")
print(f"Best Parameters XGB: {grid_search_xgb.best_params_}")

# Keep the estimators that were refit with the best hyperparameters
best_rf_model = grid_search_rf.best_estimator_
best_gb_model = grid_search_gb.best_estimator_
best_xgb_model = grid_search_xgb.best_estimator_

# Combine the three tuned models; with hard voting the ensemble predicts the
# class chosen by the majority of its members
ensemble_members = [
    ('rf', best_rf_model),
    ('gb', best_gb_model),
    ('xgb', best_xgb_model),
]
voting_clf = VotingClassifier(estimators=ensemble_members, voting='hard')

# Fit the ensemble on the full training set
voting_clf.fit(X_train, y_train)

# Predict class codes for the unlabelled rows and map them back to the
# original condition_text strings
predicted_codes = voting_clf.predict(test_data_scaled)
test_data['condition_text'] = target_encoder.inverse_transform(predicted_codes)

# Write the predictions into the rows of the original dataset whose
# condition_text was missing (the positional order matches test_data)
missing_label = daily_data['condition_text'].isnull()
daily_data.loc[missing_label, 'condition_text'] = test_data['condition_text'].values

# The submission holds every day_id together with its condition_text
submission_df = daily_data[['day_id', 'condition_text']]

# Write the submission to ../artifacts/submission.csv
submission_df.to_csv('../artifacts/submission.csv', index=False)
print("Submission file saved as submission.csv")