In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_score, KFold
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.linear_model import LassoCV
from xgboost import XGBRegressor
from sklearn.impute import SimpleImputer
import warnings
warnings.filterwarnings("ignore")

# Show full column contents when printing DataFrames
pd.set_option('display.max_colwidth', None)

# ---------------------------------------------
# Load and preprocess the training data
# ---------------------------------------------

# Target variable and the columns that must never be used as features
target = 'GT_NO2'
exclude_columns = ['ID_Zindi', 'Date', 'ID', target]

# Load training data
train_df = pd.read_csv('data/Train.csv')

# Convert 'Date' to datetime (values that cannot be parsed become NaT)
train_df['Date'] = pd.to_datetime(train_df['Date'], errors='coerce')

# Treat infinite values as missing
train_df = train_df.replace([np.inf, -np.inf], np.nan)

# Drop rows without a target BEFORE gap filling. Filling first would copy
# the target of a neighbouring row into these rows, which both fabricates
# labels and makes a later "drop rows with missing target" step a no-op.
train_df = train_df.dropna(subset=[target])

# Sort chronologically so that filling propagates values along time
train_df = train_df.sort_values('Date')

# Forward fill, then backward fill the leading gaps.
# DataFrame.ffill()/bfill() replace the deprecated fillna(method=...).
train_df = train_df.ffill().bfill()

# Drop rows that still contain NaN (only possible for columns that are
# entirely empty, since every other gap was filled above)
train_df = train_df.dropna()

# Original feature columns: everything except identifiers, date and target.
# Computed before the helper column 'day_of_year' is added so that it is
# not picked up as a feature.
original_features = [col for col in train_df.columns if col not in exclude_columns]

# Encode the day of the year as a point on the unit circle so that the end
# of December and the start of January end up close to each other
train_df['day_of_year'] = train_df['Date'].dt.dayofyear
train_df['day_sin'] = np.sin(2 * np.pi * train_df['day_of_year'] / 365.25)
train_df['day_cos'] = np.cos(2 * np.pi * train_df['day_of_year'] / 365.25)

# Add cyclical features to original features
original_features.extend(['day_sin', 'day_cos'])

# Define transformations
def identity(x):
    """Return the input unchanged (the no-op entry of the transformation set)."""
    return x

def log_transform(x):
    """Return log(1 + x) after clamping negative entries of *x* to zero.

    *x* must provide a ``clip(lower=...)`` method (e.g. a pandas Series).
    """
    non_negative = x.clip(lower=0)
    return np.log1p(non_negative)

def sqrt_transform(x):
    """Return the square root of *x* after clamping negative entries to zero.

    *x* must provide a ``clip(lower=...)`` method (e.g. a pandas Series).
    """
    non_negative = x.clip(lower=0)
    return np.sqrt(non_negative)

def square_transform(x):
    """Return the element-wise square of *x*."""
    squared = np.square(x)
    return squared

def reciprocal_transform(x):
    """Return 1 / (x + 1e-6).

    The small shift keeps an input of exactly zero from dividing by zero;
    an input of exactly -1e-6 is not protected by it.
    """
    shifted = x + 1e-6
    return 1 / shifted

# Registry of (suffix, function) pairs. The suffix is appended to a feature
# name to build the name of the derived column.
_transformation_suffixes = ('identity', 'log', 'sqrt', 'square', 'reciprocal')
_transformation_functions = (
    identity,
    log_transform,
    sqrt_transform,
    square_transform,
    reciprocal_transform,
)
transformations = list(zip(_transformation_suffixes, _transformation_functions))

# Build every (feature, transformation) column on the training data.
# A derived column is kept only when all of its values are finite; failures
# of a single transformation are reported and do not stop the loop.
transformed_feature_names = []
for source_column in original_features:
    for suffix, transform in transformations:
        derived_column = f"{source_column}_{suffix}"
        try:
            candidate = transform(train_df[source_column])
            if not np.isfinite(candidate).all():
                print(f"Transformation {suffix} resulted in non-finite values for feature {source_column}. Skipping.")
                continue
            train_df[derived_column] = candidate
            transformed_feature_names.append(derived_column)
        except Exception as err:
            print(f"Could not transform {source_column} using {suffix}: {err}")

# Degree-2 polynomial expansion (squares and pairwise products, no bias term)
poly = PolynomialFeatures(degree=2, interaction_only=False, include_bias=False)
X_poly = poly.fit_transform(train_df[original_features])
all_poly_names = poly.get_feature_names_out(original_features)

# Wrap the expansion in a DataFrame labelled with the generated names
df_poly = pd.DataFrame(X_poly, columns=all_poly_names)

# The expansion repeats the degree-1 terms; keep only the genuinely new columns
base_names = set(original_features)
poly_feature_names = [col for col in all_poly_names if col not in base_names]

# Attach the new columns side by side; both indexes are reset so that rows
# are matched by position
new_poly_columns = df_poly[poly_feature_names].reset_index(drop=True)
train_df = pd.concat([train_df.reset_index(drop=True), new_poly_columns], axis=1)

# Register the polynomial columns as candidate features
transformed_feature_names += poly_feature_names

# Prepare X and y (both come from train_df, so their rows already align)
X_train = train_df[transformed_feature_names]
y_train = train_df[target]

# Replace infinite values with NaN
X_train = X_train.replace([np.inf, -np.inf], np.nan)

# Drop candidate columns that contain NaN. LassoCV rejects NaN input, and
# the transformation loop applies the same policy by skipping non-finite
# features, so a column with missing values is discarded rather than imputed.
n_candidates = X_train.shape[1]
X_train = X_train.dropna(axis=1)
n_dropped = n_candidates - X_train.shape[1]
if n_dropped:
    print(f"Dropped {n_dropped} candidate feature(s) containing non-finite values.")

# Remove columns with constant values
X_train = X_train.loc[:, X_train.nunique() != 1]

# Standardize all candidates for the Lasso. This scaler is used for feature
# selection only; the scaler applied to the final model is fitted below.
selection_scaler = StandardScaler()
X_train_scaled = selection_scaler.fit_transform(X_train)

# Feature selection using LassoCV
lasso = LassoCV(cv=5, random_state=42, max_iter=10000)
lasso.fit(X_train_scaled, y_train)

# Keep the features whose Lasso coefficient is non-zero
coef = pd.Series(lasso.coef_, index=X_train.columns)
selected_features = coef[coef != 0].index.tolist()

print(f"Selected {len(selected_features)} features using LassoCV")

# Fail with a clear message instead of an obscure error from the scaler
if not selected_features:
    raise ValueError("LassoCV set every coefficient to zero; no features left to train on.")

# Prepare data with selected features
X_train_selected = X_train[selected_features]

# Standardize selected features; `scaler` is the object later applied to
# the test data, so it is fitted on the selected columns only
scaler = StandardScaler()
X_train_selected_scaled = scaler.fit_transform(X_train_selected)

# Hyper-parameters of the final gradient-boosted regressor
xgb_params = {
    'objective': 'reg:squarederror',
    'n_estimators': 100,
    'learning_rate': 0.1,
    'max_depth': 6,
    'random_state': 42,
}

# Fit on the complete training set using the scaled, selected features
xgb_model = XGBRegressor(**xgb_params)
xgb_model.fit(X_train_selected_scaled, y_train)

# ---------------------------------------------
# Process the test data
# ---------------------------------------------

# Load test data
test_df = pd.read_csv('data/Test.csv')

# Convert 'Date' to datetime (values that cannot be parsed become NaT)
test_df['Date'] = pd.to_datetime(test_df['Date'], errors='coerce')

# Treat infinite values as missing
test_df = test_df.replace([np.inf, -np.inf], np.nan)

# Sort chronologically so that filling propagates values along time
test_df = test_df.sort_values('Date')

# Forward fill, then backward fill the leading gaps.
# DataFrame.ffill()/bfill() replace the deprecated fillna(method=...).
test_df = test_df.ffill().bfill()

# Drop rows that still contain NaN.
# NOTE(review): after the fill above a NaN can only survive in a column that
# is completely empty, in which case this removes every row and no
# predictions are produced -- confirm that Test.csv has no such column.
test_df = test_df.dropna()

# Encode the day of the year as a point on the unit circle, using the same
# formula as for the training data
test_df['day_of_year'] = test_df['Date'].dt.dayofyear
test_df['day_sin'] = np.sin(2 * np.pi * test_df['day_of_year'] / 365.25)
test_df['day_cos'] = np.cos(2 * np.pi * test_df['day_of_year'] / 365.25)

# Apply the same transformations to test data, restricted to the derived
# columns that were selected during training.
#
# Every selected column MUST exist afterwards, because the model input is
# built by indexing test_df with the full list of selected features.
# Skipping a column here (as is done on the training side for non-finite
# results) would only postpone the failure to a KeyError. Non-finite results
# are therefore kept as NaN, which the scaler passes through and the
# XGBoost model treats as missing values.
selected_feature_set = set(selected_features)
for feature in original_features:
    for name, func in transformations:
        transformed_feature_name = f"{feature}_{name}"
        if transformed_feature_name not in selected_feature_set:
            continue
        try:
            transformed_values = func(test_df[feature])
        except Exception as e:
            # The same transformation succeeded on the training data, so a
            # failure here points at a schema/dtype mismatch in the test set.
            raise RuntimeError(
                f"Could not transform {feature} using {name} on the test data"
            ) from e
        if not np.isfinite(transformed_values).all():
            print(f"Transformation {name} resulted in non-finite values for feature {feature}. Keeping them as missing values.")
            transformed_values = transformed_values.replace([np.inf, -np.inf], np.nan)
        test_df[transformed_feature_name] = transformed_values

# Generate polynomial features for test data with the expansion fitted on
# the training data
X_test_poly = poly.transform(test_df[original_features])
df_test_poly = pd.DataFrame(X_test_poly, columns=poly.get_feature_names_out(original_features))

# Concatenate polynomial features with the test DataFrame; both indexes are
# reset so that rows are matched by position
test_df = pd.concat([test_df.reset_index(drop=True), df_test_poly[poly_feature_names].reset_index(drop=True)], axis=1)

# Prepare test data with selected features, in the training column order
X_test = test_df[selected_features]

# Replace infinite values with NaN
X_test = X_test.replace([np.inf, -np.inf], np.nan)

# Columns are deliberately NOT filtered here: the scaler and the model were
# fitted on exactly `selected_features`, so removing a column that happens
# to be constant in the test set would break the feature alignment.

# Standardize features using the scaler fitted on training data
X_test_scaled = scaler.transform(X_test)

# Make predictions on the test data
test_predictions = xgb_model.predict(X_test_scaled)

# Assemble the submission: one row per test record, pairing its identifier
# with the predicted value
submission_columns = {}
submission_columns['ID_Zindi'] = test_df['ID_Zindi']
submission_columns['GT_NO2_Predicted'] = test_predictions
submission = pd.DataFrame(submission_columns)

# Show a short preview of the result
preview = submission.head()
print("\nPredictions on Test Data:")
print(preview)

# Write the predictions to a CSV file without the index column
submission.to_csv('predictions.csv', index=False)