In [21]:
'''
Combine Train & Test
Sort the data - based on dates

Clustering Algo
Get the latitude and longitude - (stored in radian --needs to be converted)
Apply clustering algorithm -- create 3 clusters 

Create 3 data splits - based on cluster ID 

Now processing for each data separately 
Fill the missing values 





'''

'\nCombine Train & Test\nSort the data - based on dates\n\nClustering Algo\nGet the latitude and longitude - (stored in radian --needs to be converted)\nApply clustering alogrithm -- create 3 clusters \n\nCreate 3 data splites - based on cluster ID \n\nNow processing for each data separately \nFill the missing values \n\n\n\n\n\n'

In [22]:
import pandas as pd
import numpy as np

# Read the raw train and test tables (paths are relative to the working directory).
train_df, test_df = (
    pd.read_csv(csv_name) for csv_name in ("Train.csv", "Test.csv")
)

In [23]:
from sklearn.cluster import KMeans

# Combine Train and Test datasets into one frame with a fresh 0..n-1 index.
combined_df = pd.concat([train_df, test_df], ignore_index=True)

# Convert the 'Date' column to datetime for sorting.
# dayfirst=True matches the day-first parsing that
# DataTransformer.add_monthly_averages requests for the same column; without it,
# ambiguous strings such as 03/04/2019 can be read month-first and the sort
# order (and every later month-based feature) is wrong.
# Unparseable values become NaT because of errors='coerce'.
combined_df['Date'] = pd.to_datetime(
    combined_df['Date'], errors='coerce', dayfirst=True
)

# Sort by date and renumber the rows. Done as one chained assignment instead of
# inplace=True so the cell stays idempotent when re-run.
combined_df = combined_df.sort_values(by='Date').reset_index(drop=True)

# Radian copies of the coordinates, used as the clustering inputs.
# NOTE(review): np.radians treats LAT/LON as degrees — confirm the raw units.
combined_df['LAT_rad'] = np.radians(combined_df['LAT'])
combined_df['LON_rad'] = np.radians(combined_df['LON'])

# Group the rows into 3 spatial clusters; random_state pins the initialisation
# so cluster IDs are reproducible between runs.
kmeans = KMeans(n_clusters=3, random_state=42)
combined_df['Cluster_ID'] = kmeans.fit_predict(combined_df[['LAT_rad', 'LON_rad']])

# Split the data into three independent subsets based on the cluster IDs.
# .copy() detaches each subset from combined_df.
cluster_0_df = combined_df[combined_df['Cluster_ID'] == 0].copy()
cluster_1_df = combined_df[combined_df['Cluster_ID'] == 1].copy()
cluster_2_df = combined_df[combined_df['Cluster_ID'] == 2].copy()

  combined_df['Date'] = pd.to_datetime(combined_df['Date'], errors='coerce')


In [24]:
# (rows, columns) of the cluster-0 subset.
cluster_0_df.shape

(18632, 17)

In [25]:
# (rows, columns) of the cluster-1 subset.
cluster_1_df.shape

(52608, 17)

In [26]:
# (rows, columns) of the cluster-2 subset.
cluster_2_df.shape

(21920, 17)

In [27]:
from sklearn.preprocessing import MinMaxScaler
from scipy.interpolate import splrep, splev
from statsmodels.tsa.seasonal import STL
import numpy as np
import pandas as pd

class DataTransformer:
    """Gap-fill and feature-engineer a private copy of a DataFrame.

    Each method adds columns to ``self.df``. ``transform`` runs them in the
    required order: scaling + spline gap filling, rolling statistics, monthly
    averages, STL decomposition, then sine/cosine terms. All stages after the
    first read the ``<feature>_bspline`` columns created by
    ``scale_and_interpolate``, so that method must run before them.
    """

    def __init__(self, df):
        """Keep a copy of ``df`` so the caller's frame is never modified."""
        self.df = df.copy()

    @staticmethod
    def _warn_if_truncated(stage, **named_sequences):
        """Warn when argument lists that are zipped together differ in length.

        ``zip`` stops at the shortest input, so surplus entries (for example
        features without a matching period) are skipped silently. A warning
        makes that visible without turning existing calls into errors.
        """
        import warnings  # local import: not part of the notebook's import cell

        try:
            lengths = {name: len(seq) for name, seq in named_sequences.items()}
        except TypeError:
            # Unsized iterables (e.g. generators) cannot be checked up front.
            return
        if len(set(lengths.values())) > 1:
            warnings.warn(
                f"{stage}: argument lengths differ ({lengths}); only the first "
                f"{min(lengths.values())} entries of each are used.",
                UserWarning,
                stacklevel=3,
            )

    def scale_and_interpolate(self, features):
        """Min-max scale each feature and fill its non-finite entries with a spline.

        For every ``feature`` this adds ``<feature>_scaled`` (scaled values,
        gaps kept) and ``<feature>_bspline`` (scaled values with gaps filled),
        then overwrites ``feature`` itself with the gap-filled values mapped
        back to the original scale.

        The spline is fitted with ``splrep`` defaults, using the frame's index
        values as the x-axis.
        NOTE(review): this assumes a numeric index sorted in ascending order
        without duplicates — confirm for frames other than the cluster splits.
        """
        for feature in features:
            scaled_col = f'{feature}_scaled'
            spline_col = f'{feature}_bspline'

            scaler = MinMaxScaler()
            self.df[scaled_col] = scaler.fit_transform(self.df[[feature]])

            mask_finite = np.isfinite(self.df[scaled_col].values)
            self.df[spline_col] = self.df[scaled_col]

            # Only fit a spline when there is something to fill: a complete
            # column needs no interpolation, and fitting would fail needlessly
            # on very short columns.
            if not mask_finite.all():
                known_indices = self.df.index.values[mask_finite]
                known_values = self.df[scaled_col].values[mask_finite]
                missing_indices = self.df.index.values[~mask_finite]

                tck = splrep(known_indices, known_values)
                # Gaps before the first / after the last known point are
                # extrapolated by splev's default behaviour.
                self.df.loc[~mask_finite, spline_col] = splev(missing_indices, tck)

            self.df[feature] = scaler.inverse_transform(self.df[[spline_col]])

    def add_moving_stats(self, window_size, features):
        """Add rolling mean and standard deviation of each ``<feature>_bspline``.

        Creates ``<feature>_ma_<window_size>`` and ``<feature>_std_<window_size>``.
        The window runs over rows in their current order, so the first
        ``window_size - 1`` rows are NaN.
        """
        for feature in features:
            rolling = self.df[f'{feature}_bspline'].rolling(window=window_size)
            self.df[f'{feature}_ma_{window_size}'] = rolling.mean()
            self.df[f'{feature}_std_{window_size}'] = rolling.std()

    def add_monthly_averages(self, features):
        """Add a ``month`` column and, per feature, its mean for that calendar month.

        ``Date`` is (re)parsed day-first; unparseable values become NaT. The
        average pools all rows sharing a month number, regardless of year, and
        is stored as ``monthly_avg_<feature>``.
        """
        self.df['Date'] = pd.to_datetime(self.df['Date'], errors='coerce', dayfirst=True)
        self.df['month'] = self.df['Date'].dt.month
        for feature in features:
            monthly_avg = self.df.groupby('month')[f'{feature}_bspline'].mean()
            self.df[f'monthly_avg_{feature}'] = self.df['month'].map(monthly_avg)

    def add_seasonal_decomposition(self, features, periods):
        """Add STL trend / seasonal / residual columns for paired features and periods.

        ``features`` and ``periods`` are paired positionally; if their lengths
        differ, the surplus entries are skipped and a warning is emitted.
        Creates ``<feature>_trend``, ``<feature>_seasonal``, ``<feature>_residual``.
        """
        self._warn_if_truncated(
            'add_seasonal_decomposition', features=features, periods=periods
        )
        for feature, period in zip(features, periods):
            stl = STL(self.df[f'{feature}_bspline'], period=period)
            result = stl.fit()
            self.df[f'{feature}_trend'] = result.trend
            self.df[f'{feature}_seasonal'] = result.seasonal
            self.df[f'{feature}_residual'] = result.resid

    def add_fourier_transform(self, periods, n_harmonics, columns):
        """Add sine/cosine terms ``<col>_sin_<k>`` / ``<col>_cos_<k>`` for k = 1..harmonics.

        For a datetime column the terms are computed from whole days elapsed
        since the column's earliest value; for any other column they are
        computed from the column's raw values. ``columns``, ``periods`` and
        ``n_harmonics`` are paired positionally; if their lengths differ, the
        surplus entries are skipped and a warning is emitted.
        """
        self._warn_if_truncated(
            'add_fourier_transform',
            columns=columns, periods=periods, n_harmonics=n_harmonics,
        )
        for col, period, harmonics in zip(columns, periods, n_harmonics):
            if pd.api.types.is_datetime64_any_dtype(self.df[col]):
                t = (self.df[col] - self.df[col].min()).dt.days.values
            else:
                t = self.df[col].values
            for k in range(1, harmonics + 1):
                angle = 2 * np.pi * k * t / period
                self.df[f'{col}_sin_{k}'] = np.sin(angle)
                self.df[f'{col}_cos_{k}'] = np.cos(angle)

    def transform(self, features, window_size, fourier_periods, fourier_harmonics,
                  seasonal_periods, fourier_columns=None):
        """Run every stage in order and return the transformed frame.

        Parameters:
            features: columns to scale, gap-fill and derive features from.
            window_size: rolling window length for ``add_moving_stats``.
            fourier_periods, fourier_harmonics: passed to
                ``add_fourier_transform``, paired positionally with the
                Fourier columns.
            seasonal_periods: STL periods, paired positionally with ``features``.
            fourier_columns: columns that drive the sine/cosine terms. Defaults
                to ``features``, which keeps the existing behaviour of
                computing the terms from the gap-filled feature values. Pass a
                datetime column (e.g. ``['Date']``) to derive them from
                elapsed days instead.

        Returns:
            ``self.df`` with all derived columns added.
        """
        if fourier_columns is None:
            fourier_columns = features

        self.scale_and_interpolate(features)
        self.add_moving_stats(window_size, features)
        self.add_monthly_averages(features)
        self.add_seasonal_decomposition(features, seasonal_periods)
        self.add_fourier_transform(fourier_periods, fourier_harmonics, fourier_columns)
        return self.df


In [28]:
# Columns to gap-fill and expand into derived features for cluster 0.
features = [
    'LST', 'AAI', 'CloudFraction',
    'NO2_strat', 'NO2_total', 'NO2_trop',
    'TropopausePressure',
]
transformer_df_cluster_0 = DataTransformer(cluster_0_df)
# NOTE(review): the period/harmonic lists hold 4 entries against 7 features;
# DataTransformer pairs them with zip(), so only the first 4 features get
# seasonal-decomposition and Fourier columns — confirm this is intended.
transformed_df_cluster_0 = transformer_df_cluster_0.transform(
    features,
    window_size=7,
    seasonal_periods=[365, 182, 182, 91],
    fourier_periods=[365.25, 182, 182, 91],
    fourier_harmonics=[4, 3, 3, 3],
)

In [29]:
# Columns to gap-fill and expand into derived features for cluster 1.
features = [
    'LST', 'AAI', 'CloudFraction',
    'NO2_strat', 'NO2_total', 'NO2_trop',
    'TropopausePressure',
]
transformer_df_cluster_1 = DataTransformer(cluster_1_df)
# NOTE(review): the period/harmonic lists hold 4 entries against 7 features;
# DataTransformer pairs them with zip(), so only the first 4 features get
# seasonal-decomposition and Fourier columns — confirm this is intended.
transformed_df_cluster_1 = transformer_df_cluster_1.transform(
    features,
    window_size=7,
    seasonal_periods=[365, 182, 182, 91],
    fourier_periods=[365.25, 182, 182, 91],
    fourier_harmonics=[4, 3, 3, 3],
)

In [30]:
# Columns to gap-fill and expand into derived features for cluster 2.
features = [
    'LST', 'AAI', 'CloudFraction',
    'NO2_strat', 'NO2_total', 'NO2_trop',
    'TropopausePressure',
]
transformer_df_cluster_2 = DataTransformer(cluster_2_df)
# NOTE(review): the period/harmonic lists hold 4 entries against 7 features;
# DataTransformer pairs them with zip(), so only the first 4 features get
# seasonal-decomposition and Fourier columns — confirm this is intended.
transformed_df_cluster_2 = transformer_df_cluster_2.transform(
    features,
    window_size=7,
    seasonal_periods=[365, 182, 182, 91],
    fourier_periods=[365.25, 182, 182, 91],
    fourier_harmonics=[4, 3, 3, 3],
)

In [31]:
# Stack the three per-cluster results into one frame, renumbering rows from 0.
# Rows end up grouped by cluster (0, then 1, then 2), not in global date order.
transformed_combined_df = pd.concat(
    (
        transformed_df_cluster_0,
        transformed_df_cluster_1,
        transformed_df_cluster_2,
    ),
    ignore_index=True,
)


In [39]:
# Recover the train / test partitions by ID_Zindi membership in the raw frames,
# giving each result its own 0-based index.
train_transformed_df = (
    transformed_combined_df
    .loc[transformed_combined_df['ID_Zindi'].isin(train_df['ID_Zindi'])]
    .reset_index(drop=True)
)
test_transformed_df = (
    transformed_combined_df
    .loc[transformed_combined_df['ID_Zindi'].isin(test_df['ID_Zindi'])]
    .reset_index(drop=True)
)

In [40]:
# (rows, columns) of the transformed training partition.
train_transformed_df.shape

(86584, 91)

In [41]:
# (rows, columns) of the transformed test partition.
test_transformed_df.shape

(6576, 91)

In [42]:
# Persist both partitions. index=True is the pandas default, spelled out here
# because it writes the row index as an extra unnamed first column that readers
# of these files must account for.
train_transformed_df.to_csv("New_trans_train.csv", index=True)
test_transformed_df.to_csv("New_trans_test.csv", index=True)

In [43]:
# Training rows assigned to cluster 0, as an independent copy.
train_cluster_0_df = train_transformed_df.loc[
    train_transformed_df['Cluster_ID'].eq(0)
].copy()

In [44]:
# (rows, columns) of the cluster-0 training subset.
train_cluster_0_df.shape

(16440, 91)

In [45]:
# Save the cluster-0 training subset. index=True is the pandas default: the
# (filtered, non-contiguous) row index is written as an unnamed first column.
train_cluster_0_df.to_csv("Cluster_0_train.csv", index=True)