In [1]:
# -----------------------------
# TESTING DATA PREPROCESSING
# -----------------------------
# ---------------------------
# STEP 1: IMPORT LIBRARIES
# ---------------------------
import pandas as pd
import os

# ---------------------------
# STEP 2: DEFINE PATHS
# ---------------------------
test_folder = "../data/raw/testing/"
output_path = "../data/cleaned/"
os.makedirs(output_path, exist_ok=True)  # create if not exist

save_file = os.path.join(output_path, "concatenated_testing_cleaned.csv")

# ---------------------------
# STEP 3: LIST ALL CSV FILES IN TEST FOLDER
# ---------------------------
test_files = [f for f in os.listdir(test_folder) if f.endswith(".csv")]
print("Test files found:", test_files)

# ---------------------------
# STEP 4: LOAD AND CONCATENATE ALL TEST FILES
# ---------------------------
df_list = []

for file in test_files:
    file_path = os.path.join(test_folder, file)
    df = pd.read_csv(file_path)
    df_list.append(df)

df_test = pd.concat(df_list, ignore_index=True)
print("Shape of merged test dataset:", df_test.shape)

# ---------------------------
# STEP 5: CHECK MISSING VALUES & DUPLICATES
# ---------------------------
print("\nMissing values per column:")
print(df_test.isna().sum())

duplicate_count = df_test.duplicated().sum()
print("\nTotal duplicates:", duplicate_count)

# ---------------------------
# STEP 6: CONVERT DATETIME COLUMN
# ---------------------------
df_test['datetime'] = pd.to_datetime(df_test['datetime'], errors='coerce')

# ---------------------------
# STEP 7: EXTRACT TIME-BASED FEATURES
# ---------------------------
df_test['year'] = df_test['datetime'].dt.year
df_test['month'] = df_test['datetime'].dt.month
df_test['day'] = df_test['datetime'].dt.day
df_test['hour'] = df_test['datetime'].dt.hour
df_test['day_of_week'] = df_test['datetime'].dt.dayofweek
df_test['is_weekend'] = df_test['day_of_week'].isin([5,6]).astype(int)

# Optional: seasons
def get_season(month):
    if month in [12,1,2]:
        return "winter"
    elif month in [3,4,5]:
        return "spring"
    elif month in [6,7,8]:
        return "summer"
    else:
        return "autumn"

df_test['season'] = df_test['month'].apply(get_season)

# ---------------------------
# STEP 8: SAVE CLEANED TEST DATA
# ---------------------------
df_test.to_csv(save_file, index=False)
print("\nCleaned testing dataset saved at:", save_file)


Test files found: ['islamabad_complete_data_july_to_dec_2024.csv', 'karachi_complete_data_july_to_dec_2024.csv', 'lahore_complete_data_july_to_dec_2024.csv', 'peshawar_complete_data_july_to_dec_2024.csv', 'quetta_complete_data_july_to_dec_2024.csv']
Shape of merged test dataset: (21792, 18)

Missing values per column:
datetime                0
main_aqi                0
components_co           0
components_no           0
components_no2          0
components_o3           0
components_so2          0
components_pm2_5        0
components_pm10         0
components_nh3          0
temperature_2m          0
relative_humidity_2m    0
dew_point_2m            0
precipitation           0
surface_pressure        0
wind_speed_10m          0
wind_direction_10m      0
shortwave_radiation     0
dtype: int64

Total duplicates: 0

Cleaned testing dataset saved at: ../data/cleaned/concatenated_testing_cleaned.csv


In [2]:
df_test.head()

Unnamed: 0,datetime,main_aqi,components_co,components_no,components_no2,components_o3,components_so2,components_pm2_5,components_pm10,components_nh3,...,wind_speed_10m,wind_direction_10m,shortwave_radiation,year,month,day,hour,day_of_week,is_weekend,season
0,2024-01-07 00:00:00,3,747.68,0.0,7.63,98.71,5.78,48.03,53.2,8.99,...,15.379206,106.31393,0,2024.0,1.0,7.0,0.0,6.0,1,winter
1,2024-01-07 01:00:00,3,801.09,0.01,9.94,91.55,5.3,48.77,53.33,8.87,...,7.69592,100.784256,18,2024.0,1.0,7.0,1.0,6.0,1,winter
2,2024-01-07 02:00:00,3,988.01,0.48,20.39,79.39,6.38,49.33,53.77,10.13,...,8.39657,120.96369,107,2024.0,1.0,7.0,2.0,6.0,1,winter
3,2024-01-07 03:00:00,3,1295.09,2.88,33.24,77.25,9.18,49.95,54.29,11.27,...,5.771239,93.57626,275,2024.0,1.0,7.0,3.0,6.0,1,winter
4,2024-01-07 04:00:00,3,1121.52,2.43,23.99,120.16,15.14,42.27,46.51,9.25,...,3.096837,234.46223,446,2024.0,1.0,7.0,4.0,6.0,1,winter


In [3]:
# ---------------------------
# Data Preparation for Modeling
# ---------------------------




# STEP 1: IMPORT LIBRARIES
# ---------------------------
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# ---------------------------
# STEP 2: LOAD CLEANED TRAINING & TESTING DATA
# ---------------------------
train_file = "../data/cleaned/concatenated_training_cleaned.csv"
test_file = "../data/cleaned/concatenated_testing_cleaned.csv"

df_train = pd.read_csv(train_file)
df_test = pd.read_csv(test_file)

print("Training data shape:", df_train.shape)
print("Testing data shape:", df_test.shape)

# ---------------------------
# STEP 3: FEATURE & TARGET SEPARATION
# ---------------------------
target_col = "main_aqi"

# Drop 'datetime' from features
feature_cols = [col for col in df_train.columns if col not in [target_col, 'datetime']]

X_train = df_train[feature_cols]
y_train = df_train[target_col]

X_test = df_test[feature_cols]
y_test = df_test[target_col]

print("Features shape:", X_train.shape, "Target shape:", y_train.shape)

# ---------------------------
# STEP 4: ENCODING CATEGORICAL FEATURES
# ---------------------------
# Identify categorical columns
categorical_cols = ['season']  # currently only 'season'
numeric_cols = [col for col in feature_cols if col not in categorical_cols]

# Column transformer: one-hot encode categorical, scale numeric
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_cols),
        ('cat', OneHotEncoder(drop='first'), categorical_cols)  # drop first to avoid dummy trap
    ]
)

# Fit and transform training data
X_train_prepared = preprocessor.fit_transform(X_train)
X_test_prepared = preprocessor.transform(X_test)

print("Prepared X_train shape:", X_train_prepared.shape)
print("Prepared X_test shape:", X_test_prepared.shape)

# ---------------------------
# STEP 5: SAVE PREPARED ARRAYS (OPTIONAL)
# ---------------------------
# You can save the preprocessor for later use in model training or GUI
import joblib
joblib.dump(preprocessor, "../data/models/preprocessor.pkl")
print("Preprocessor saved at: ../data/models/preprocessor.pkl")

Training data shape: (123134, 25)
Testing data shape: (21792, 25)
Features shape: (123134, 23) Target shape: (123134,)
Prepared X_train shape: (123134, 25)
Prepared X_test shape: (21792, 25)
Preprocessor saved at: ../data/models/preprocessor.pkl


In [5]:
#Encoding Season Column

# STEP: LOAD CLEANED DATA
import pandas as pd
import os

train_file = "../data/cleaned/concatenated_training_cleaned.csv"
test_file = "../data/cleaned/concatenated_testing_cleaned.csv"

df_train = pd.read_csv(train_file)
df_test = pd.read_csv(test_file)

# STEP: ENCODE 'season' COLUMN

season_mapping = {'spring': 0, 'summer': 1, 'autumn': 2, 'winter': 3}

df_train['season'] = df_train['season'].map(season_mapping)
df_test['season'] = df_test['season'].map(season_mapping)

# STEP: SAVE UPDATED FILES

df_train.to_csv("../data/cleaned/concatenated_training_cleaned.csv", index=False)
df_test.to_csv("../data/cleaned/concatenated_testing_cleaned.csv", index=False)

print("Season column encoded and files saved successfully in cleaned folder!")


Season column encoded and files saved successfully in cleaned folder!
