In [None]:
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import numpy as np
import sys

# --- Configuration -----------------------------------------------------------
# Workbook to read, the sheet inside it, and the CSV written at the end.
INPUT_FILE = 'Full.xlsx'
TARGET_SHEET_NAME = 'merged'
OUTPUT_FILE = 'Final_Borneo_Dataset.csv'

# Name of the categorical column that is one-hot encoded later in the script.
REGION_COL_NAME = 'region'

print(f"Reading Excel file: {INPUT_FILE} from sheet: {TARGET_SHEET_NAME}...")

# Load the sheet into `df`. Every failure path prints a short diagnostic and
# stops with a non-zero exit status, so a shell or scheduler running this
# script can tell that the load failed (a bare sys.exit() reports success).
try:
    df = pd.read_excel(INPUT_FILE, sheet_name=TARGET_SHEET_NAME)
except FileNotFoundError:
    print(" Error: Input file not found! Please check the file name.")
    sys.exit(1)
except ValueError:
    # This handler treats any ValueError from read_excel as a missing
    # worksheet and reports it that way.
    print(f" Error: Sheet name '{TARGET_SHEET_NAME}' not found. Please check your Excel file.")
    sys.exit(1)
except Exception as e:
    # Last-resort boundary for anything else raised while reading the file.
    print(f" Error reading Excel file: {e}")
    sys.exit(1)

# Row count before any cleaning; used for the removal report below.
initial_rows = df.shape[0]
print(f"Original dataset rows: {initial_rows}")


TARGET_COL = 'frp'
# Weather features are recognised purely by the 'Day_' marker in their name.
weather_cols = [name for name in df if 'Day_' in name]
print(f"   Identified {len(weather_cols)} weather features.")

# Columns that together identify one record for deduplication purposes.
core_cols = ['latitude', 'longitude', 'acq_date', 'acq_time', TARGET_COL]

# Flag every repeat of an already-seen key combination and keep only the
# first occurrence of each.
is_repeat = df.duplicated(subset=core_cols, keep='first')
df_deduped = df[~is_repeat]
deleted_rows = initial_rows - df_deduped.shape[0]
print(f" Precision Deduplication: Removed {deleted_rows} exact duplicate rows.")

# A row is usable only when the target and every weather feature are present.
required_cols = [*weather_cols, TARGET_COL]
df_final_clean = df_deduped.dropna(subset=required_cols)
rows_with_nan = df_deduped.shape[0] - df_final_clean.shape[0]
print(f"   Removed {rows_with_nan} rows containing NaN values.")
print(f"   Final rows for processing: {df_final_clean.shape[0]}")

# Split the cleaned frame into three pieces (time/date, weather, core). Each
# piece gets a fresh 0..n-1 index so the column-wise concatenations below and
# later in the script line up row by row.
df_time_date = df_final_clean[['acq_time', 'acq_date']].reset_index(drop=True)

df_weather = df_final_clean[weather_cols].reset_index(drop=True)

# The region column is optional. It is only requested when it exists;
# selecting a missing column would raise KeyError before the
# "skip encoding" fallback further down could ever run.
has_region = REGION_COL_NAME in df_final_clean.columns
CORE_FEATURES_BASE = ['latitude', 'longitude', TARGET_COL]
if has_region:
    CORE_FEATURES_BASE.append(REGION_COL_NAME)
df_core = df_final_clean[CORE_FEATURES_BASE].reset_index(drop=True)

print(" Extracting temporal features (Month, Hour)...")

df_time_date['acq_date'] = pd.to_datetime(df_time_date['acq_date'])

df_core['Month'] = df_time_date['acq_date'].dt.month

# The hour is the first two characters of acq_time after left-padding it to
# four characters (presumably an HHMM encoding - confirm against the data
# source). A trailing ".0" is stripped first: if pandas holds the column as
# float, 130.0 would otherwise become the text "130.0" and be read as hour 13
# instead of hour 1.
acq_time_text = (
    df_time_date['acq_time']
    .astype(str)
    .str.replace(r'\.0+$', '', regex=True)
    .str.zfill(4)
)
df_core['Hour'] = acq_time_text.str[:2].astype(int)

if has_region:
    print(f"  One-Hot Encoding '{REGION_COL_NAME}'...")
    # One indicator column per region value; the original column is replaced
    # by the indicators, which are appended after the remaining core columns.
    df_region_encoded = pd.get_dummies(df_core[REGION_COL_NAME], prefix=REGION_COL_NAME, drop_first=False)
    df_core = pd.concat([df_core.drop(columns=[REGION_COL_NAME]), df_region_encoded], axis=1)
else:
    print(f" Warning: Region column '{REGION_COL_NAME}' not found. Skipping encoding.")

# Number of weather columns actually present, reported in the messages below
# instead of a hard-coded figure.
n_weather_features = df_weather.shape[1]

# Stop early with a readable message when there is nothing to decompose
# (no rows survived cleaning, or no weather columns were identified).
if df_weather.empty:
    print(" Error: No weather data available for PCA (no rows or no weather columns).")
    sys.exit(1)

print(f" Standardizing and running PCA on {n_weather_features} weather features...")

# Put every feature on zero mean / unit variance before the decomposition.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(df_weather)

# A fractional n_components asks PCA to keep as many components as needed to
# explain that share (95%) of the variance.
pca = PCA(n_components=0.95)
X_pca = pca.fit_transform(X_scaled)
n_components = X_pca.shape[1]

print(f" PCA Successful! Reduced {n_weather_features} columns to {n_components} components.")
print(f"   Retained {pca.explained_variance_ratio_.sum()*100:.2f}% of original information.")


# Wrap the component matrix in a frame with 1-based Weather_PC<k> column names.
pca_cols = [f'Weather_PC{i+1}' for i in range(n_components)]
df_pca = pd.DataFrame(X_pca, columns=pca_cols)

# Place the core columns and the PCA weather components side by side to form
# the final table.
df_final = pd.concat((df_core, df_pca), axis='columns')

# Write the result without the index column.
df_final.to_csv(OUTPUT_FILE, index=False)

# Closing summary, framed by rule lines.
heavy_rule = "=" * 60
light_rule = "-" * 30
print("\n" + heavy_rule)
print(f"File saved to: {OUTPUT_FILE}")
print(f"Final Data Shape: {df_final.shape}")
print(light_rule)
print(heavy_rule)