# Data Preprocessing for Enedis Challenge

This notebook loads the raw challenge CSVs, cleans them, and exports a merged training set (`merged_train.csv`) and the prediction target set (`test_data.csv`).

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np


## Loading datasets 

In [None]:
# Read the train, test and target CSVs from the local data/ folder.
train_df, test_df, target_df = map(
    pd.read_csv,
    (
        'data/X_train_78VdSWL.csv',
        'data/X_test_XKVc4no.csv',
        'data/y_train_u0UkKEh.csv',
    ),
)

# Print the (rows, columns) shape of each frame as a quick sanity check.
print(f"Train shape: {train_df.shape}")
print(f"Test shape: {test_df.shape}")
print(f"Target shape: {target_df.shape}")

In [None]:
# Show the first rows of each raw frame; display() renders every argument in turn.
display(train_df.head(), test_df.head(), target_df.head())

## Setting timestamps as index

In [None]:
# Promote the first CSV column of each frame to its index (in place).
#
# The RangeIndex guard keeps this cell safe to re-run: read_csv without
# index_col yields a default RangeIndex, so a frame whose index is no longer
# a RangeIndex has already been processed. Without the guard, a second run
# would move the first *data* column into the index and discard the previous one.
for df in [train_df, test_df, target_df]:
    if isinstance(df.index, pd.RangeIndex):
        df.set_index(df.columns[0], inplace=True)

## Completing training data with ground truth values

In [None]:
# Fill missing training cells from target_df. When fillna receives a DataFrame
# it matches on index and column labels, so cells without a counterpart in
# target_df stay NaN.
train_df = train_df.fillna(value=target_df)

# List every column that still has at least one NaN after the fill ...
na_cols = train_df.columns[train_df.isna().any(axis=0)]
print(f"Columns still containing NaN: {list(na_cols)}")

# ... and remove those columns entirely.
train_df = train_df.drop(na_cols, axis=1)

## Renaming holed series with unique identifiers

In [None]:
# NOTE(review): assumes the holed series are exactly the last 999 columns of
# train_df at this point — confirm against the data.
holed_series = train_df.columns[-999:]

# Map each of those columns, in order, to a string ID counting up from "2001".
new_names = dict(
    zip(holed_series, map(str, range(2001, 2001 + len(holed_series))))
)
train_df = train_df.rename(columns=new_names)

## Separating prediction target from additional training data

The last 1000 series (columns) of the test set are the ones we need to predict; the remaining series are kept as additional training data.

In [None]:
# Split the test frame by column position: the trailing 1000 columns form the
# submission target, everything before them becomes extra training material.
target_cols = test_df.columns[-1000:]
other_cols = test_df.columns[:-1000]

prediction_target = test_df.loc[:, target_cols]
extra_train = test_df.loc[:, other_cols]

print(f"Target for submission: {prediction_target.shape}")
print(f"Extra training series: {extra_train.shape}")

## Removing corrupted series from extra training data

In [None]:
# A series counts as corrupted when its column holds at least one NaN.
has_missing = extra_train.isna().any(axis=0)
corrupted = extra_train.columns[has_missing]

print(f"Removing {len(corrupted)} corrupted series")
extra_train = extra_train.drop(corrupted, axis=1)

## Checking for duplicate series IDs

In [None]:
# Verify that series IDs (column labels) are unique across the data that will
# be concatenated column-wise.
train_ids = set(train_df.columns)
extra_ids = set(extra_train.columns)

# IDs that appear in both frames.
overlap = train_ids.intersection(extra_ids)

# IDs repeated inside a single frame. Building a set collapses repeated
# labels, so the cross-frame comparison alone would miss these.
repeated_within = set(train_df.columns[train_df.columns.duplicated()]) | set(
    extra_train.columns[extra_train.columns.duplicated()]
)

duplicate_ids = overlap | repeated_within
if not duplicate_ids:
    print("✓ All series have unique identifiers")
else:
    print(f"⚠ Warning: {len(duplicate_ids)} duplicate IDs found")

## Consolidating all training data and exporting

In [None]:
# Put the extra series next to the original training series
# (column-wise concatenation).
complete_train = pd.concat([train_df, extra_train], axis="columns")

# Write both final frames to CSV in the working directory (index included).
complete_train.to_csv(path_or_buf="merged_train.csv")
prediction_target.to_csv(path_or_buf="test_data.csv")

# Summary of what was written.
print(f"\nFinal training dataset: {complete_train.shape}")
print(f"Prediction target dataset: {prediction_target.shape}")
print("\n✓ Export completed successfully")

In [None]:
# Preview the first rows of both exported frames; display() renders each argument in turn.
display(complete_train.head(), prediction_target.head())