# Data Preprocessing for Enedis Challenge

This notebook handles the initial data loading and cleaning steps.

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

## Loading datasets

In [None]:
# Read the three challenge CSV files from the current working directory.
X_train = pd.read_csv('X_train_78VdSWL.csv')
X_test = pd.read_csv('X_test_XKVc4no.csv')
y_train = pd.read_csv('y_train_u0UkKEh.csv')

# Report the dimensions of each frame as a quick sanity check.
for label, frame in (("Train", X_train), ("Test", X_test), ("Target", y_train)):
    print(f"{label} shape: {frame.shape}")

# Last expression of the cell: rendered as a rich preview.
X_train.head()

## Setting timestamps as index

In [None]:
def _index_by_first_column(frame: pd.DataFrame) -> pd.DataFrame:
    """Return ``frame`` indexed by its first column.

    The frame is returned unchanged when it no longer carries the default
    ``RangeIndex``, i.e. when an index has already been set. This keeps the
    cell safe to re-run: without the guard, a second execution would promote
    the next column (an actual series) to the index and remove it from the data.
    """
    if not isinstance(frame.index, pd.RangeIndex):
        return frame
    return frame.set_index(frame.columns[0])


# Rebind instead of mutating in place so each name always refers to the
# indexed version of its frame.
X_train, X_test, y_train = (
    _index_by_first_column(frame) for frame in (X_train, X_test, y_train)
)

## Filling missing values with ground truth

In [None]:
# Fill gaps in X_train from y_train; with a DataFrame argument, fillna matches
# values by index and column labels, so only cells present in y_train are filled.
X_train = X_train.fillna(y_train)

# Any column that still contains a NaN could not be fully filled.
still_missing = X_train.isna().any()
na_columns = X_train.columns[still_missing]
print(f"Columns still with NaN: {list(na_columns)}")

# Discard those columns entirely.
X_train = X_train.drop(columns=na_columns)

X_train.head()

## Renaming holed series to avoid conflicts

In [None]:
# Relabel the trailing 999 columns of X_train as the strings "2001", "2002", ...
# NOTE(review): 999 is hard-coded while the test split uses the last 1000
# columns; presumably one column was removed by the NaN drop above -- confirm
# that exactly these 999 trailing columns are the intended ones.
last_cols = X_train.columns[-999:]
mapping = {old_name: str(2001 + offset) for offset, old_name in enumerate(last_cols)}
X_train = X_train.rename(columns=mapping)

## Splitting X_test into test and training parts

The last 1000 columns of `X_test` are the prediction targets; the remaining columns are kept as extra training series.

In [None]:
# Partition the columns of X_test: the trailing 1000 versus everything before.
target_cols = X_test.columns[-1000:]
training_cols = X_test.columns[:-1000]

test_data = X_test.loc[:, target_cols]
X_test_training = X_test.loc[:, training_cols]

print(f"Test data shape: {test_data.shape}")
print(f"Extra training shape: {X_test_training.shape}")

# Show the first rows of each part under its own banner.
for banner, frame in (
    ("\nTest data (holed series):", test_data),
    ("\nX_test_training (complete series):", X_test_training),
):
    print(banner)
    print(frame.head())

## Removing incomplete series

In [None]:
# Boolean mask over columns: True where the series has at least one NaN.
has_gap = X_test_training.isna().any()
bad_cols = X_test_training.columns[has_gap]
print(f"Dropping {len(bad_cols)} incomplete series")

# Keep only the columns without any missing value.
X_test_training = X_test_training.loc[:, ~has_gap]

## Verifying no overlapping IDs

In [None]:
# Collect the column labels of both training sources as sets.
ids_train = set(X_train.columns)
ids_test = set(X_test_training.columns)

# Labels present in both sets would collide once the frames are merged.
common = ids_train.intersection(ids_test)
if common:
    print(f"⚠ Found {len(common)} overlapping IDs")
else:
    print("✓ No common IDs found")

## Merging training sets and saving files

In [None]:
# Place the two training frames side by side; pd.concat with axis=1 aligns
# rows on the index (outer join by default).
merged_train = pd.concat([X_train, X_test_training], axis=1)

print(f"\nMerged training set: {merged_train.shape}")
print(f"Test set: {test_data.shape}")

# Persist both frames as CSV files in the working directory.
merged_train.to_csv("merged_train.csv")
test_data.to_csv("test_data.csv")

print("\n✓ Files saved successfully")

# The preview must be the last expression of the cell: a bare expression in
# the middle of a cell is evaluated but never rendered.
merged_train.head()