# Utility functions

In [1]:
import pandas as pd
import numpy as np

## Utility: Encoder for categorical data

In [2]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

class Encoder:
    '''
        A class to wrap up encoding functions.
    '''
    def __init__(
            self,
            categorical_features: list[str],
            categorical_encoding: str = "labelEncoder"
    ) -> None:
        self.categorical_features = categorical_features
        self.categorical_encoding = categorical_encoding

        if self.categorical_encoding not in ["labelEncoder", "oneHot"]:
            print("Categorical encoder not recognized. Please use 'labelEncoder' or 'oneHot'.")
            print("'labelEncoder' will be used by default.")
            self.categorical_encoding = "labelEncoder"

    def encode(
            self,
            df: pd.DataFrame
    ) -> pd.DataFrame:
        '''
            Encode given categorical features.

            Parameters
            ----------
            categorical_encoding: supports keywords "labelEncoder" and "oneHot"
        '''
        if self.categorical_encoding == 'labelEncoder':
            return self.encode_labelEncoder(df)
        elif self.categorical_encoding == 'oneHot':
            return self.encode_oneHot(df)
        else:
            print("WTF bro")

    def encode_labelEncoder(
            self,
            df: pd.DataFrame
    ) -> pd.DataFrame:
        for feature in self.categorical_features:
            df[feature] = LabelEncoder().fit_transform(df[feature])
        return df

    def encode_oneHot(
            self,
            df: pd.DataFrame
    ) -> pd.DataFrame:
        df_cat = pd.DataFrame()
        for feature in self.categorical_features:
            one_hot_encoding = OneHotEncoder().fit_transform(df[[feature]]).toarray()
            for j in range(one_hot_encoding.shape[1]):
                df_cat[feature + f'_{j}'] = one_hot_encoding[:, j]
        df = df.drop(self.categorical_features, axis=1).join(df_cat)
        return df

## Utility: Data cleaning (date conversion, drop and fill nan)

In [3]:
from datetime import datetime

# Date conversion
def days_since_start_of_2020(datetime_series):
    # Convertir la date en un objet datetime
    parsed_date = pd.to_datetime(datetime_series)
    
    # Fixer la date de référence (1er janvier 2022)
    ref_date = datetime(2020, 1, 1)
    
    # Calculer le delta entre la date et le 1er janvier 2022
    delta = parsed_date - ref_date
    
    # Retourner le nombre de jours écoulés
    return delta.dt.days

# For insee data especially
def force_to_numeric(df: pd.DataFrame, features: list[str]) -> pd.DataFrame:
    for feature in features:
        df[feature] = pd.to_numeric(df[feature], errors='coerce')
    return df

# Fill NaN fields with median values (default)
def drop_fill_na(df: pd.DataFrame, mode: str = 'median') -> pd.DataFrame:
    df = df.dropna(axis=1, how='all')   # Drop all features that are 'empty'
    if mode == 'median':
        df = df.fillna(df.median())
        return df
    elif mode == 'mean':
        df = df.fillna(df.mean())
        return df
    else:
        print("Unknown mode")

## Utility: Split dataframes

In [4]:
from sklearn.model_selection import train_test_split

def split(
        df_features: pd.DataFrame,
        df_labels: pd.DataFrame,
        test_size: float = 0.2,
        shuffle: bool = True,
        random_state: int = 42
) -> tuple[np.ndarray]:
    '''
        Returns
        -------
        X_train
        X_val
        y_train
        y_val
    '''
    X = df_features.to_numpy()
    y = df_labels.to_numpy()
    return train_test_split(X, y, test_size=test_size, shuffle=shuffle, random_state=random_state)

## Utility: Filter rows

In [5]:
def filter_summers(
        df: pd.DataFrame
) -> pd.DataFrame:
    # Making sure that column is DateTime
    df['piezo_measurement_date'] = pd.to_datetime(df['piezo_measurement_date'])
    # Boolean selection dataframe
    mask = df['piezo_measurement_date'].dt.month >= 6 & df['piezo_measurement_date'].dt.month <= 9
    # Filtering
    df_filtered = df.loc[mask]
    return df_filtered

## Utility: Load functions for training

In [6]:
# Utility functions

def load_train_valid_from_tiny_dataset(
        data_file: str,
        n_rows: int,
        selected_features: list[str],
        date_feature: str,
        categorical_features: list[str],
        categorical_encoding: str = "labelEncoder",
        numerical_features_to_force: list[str] = []
) -> tuple[np.ndarray]:
    # Load dataset
    df = pd.read_csv(data_file, index_col='row_index', nrows=n_rows)
    # Filtering out features
    df_filteredCol = df[selected_features]
    # Encoding categorical features
    encoder = Encoder(categorical_features=categorical_features,
                      categorical_encoding=categorical_encoding)
    df_encoded = encoder.encode(df_filteredCol)
    # Deal with NaNs
    df_encoded[date_feature] = days_since_start_of_2020(df_encoded[date_feature])
    df_cleaned = force_to_numeric(df_encoded, numerical_features_to_force)
    df_cleaned = drop_fill_na(df_cleaned)
    # Split into training and validation set
    X_train, X_val, y_train, y_val = split(df_cleaned,
                                           df['piezo_groundwater_level_category'],
                                           test_size=0.2,
                                           shuffle=True,
                                           random_state=42)
    return X_train, X_val, y_train, y_val

def load_train_valid_from_full_dataset(
        data_file: str,
        selected_features: list[str],
        date_feature: str,
        categorical_features: list[str],
        categorical_encoding: str = "labelEncoder",
        numerical_features_to_force: list[str] = []
) -> tuple[np.ndarray]:
    # Load dataset
    df = pd.read_csv(data_file, index_col='row_index')
    # Filtering out features
    df_filteredCol = df[selected_features]
    # Encoding categorical features
    encoder = Encoder(categorical_features=categorical_features,
                      categorical_encoding=categorical_encoding)
    df_encoded = encoder.encode(df_filteredCol)
    # Deal with NaNs
    df_encoded[date_feature] = days_since_start_of_2020(df_encoded[date_feature])
    df_cleaned = force_to_numeric(df_encoded, numerical_features_to_force)
    df_cleaned = drop_fill_na(df_cleaned)
    # Split into training and validation set
    X_train, X_val, y_train, y_val = split(df_cleaned,
                                           df['piezo_groundwater_level_category'],
                                           test_size=0.2,
                                           shuffle=True,
                                           random_state=42)
    return X_train, X_val, y_train, y_val

def load_summer_train_data(
        data_file: str,
        selected_features: list[str],
        date_feature: str,
        categorical_features: list[str],
        categorical_encoding: str = "labelEncoder",
        numerical_features_to_force: list[str] = []
) -> tuple[np.ndarray]:
    '''
        This function loads all spring, automn and winter data as training data and summer data as validation data.
    '''
    # Load dataset
    df = pd.read_csv(data_file, index_col='row_index')
    # Filter out summer rows 
    df_summer = filter_summers(df)
    # Filtering out features
    df_filteredCol = df_summer[selected_features]
    # Encoding categorical features
    encoder = Encoder(categorical_features=categorical_features,
                      categorical_encoding=categorical_encoding)
    df_encoded = encoder.encode(df_filteredCol)
    # Deal with NaNs
    df_encoded[date_feature] = days_since_start_of_2020(df_encoded[date_feature])
    df_cleaned = force_to_numeric(df_encoded, numerical_features_to_force)
    df_cleaned = drop_fill_na(df_cleaned)
    # Split into training and validation set
    X_train, X_val, y_train, y_val = split(df_cleaned,
                                           df['piezo_groundwater_level_category'],
                                           test_size=0.2,
                                           shuffle=True,
                                           random_state=42)
    return X_train, X_val, y_train, y_val

## Utility: Load functions for test

In [7]:
# Utility functions

def load_test_dataset(
        data_file: str,
        selected_features: list[str],
        date_feature: str,
        categorical_features: list[str],
        categorical_encoding: str = "labelEncoder",
        numerical_features_to_force: list[str] = []
) -> np.ndarray:
    # Load dataset
    df = pd.read_csv(data_file, index_col='row_index')
    # Filtering out features
    df_filteredCol = df[selected_features]
    # Encoding categorical features
    encoder = Encoder(categorical_features=categorical_features,
                      categorical_encoding=categorical_encoding)
    df_encoded = encoder.encode(df_filteredCol)
    # Deal with NaNs
    df_encoded[date_feature] = days_since_start_of_2020(df_encoded[date_feature])
    df_cleaned = force_to_numeric(df_encoded, numerical_features_to_force)
    df_cleaned = drop_fill_na(df_cleaned)
    return df_cleaned.to_numpy(), df_cleaned.index

# Load Data

In [8]:
# List all selected features
piezo_features = ['piezo_station_investigation_depth',
                  'piezo_station_altitude',
                  'piezo_station_longitude',
                  'piezo_station_latitude',
                  'piezo_measurement_date', # str -> date
                  'piezo_obtention_mode',   # categorical
                  'piezo_status',           # categorical
                  'piezo_qualification',    # categorical
                  'piezo_continuity_code',  # categorical
                  'piezo_measure_nature_code']  # categorical

meteo_features = ['meteo_frost_duration',
                  'meteo_evapotranspiration_grid',
                  'meteo_temperature_avg',
                  'meteo_time_tx',
                  'meteo_time_tn',
                  'meteo_temperature_avg_threshold',
                  'meteo_temperature_avg_tntm',
                  'meteo_amplitude_tn_tx',
                  'meteo_temperature_max',
                  'meteo_temperature_min',
                  'meteo_rain_height']

hydro_features = ['hydro_latitude',
                  'hydro_longitude',
                  'hydro_method_code',              # categorical
                  'hydro_observation_result_elab',
                  'hydro_qualification_code',       # categorical
                  'hydro_status_code']              # categorical

prelev_features = ['prelev_volume_0',
                   'prelev_volume_1',
                   'prelev_volume_2',
                   'prelev_usage_label_0',      # categorical
                   'prelev_usage_label_1',
                   'prelev_usage_label_2',
                   'prelev_other_volume_sum',
                   'prelev_latitude_0',
                   'prelev_latitude_1',
                   'prelev_latitude_2',
                   'prelev_longitude_0',
                   'prelev_longitude_1',
                   'prelev_longitude_2']

insee_features = ['insee_%_ind',
                  'insee_%_agri',
                  'insee_%_const',
                  'insee_med_living_level',
                  'insee_pop_commune']

distance_features = ['distance_piezo_meteo',
                     'distance_piezo_hydro']

# Keep only selected features
selected_features = piezo_features + meteo_features + hydro_features + prelev_features + insee_features + distance_features
date_feature = 'piezo_measurement_date'
categorical_features = ['piezo_obtention_mode',
                        'piezo_status',
                        'piezo_qualification',
                        'piezo_continuity_code',
                        'piezo_measure_nature_code',
                        'hydro_method_code',
                        'hydro_qualification_code',
                        'hydro_status_code',
                        'prelev_usage_label_0',
                        'prelev_usage_label_1',
                        'prelev_usage_label_2']
numerical_features_to_force = insee_features

In [9]:
# X_train, X_val, y_train, y_val = load_train_valid_from_tiny_dataset(
#     data_file='X_train.csv',
#     n_rows=100_000,
#     selected_features=selected_features,
#     date_feature=date_feature,
#     categorical_features=categorical_features,
#     numerical_features_to_force=numerical_features_to_force
# )
X_train, X_val, y_train, y_val = load_train_valid_from_full_dataset(
    data_file='X_train.csv',
    selected_features=selected_features,
    date_feature=date_feature,
    categorical_features=categorical_features,
    numerical_features_to_force=numerical_features_to_force
)
print('Done')

  df = pd.read_csv(data_file, index_col='row_index')
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[feature] = LabelEncoder().fit_transform(df[feature])
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[feature] = LabelEncoder().fit_transform(df[feature])
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
A value is trying to be

Done


# Models and training

## Logistic Regression

In [10]:
# from sklearn.linear_model import LogisticRegression
# from sklearn.metrics import f1_score, accuracy_score, confusion_matrix, classification_report

# # Fit Logistic Regression
# logistic_regression = LogisticRegression().fit(X_train, y_train)

# # Prediction
# y_pred = logistic_regression.predict(X_val)

# # Confusion matrix
# labels_afterSplit = list(set(y_val))
# labels_asc = sorted(labels_afterSplit)
# cm = confusion_matrix(y_val, y_pred)            # Labels are ordered in the order of the given 'labels' argument

# # Metrics
# print(f"Labels order: {labels_asc}")
# print(f"Accuracy: {accuracy_score(y_val, y_pred)}")
# print(f"F1 score: {f1_score(y_val, y_pred, average=None)}")
# print(cm)

# # Classification report
# print(classification_report(y_val, y_pred))     # Labels are ordered in alphabetical order

## Random Forest

In [11]:
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.metrics import f1_score, accuracy_score, confusion_matrix, classification_report

# # Fit Logistic Regression
# random_forest = RandomForestClassifier(max_depth=10).fit(X_train, y_train)

# # Prediction
# y_pred = random_forest.predict(X_val)

# # Confusion matrix
# labels_afterSplit = list(set(y_val))
# labels_asc = sorted(labels_afterSplit)
# cm = confusion_matrix(y_val, y_pred)            # Labels are ordered in the order of the given 'labels' argument

# # Metrics
# print(f"Labels order: {labels_asc}")
# print(f"Accuracy: {accuracy_score(y_val, y_pred)}")
# print(f"F1 score: {f1_score(y_val, y_pred, average=None)}")
# print(cm)

# # Classification report
# print(classification_report(y_val, y_pred))     # Labels are ordered in alphabetical order

## XGBoost

In [12]:
import xgboost as xgb
from sklearn.metrics import f1_score, accuracy_score, confusion_matrix, classification_report

# Map label
map = {
    'Very High': 4,
    'High': 3,
    'Average': 2,
    'Low': 1,
    'Very Low': 0
}
y_train_num = np.array([map[target] for target in y_train])
y_val_num = np.array([map[target] for target in y_val])

# Convert to DMatrix
dtrain = xgb.DMatrix(X_train, label=y_train_num)
dval = xgb.DMatrix(X_val, label=y_val_num)

# Fit Logistic Regression
xgb_classifier = xgb.XGBClassifier(objective='multi:softmax',num_class=5, n_estimators=100, learning_rate=0.1, max_depth=5)
xgb_classifier.fit(X_train, y_train_num)

# Prediction
y_pred_num = xgb_classifier.predict(X_val)

# Confusion matrix
labels_afterSplit = list(set(y_val))
labels_asc = sorted(labels_afterSplit)
cm = confusion_matrix(y_val_num, y_pred_num)            # Labels are ordered in the order of the given 'labels' argument

# Metrics
print(f"Labels order: {labels_asc}")
print(f"Accuracy: {accuracy_score(y_val_num, y_pred_num)}")
print(f"F1 score: {f1_score(y_val_num, y_pred_num, average=None)}")
print(cm)

# Classification report
print(classification_report(y_val_num, y_pred_num))     # Labels are ordered in alphabetical order

Labels order: ['Average', 'High', 'Low', 'Very High', 'Very Low']
Accuracy: 0.48423853133214617
F1 score: [0.59884948 0.45190244 0.42562145 0.42888554 0.54582764]
[[66937 26224  6958  5347  2072]
 [24425 60970 22619 13008  4796]
 [13390 30503 52291 24439  9329]
 [ 7542 17738 24952 48959 17163]
 [ 3720  8584  8944 20201 44953]]
              precision    recall  f1-score   support

           0       0.58      0.62      0.60    107538
           1       0.42      0.48      0.45    125818
           2       0.45      0.40      0.43    129952
           3       0.44      0.42      0.43    116354
           4       0.57      0.52      0.55     86402

    accuracy                           0.48    566064
   macro avg       0.49      0.49      0.49    566064
weighted avg       0.48      0.48      0.48    566064



# Prediction on test set

In [13]:
X_test, indices = load_test_dataset(
    data_file='X_test.csv',
    selected_features=selected_features,
    date_feature=date_feature,
    categorical_features=categorical_features,
    numerical_features_to_force=numerical_features_to_force
)
print('Done')
print(X_test.shape)
print(indices[:10])

  df = pd.read_csv(data_file, index_col='row_index')
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[feature] = LabelEncoder().fit_transform(df[feature])
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[feature] = LabelEncoder().fit_transform(df[feature])
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
A value is trying to be

Done
(611208, 47)
Index([2331795, 2331796, 2331797, 2331798, 2331799, 2331800, 2331801, 2331802,
       2331803, 2331804],
      dtype='int64', name='row_index')


In [14]:
# Predictions
y_pred_num = xgb_classifier.predict(X_test)
print(y_pred_num.shape)
print(len(indices))

(611208,)
611208


In [15]:
def inverse_map(value: int, map: dict[int, str]) -> str:
    for k, valeur in map.items():
        if valeur == value:
            return k

In [16]:
y_pred = np.array([inverse_map(pred, map) for pred in y_pred_num])

results = pd.DataFrame(y_pred, columns=['piezo_groundwater_level_category'], index=indices)
print(results)

          piezo_groundwater_level_category
row_index                                 
2331795                                Low
2331796                                Low
2331797                                Low
2331798                            Average
2331799                                Low
...                                    ...
3610818                           Very Low
3610819                           Very Low
3610820                           Very Low
3610821                            Average
3610822                                Low

[611208 rows x 1 columns]


In [17]:
# Save csv
results.to_csv('y_submission_XGB_all.csv')