In [1]:
!pip install xgboost
!pip install lightgbm

import pandas as pd
import numpy as np

# Model imports
import xgboost as xgb
from sklearn.ensemble import ExtraTreesClassifier
import lightgbm as lgb
from sklearn.ensemble import RandomForestClassifier


# Data processing
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer

# Prediction scores
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

# Lift pandas' display truncation for both rows and columns so that frames
# are rendered in full (None disables the corresponding limit).
for display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(display_option, None)



In [2]:
# Both CSV files live in the same shared folder; build the paths from one base.
DATA_DIR = "~/hfactory_magic_folders/water_shortage_prediction"
path_train = f"{DATA_DIR}/X_train_Hi5.csv"
path_test = f"{DATA_DIR}/X_test_Hi5.csv"

In [3]:
# Load the training CSV into the working frame `df` used by every later cell.
# low_memory=False asks pandas to infer each column's dtype from the whole file
# instead of chunk by chunk, which avoids mixed-type columns.
df = pd.read_csv(path_train, low_memory = False)

In [4]:
# Parse the measurement date so the .dt accessors below are available.
df['piezo_measurement_date'] = pd.to_datetime(df['piezo_measurement_date'])

# Calendar features derived from the measurement date.
df['year'] = df['piezo_measurement_date'].dt.year
df['month'] = df['piezo_measurement_date'].dt.month

# Meteorological seasons, three months each. September belongs to Autumn;
# it was previously mapped to 'Summer', which produced a four-month summer
# and a two-month autumn.
df['season'] = df['month'].map({
    12: 'Winter', 1: 'Winter', 2: 'Winter',
    3: 'Spring', 4: 'Spring', 5: 'Spring',
    6: 'Summer', 7: 'Summer', 8: 'Summer',
    9: 'Autumn', 10: 'Autumn', 11: 'Autumn'
})

In [5]:
# Numeric station / weather measurements for which a lagged copy is added.
numerical_columns = [
    'piezo_station_altitude',
    'piezo_station_investigation_depth',
    'meteo_evapotranspiration_grid',
    'meteo_cloudiness_height',
    'meteo_wind_speed_avg_2m',
    'meteo_temperature_avg',
    'meteo_humidity_avg',
    'meteo_rain_height',
]

# For each column, add 'prev_<lag>_<column>' holding the value found `lag` rows
# earlier in the frame. range(1, 2) yields a single lag of 1.
# NOTE(review): the shift follows the frame's row order and is not grouped by
# station or sorted by date — confirm that consecutive rows really are
# consecutive measurements of the same station.
lagged_pairs = [(col, lag) for col in numerical_columns for lag in range(1, 2)]
for col, lag in lagged_pairs:
    df[f'prev_{lag}_{col}'] = df[col].shift(lag)


In [6]:
# Remove every column whose fraction of missing values exceeds the threshold.
# NOTE(review): the original author remarked "dropping was bad" on this cell,
# and the recorded output is this same code as a plain string, which suggests
# the drop may have been disabled on the last run — confirm whether it should
# stay active.
seuil = 0.8  # maximum tolerated fraction of NaN per column
na_fraction = df.isna().sum() / len(df)
cols_to_drop = df.columns[na_fraction > seuil]
df.drop(columns=cols_to_drop, inplace=True)

'\nseuil = 0.8\ncols_to_drop = df.columns[(df.isna().sum() / len(df)) > seuil]\ndf.drop(columns=cols_to_drop, inplace=True)\n'

In [7]:
# Ordinal encoding of the groundwater level categories, from lowest (0) to
# highest (4), plus the inverse lookup used to decode predictions.
custom_mapping = dict(zip(
    ['Very Low', 'Low', 'Average', 'High', 'Very High'],
    range(5),
))
reverse_mapping = {code: label for label, code in custom_mapping.items()}

In [8]:
# categorical_columns =  df.select_dtypes(include=['object'])
# for col in categorical_columns:
#     le = LabelEncoder()
#     df[col] = le.fit_transform(df[col])

In [9]:
# Time-based hold-out split restricted to the 2020-2021 measurements:
#   train: dates strictly before 2021-06-01
#   test : 2021-06-01 (inclusive) up to 2021-10-01 (exclusive)
data_before_2022 = df[df['year'].isin([2020, 2021])]

measurement_date = data_before_2022['piezo_measurement_date']
train_mask = measurement_date < '2021-06-01'
test_mask = (measurement_date >= '2021-06-01') & (measurement_date < '2021-10-01')

# Features: every non-object column, minus the raw timestamp.
# NOTE(review): the label column is not dropped explicitly; as a string column
# it is presumably already removed by the dtype filter — verify.
X_train = (
    data_before_2022[train_mask]
    .select_dtypes(exclude=['object'])
    .drop(columns=["piezo_measurement_date"])
)
X_test = (
    data_before_2022[test_mask]
    .select_dtypes(exclude=['object'])
    .drop(columns=["piezo_measurement_date"])
)

# Targets: the groundwater level category for the same rows.
y_train = data_before_2022.loc[train_mask, "piezo_groundwater_level_category"]
y_test = data_before_2022.loc[test_mask, "piezo_groundwater_level_category"]

In [10]:
# Encode the category labels as ordinal integers via custom_mapping.
# Values absent from the mapping become NaN, so re-running this cell on
# already-encoded targets would turn every label into NaN.
y_train, y_test = (labels.map(custom_mapping) for labels in (y_train, y_test))

In [11]:
# Disabled sanity checks counting NaN values in the encoded targets.
# print(f"Nombre de NaN : {np.isnan(y_train).sum()}")
# print(f"Nombre de NaN : {np.isnan(y_test).sum()}")
# Preview the first encoded training labels (displayed as the cell's last expression).
y_train.head()

0    3
1    4
2    3
3    4
4    0
Name: piezo_groundwater_level_category, dtype: int64

In [12]:
#XGBClassifier
# model = xgb.XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')
# model.fit(X_train, y_train)

# y_pred = model.predict(X_test)
# y_pred_labels = pd.Series(y_pred).map(reverse_mapping)
# print(y_pred_labels.shape)

# y_test_labels = y_test.map(reverse_mapping)
# print(y_test_labels.shape)
# # Compute the F1 Score

# f1 = f1_score(y_test_labels, y_pred_labels, average='weighted')
# print(f"F1-Score (weighted): {f1:.4f}")

In [13]:
# 'meteo_radiation_IR' is excluded from the features. errors='ignore' keeps the
# cell re-runnable and tolerant of the column already having been removed
# upstream (for example by the missing-value threshold drop).
X_train = X_train.drop(columns=['meteo_radiation_IR'], errors='ignore')
X_test = X_test.drop(columns=['meteo_radiation_IR'], errors='ignore')

# SimpleImputer(strategy='mean') discards columns that are entirely NaN at fit
# time, which would make the imputed array narrower than X_train.columns and
# break the DataFrame reconstruction below. Remove such columns explicitly,
# from both sets, so the column labels always match the imputed data.
empty_train_cols = X_train.columns[X_train.isna().all()]
X_train = X_train.drop(columns=empty_train_cols)
X_test = X_test.drop(columns=empty_train_cols)

# Fit the column means on the training set only (once), then apply them to
# both sets. The original index is kept so the features stay aligned with
# y_train / y_test by label as well as by position.
imputer = SimpleImputer(strategy='mean')
imputed_data = imputer.fit_transform(X_train)

X_train = pd.DataFrame(imputed_data, columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(imputer.transform(X_test), columns=X_test.columns, index=X_test.index)

# Check that no NaN remains after imputation.
print(f"Nombre de NaN après imputation dans X_train : {X_train.isna().sum().sum()}")
print(f"Nombre de NaN après imputation dans X_test : {X_test.isna().sum().sum()}")

Nombre de NaN après imputation dans X_train : 0
Nombre de NaN après imputation dans X_test : 0


In [14]:
# Extra Trees model (disabled experiment)
# model = ExtraTreesClassifier()
# model.fit(X_train, y_train)

# # Predictions
# y_pred = model.predict(X_test)
# y_pred_labels = pd.Series(y_pred).map(reverse_mapping)
# print(y_pred_labels.shape)

# y_test_labels = y_test.map(reverse_mapping)
# print(y_test_labels.shape)

# # Compute the F1 score
# f1 = f1_score(y_test_labels, y_pred_labels, average='weighted')
# print(f"F1-Score (weighted): {f1:.4f}")

In [None]:
# Random Forest baseline. A fixed random_state makes the fitted forest, and
# therefore the reported score, reproducible across kernel restarts.
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Predict on the hold-out set and decode the integer classes back to their
# category names. The predictions reuse y_test's index so that both Series
# stay aligned row by row.
y_pred = model.predict(X_test)
y_pred_labels = pd.Series(y_pred, index=y_test.index).map(reverse_mapping)
print(y_pred_labels.shape)

y_test_labels = y_test.map(reverse_mapping)
print(y_test_labels.shape)

# Weighted F1: per-class F1 averaged with weights proportional to class support.
f1 = f1_score(y_test_labels, y_pred_labels, average='weighted')
print(f"F1-Score (weighted): {f1:.4f}")