In [23]:
# Install missing packages
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, make_scorer
import numpy as np

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/mbd2023-food-safety-controls/food_safety_sample_submission.csv
/kaggle/input/mbd2023-food-safety-controls/food_safety_train.csv
/kaggle/input/mbd2023-food-safety-controls/food_safety_test.csv


In [2]:
path = '/kaggle/input/mbd2023-food-safety-controls/'

### 1. Import and prepare data

#### 1.1. Train data

In [3]:
# Read train data
train = pd.read_csv(path + 'food_safety_train.csv', encoding='latin1')

In [4]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22671 entries, 0 to 22670
Data columns (total 14 columns):
 #   Column                              Non-Null Count  Dtype 
---  ------                              --------------  ----- 
 0   trustii_id                          22671 non-null  int64 
 1   APP_Libelle_etablissement           22670 non-null  object
 2   SIRET                               22671 non-null  object
 3   Adresse_2_UA                        22359 non-null  object
 4   Code_postal                         22671 non-null  object
 5   Libelle_commune                     22671 non-null  object
 6   Numero_inspection                   22671 non-null  object
 7   Date_inspection                     22671 non-null  object
 8   APP_Libelle_activite_etablissement  22671 non-null  object
 9   Agrement                            5823 non-null   object
 10  geores                              22064 non-null  object
 11  filtre                              16800 non-null  ob

In [5]:
# Show the data
train.head()

Unnamed: 0,trustii_id,APP_Libelle_etablissement,SIRET,Adresse_2_UA,Code_postal,Libelle_commune,Numero_inspection,Date_inspection,APP_Libelle_activite_etablissement,Agrement,geores,filtre,ods_type_activite,Synthese_eval_sanit
0,10000,SAVEURS ASIATIQUES,50846842800034,RUE DES FRERES LUMIERE,33130,Bègles,23-110093-1,2024-02-05T01:00:00+01:00,Restaurant,,44.797031_-0.535231,Restaurant,Autres,Satisfaisant
1,10001,EPICERIE ROND POINT,90472398800013,48 AV DU ROND POINT,93250,Villemomble,23-056603-1,2023-07-17T02:00:00+02:00,Libre service|Alimentation générale,,48.884745_2.499984,Libre service|Alimentation générale,Autres,Satisfaisant
2,10002,AUBERGE DES OLIVIERS,48792443300013,ROUTE DE LA CANONICA,20290,Lucciana,23-018114-1,2023-03-08T01:00:00+01:00,Restaurant,,42.541715_9.46286,Restaurant,Autres,Très satisfaisant
3,10003,LES HUITRES DES FLOTS,80481822700022,RUE DU PONT DES BERNES,50550,Saint-Vaast-la-Hougue,23-011747-1,2023-02-15T01:00:00+01:00,Purification/Expédition de coquillages,50562016.0,49.592002_-1.28678,,Produits de la mer et d'eau douce,Satisfaisant
4,10004,COLONNA DOMINIQUE-ANTOINE,53444598600019,Partinello,20147,Partinello,23-097885-1,2023-11-21T01:00:00+01:00,Producteur fermier,,42.306077_8.67833,Producteur fermier,Autres,Très satisfaisant


**Separate variables by type.**

In [6]:
# Target variable
dv_var = train['Synthese_eval_sanit']

# List of predictors
iv_vars = ['Code_postal', 'Libelle_commune', 'APP_Libelle_activite_etablissement', 'filtre', 'ods_type_activite']

# Splitting into numerical and categorical variables
num_vars = [col for col in iv_vars if pd.api.types.is_numeric_dtype(train[col])]
cat_vars = [col for col in iv_vars if not pd.api.types.is_numeric_dtype(train[col])]


In [7]:
num_vars, cat_vars

([],
 ['Code_postal',
  'Libelle_commune',
  'APP_Libelle_activite_etablissement',
  'filtre',
  'ods_type_activite'])

**Check and correct data errors by variable type.**

In [8]:
# Check missing values for each column in 'iv_vars'
missing_values = train[iv_vars].isnull().sum()

# Print total NAs across selected variables
print(f"Total NAs: {missing_values.sum()}")

# Print missing values for each variable
print(missing_values)

Total NAs: 5871
Code_postal                              0
Libelle_commune                          0
APP_Libelle_activite_etablissement       0
filtre                                5871
ods_type_activite                        0
dtype: int64


In [9]:
# Check the missing values for target variable

dv_var.isnull().sum()

0

In [10]:
# Filling missing values for numeric variables with 0
for var in num_vars:
    train[var].fillna(0, inplace=True)

# Filling missing values for categorical variables with 'Missing'
for var in cat_vars:
    train[var].fillna('Missing', inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train[var].fillna('Missing', inplace=True)


**Transform some variables by type.**

In [11]:
# To keep only the first two digits of the 'Code_postal' column
train['Code_postal'] = train['Code_postal'].astype(str).str.slice(0, 2)

In [12]:

encoder = OneHotEncoder(sparse=False, drop='first')  # Use sparse_output=False if the version warning applies
train_cat_dummies = encoder.fit_transform(train[cat_vars])

# Convert to DataFrame and correct column names
train_cat_dummies = pd.DataFrame(train_cat_dummies, columns=encoder.get_feature_names_out(cat_vars))
train_cat_dummies.columns = train_cat_dummies.columns.str.replace('[^A-Za-z0-9]', '_', regex=True)

# Combine with numerical variables and target variable
# Make sure dv_var is correctly accessing the target variable column
train_processed = pd.concat([train[num_vars].reset_index(drop=True), 
                             train_cat_dummies, 
                             dv_var.reset_index(drop=True)], axis=1)




In [13]:
# Check train data
train_processed.head()

Unnamed: 0,Code_postal_01,Code_postal_02,Code_postal_03,Code_postal_04,Code_postal_05,Code_postal_06,Code_postal_07,Code_postal_08,Code_postal_09,Code_postal_10,...,filtre_Restauration_collective,filtre_Traiteur,filtre_Traiteur_Boucherie_Charcuterie,ods_type_activite_Autres,ods_type_activite_Lait_et_produits_laitiers,ods_type_activite_Produits_de_la_mer_et_d_eau_douce,ods_type_activite_Transport_et_entreposage_de_denr_es_alimentaires,ods_type_activite_Viandes_et_produits_carn_s,ods_type_activite__ufs_et_ovoproduits,Synthese_eval_sanit
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,Satisfaisant
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,Satisfaisant
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,Très satisfaisant
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,Satisfaisant
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,Très satisfaisant


#### 1.2. Test data

In [14]:
# Read train data
test = pd.read_csv(path + 'food_safety_test.csv', encoding='latin1')

In [15]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8839 entries, 0 to 8838
Data columns (total 13 columns):
 #   Column                              Non-Null Count  Dtype 
---  ------                              --------------  ----- 
 0   trustii_id                          8839 non-null   int64 
 1   APP_Libelle_etablissement           8839 non-null   object
 2   SIRET                               8839 non-null   object
 3   Adresse_2_UA                        8705 non-null   object
 4   Code_postal                         8839 non-null   object
 5   Libelle_commune                     8839 non-null   object
 6   Numero_inspection                   8839 non-null   object
 7   Date_inspection                     8839 non-null   object
 8   APP_Libelle_activite_etablissement  8839 non-null   object
 9   Agrement                            2247 non-null   object
 10  geores                              8612 non-null   object
 11  filtre                              6608 non-null   obje

**Check and correct data errors by variable type.**

In [16]:
# Check missing values for each column in 'iv_vars'
missing_values = test[iv_vars].isnull().sum()

# Print total NAs across selected variables
print(f"Total NAs: {missing_values.sum()}")

# Print missing values for each variable
print(missing_values)

Total NAs: 2231
Code_postal                              0
Libelle_commune                          0
APP_Libelle_activite_etablissement       0
filtre                                2231
ods_type_activite                        0
dtype: int64


In [17]:
# Filling missing values for numeric variables with 0
for var in num_vars:
    test[var].fillna(0, inplace=True)

# Filling missing values for categorical variables with 'Missing'
for var in cat_vars:
    test[var].fillna('Missing', inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test[var].fillna('Missing', inplace=True)


**Transform some variables by type.**

In [18]:
# To keep only the first two digits of the 'Code_postal' column
test['Code_postal'] = test['Code_postal'].astype(str).str.slice(0, 2)

In [19]:
# Adjust the encoder initialization
encoder = OneHotEncoder(sparse=False, drop='first', handle_unknown='ignore')

test_cat_dummies = encoder.fit_transform(test[cat_vars])

# Convert to DataFrame and correct column names
test_cat_dummies = pd.DataFrame(test_cat_dummies, columns=encoder.get_feature_names_out(cat_vars))
test_cat_dummies.columns = test_cat_dummies.columns.str.replace('[^A-Za-z0-9]', '_', regex=True)

# Combine with numerical variables from the test set
test_processed = pd.concat([test[num_vars].reset_index(drop=True), test_cat_dummies], axis=1)




In [20]:
test_processed.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8839 entries, 0 to 8838
Columns: 4176 entries, Code_postal_01 to ods_type_activite__ufs_et_ovoproduits
dtypes: float64(4176)
memory usage: 281.6 MB


In [21]:
test_processed.head()

Unnamed: 0,Code_postal_01,Code_postal_02,Code_postal_03,Code_postal_04,Code_postal_05,Code_postal_06,Code_postal_07,Code_postal_08,Code_postal_09,Code_postal_0_,...,filtre_Restaurant,filtre_Restauration_collective,filtre_Traiteur,filtre_Traiteur_Boucherie_Charcuterie,ods_type_activite_Autres,ods_type_activite_Lait_et_produits_laitiers,ods_type_activite_Produits_de_la_mer_et_d_eau_douce,ods_type_activite_Transport_et_entreposage_de_denr_es_alimentaires,ods_type_activite_Viandes_et_produits_carn_s,ods_type_activite__ufs_et_ovoproduits
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


#### 1.3. Finalize train, test

**Remove independent variables that exist only in train or test.**

In [22]:
dv_var_name = 'Synthese_eval_sanit'  # Directly using the column name as a string

# Identify columns to drop based on the names, excluding the target variable's column name
drop_vars = set(train_processed.columns) - set(test_processed.columns) - {dv_var_name}
print(f"Drop: {len(drop_vars)}, {list(drop_vars)}")

# Remove identified variables from the train and test DataFrames
train_processed.drop(columns=drop_vars, inplace=True)
# For test_processed, using errors='ignore' in case some drop_vars don't exist in it
test_processed.drop(columns=drop_vars, inplace=True, errors='ignore')


Drop: 4530, ['Libelle_commune_Gueugnon', 'Libelle_commune_Plumelec', 'Libelle_commune_Saint_Sa_ns', 'APP_Libelle_activite_etablissement_Fromagerie_Libre_service_Alimentation_g_n_rale_Traiteur', 'Libelle_commune_Saint_Chabrais', 'Libelle_commune_La_Sabotterie', 'Libelle_commune_Saint_Julien_les_Rosiers', 'Libelle_commune_Neuves_Maisons', 'Libelle_commune_M_rindol', 'Libelle_commune_Boscamnant', 'Libelle_commune_Saint_Germain_les_Vergnes', 'Libelle_commune_Domart_en_Ponthieu', 'Libelle_commune__le_d_Houat', 'Libelle_commune_Saint_Priest_la_Marche', 'Libelle_commune_Voiron', 'Libelle_commune_Menet', 'Libelle_commune_Bainville_sur_Madon', 'Libelle_commune_Servi_res_le_Ch_teau', 'Libelle_commune_Coeur_de_Causse', 'Libelle_commune_Saint_Marcel_l_s_Valence', 'Libelle_commune_Lauris', 'Libelle_commune_Saint_Sauveur_en_Puisaye', 'Libelle_commune_Croix', 'Libelle_commune_Gannay_sur_Loire', 'Libelle_commune_Meuzac', 'Libelle_commune_Vittoncourt', 'Libelle_commune_Pont_l_Abb_', 'Libelle_commune_Br

### 2. Train ML model

Reference:
- mlr integrated learners: https://mlr.mlr-org.com/articles/tutorial/integrated_learners.html

#### Decision Tree

In [31]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.metrics import accuracy_score
import pandas as pd


X_train = train_processed.drop(columns=[dv_var_name])
y_train = train_processed[dv_var_name]

# Define the model
clf = DecisionTreeClassifier()

# Set hyperparameter tuning grid
param_grid = {
    'max_depth': [2, 5, 10],
    'min_samples_split': [10, 20]
}

# Set up cross-validation with hyperparameter tuning
grid_search = GridSearchCV(clf, param_grid, cv=5, scoring='accuracy')

# Run the hyperparameter tuning with k-fold CV
grid_search.fit(X_train, y_train)

# Extract the best model
best_model = grid_search.best_estimator_

# Optionally, you can check cross-validation scores for the best model
# Here we use 'accuracy' as the metric; adjust as needed for your case
cv_scores = cross_val_score(best_model, X_train, y_train, cv=5, scoring='accuracy')

print(f"Best hyperparameters: {grid_search.best_params_}")
print(f"CV Accuracy for best model: {cv_scores.mean()}")


Best hyperparameters: {'max_depth': 10, 'min_samples_split': 20}
CV Accuracy for best model: 0.6316436246242405


In [34]:
best_model

In [39]:

# Define the mapping from category to integer
category_to_int = {
    'Très satisfaisant': 1,
    'Satisfaisant': 2,
    'A améliorer': 3,
    'A corriger de manière urgente': 4
}

# Convert the 'Synthese_eval_sanit' column to integers based on the mapping
train_processed['Synthese_eval_sanit'] = train_processed['Synthese_eval_sanit'].map(category_to_int)

# Check the conversion
train_processed.head()


Unnamed: 0,Code_postal_01,Code_postal_02,Code_postal_03,Code_postal_04,Code_postal_05,Code_postal_06,Code_postal_07,Code_postal_08,Code_postal_09,Code_postal_10,...,filtre_Restauration_collective,filtre_Traiteur,filtre_Traiteur_Boucherie_Charcuterie,ods_type_activite_Autres,ods_type_activite_Lait_et_produits_laitiers,ods_type_activite_Produits_de_la_mer_et_d_eau_douce,ods_type_activite_Transport_et_entreposage_de_denr_es_alimentaires,ods_type_activite_Viandes_et_produits_carn_s,ods_type_activite__ufs_et_ovoproduits,Synthese_eval_sanit
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,


In [36]:
# Ensure test set has all the columns from the training set, filled with zeros where necessary
for column in X_train.columns:
    if column not in test_processed.columns:
        test_processed[column] = 0  # or the appropriate fill value for your context

# Reorder test_processed columns to match X_train
test_processed = test_processed[X_train.columns]

# Now you can safely make predictions, ensuring the columns match
X_test = test_processed.drop(columns=[dv_var_name], errors='ignore')
predictions = best_model.predict(X_test)

# If desired, attach predictions back to the test_processed DataFrame
test_processed['predictions'] = predictions


In [37]:
# Display or use the predictions
print(predictions)

['Satisfaisant' 'Satisfaisant' 'Satisfaisant' ... 'Satisfaisant'
 'Satisfaisant' 'Satisfaisant']


In [35]:
# Prepare test features, ensuring the test set does not contain the target variable column
X_test = test_processed.drop(columns=[dv_var_name], errors='ignore')  # Replace 'dv_var_name' with the name of your target variable

# Make predictions on the test set
predictions = best_model.predict(X_test)

# If you want to attach these predictions to the test DataFrame to see them alongside any actual values or for further analysis
test_processed['predictions'] = predictions

# Display or use the predictions
print(predictions)

ValueError: The feature names should match those that were passed during fit.
Feature names unseen at fit time:
- APP_Libelle_activite_etablissement_Alimentation_g_n_rale_Boulangerie_P_tisserie_Glacier_Poissonnerie_Traiteur_Boucherie_Charcuterie
- APP_Libelle_activite_etablissement_Alimentation_g_n_rale_Poissonnerie_Traiteur
- APP_Libelle_activite_etablissement_Fromagerie_Alimentation_g_n_rale_Boulangerie_P_tisserie_Boucherie_Charcuterie
- APP_Libelle_activite_etablissement_Fromagerie_Boulangerie_P_tisserie_Traiteur
- APP_Libelle_activite_etablissement_Fromagerie_Chocolatier
- ...
