In [4]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.impute import  SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
#from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.metrics import r2_score
from sklearn.linear_model import LinearRegression, Ridge
from src.data import preprocessing
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.decomposition import TruncatedSVD
import os
import joblib

In [5]:
MODELS_FILE_PATH = './../models'
INTERIM_FILE_PATH = './../data/interim'
INPUT_DATA_FILE_PATH = './../data/processed/'

In [6]:
score_predictor = joblib.load(os.path.join(MODELS_FILE_PATH,'score_predictor_v2.joblib'))
canonical_df = pd.read_csv(os.path.join(INPUT_DATA_FILE_PATH,'ingredients_data_format.csv'))
statistics_df = pd.read_csv(os.path.join(INTERIM_FILE_PATH,'Agribalyse_MinMax ingredient.csv'))

### Ingredients and their min and max EF scores

In [7]:
print(statistics_df.shape)
statistics_df.head(5)

(213, 3)


Unnamed: 0,Ingredients,min_EF,max_EF
0,Abats de bœuf,0.365335,1.976721
1,Abats de porc,0.008707,0.22866
2,Abats de poulet,0.003403,0.049212
3,Abats de veau,0.398916,0.398916
4,Abricot,0.004451,0.046987


#### The canonical datafram with zeros.  Used as a molde to set the values coming from the user

In [8]:
canonical_df

Unnamed: 0,Abats de bœuf,Abats de porc,Abats de poulet,Abats de veau,Abricot,Ail,Amande,Amidon de maïs,Ananas,Anchois,...,max_EF_Viande de moutton sans os,max_EF_Viande de porc maigre,max_EF_Viande de poulet sans os,max_EF_Viande de veau sans os,max_EF_Vin blanc,max_EF_Vin rouge,max_EF_Yaourt,max_EF_citron,max_EF_Échalote,max_EF_Œuf de poule
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


#### /!\ Ingredients with no statistics (TODO remove them from preprocessing)

In [9]:
statistics_df[(statistics_df['min_EF'] == 0) | (statistics_df['max_EF'] == 0)]

Unnamed: 0,Ingredients,min_EF,max_EF
24,Bouillon,0.0,0.0
72,Fond de veau pour la sauce et la cuisson,0.0,0.0
132,Miel,0.0,0.0


#### Transposed statistics of ingredients to have them in columns

In [10]:
user_input = ['Amande', 'Abats de veau','Abricot']

In [12]:
#metric_df.loc[user_input, :]

In [13]:
def get_transposed(statistics_df, metric, prefix:str):
    metric_df =  statistics_df.set_index('Ingredients')[[metric]].transpose().reset_index(drop=True)
    new_col_names = [f'{prefix}{c}' for c in metric_df.columns]
    metric_df.columns = new_col_names
    return metric_df

In [14]:
def generate_transposed_statistics(df):
    #transpose minEF
    minEF_df = get_transposed(df, 'min_EF', prefix='min_EF_')
    
    #transpose maxEF
    maxEF_df = get_transposed(df, 'max_EF', 'max_EF_')
    
    #Concatenate minEF and maxEF
    return pd.concat([minEF_df, maxEF_df], axis=1)
    

In [15]:
def format_ingredients_list(user_input, canonical_df, statistics_df)->pd.DataFrame:
    statistics_user_input = generate_transposed_statistics(statistics_df.set_index('Ingredients').loc[user_input,:].reset_index())
    canonical_df.loc[:,statistics_user_input.columns] = statistics_user_input
    canonical_df.loc[:,user_input] = 1

In [16]:
stats_user_input = generate_transposed_statistics(statistics_df.set_index('Ingredients').loc[user_input,:].reset_index())
stats_user_input

Unnamed: 0,min_EF_Amande,min_EF_Abats de veau,min_EF_Abricot,max_EF_Amande,max_EF_Abats de veau,max_EF_Abricot
0,0.108593,0.398916,0.004451,1.072579,0.398916,0.046987


* In the canonical_df, set values for statistics

In [17]:
canonical_df.loc[:,stats_user_input.columns] = stats_user_input

In [18]:
for c in canonical_df.columns:
    if sum(canonical_df[c].values)>0:
        print(c)

min_EF_Abats de veau
min_EF_Abricot
min_EF_Amande
max_EF_Abats de veau
max_EF_Abricot
max_EF_Amande


* Set values to 1 where the ingredient is present in user's input

In [19]:
canonical_df.loc[:,user_input] = 1

In [20]:
for c in canonical_df.columns:
    if sum(canonical_df[c].values)>0:
        print(c)

Abats de veau
Abricot
Amande
min_EF_Abats de veau
min_EF_Abricot
min_EF_Amande
max_EF_Abats de veau
max_EF_Abricot
max_EF_Amande


In [21]:
canonical_df

Unnamed: 0,Abats de bœuf,Abats de porc,Abats de poulet,Abats de veau,Abricot,Ail,Amande,Amidon de maïs,Ananas,Anchois,...,max_EF_Viande de moutton sans os,max_EF_Viande de porc maigre,max_EF_Viande de poulet sans os,max_EF_Viande de veau sans os,max_EF_Vin blanc,max_EF_Vin rouge,max_EF_Yaourt,max_EF_citron,max_EF_Échalote,max_EF_Œuf de poule
0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [22]:
#format_ingredients_list(user_input, canonical_df, statistics_df)
canonical_df

Unnamed: 0,Abats de bœuf,Abats de porc,Abats de poulet,Abats de veau,Abricot,Ail,Amande,Amidon de maïs,Ananas,Anchois,...,max_EF_Viande de moutton sans os,max_EF_Viande de porc maigre,max_EF_Viande de poulet sans os,max_EF_Viande de veau sans os,max_EF_Vin blanc,max_EF_Vin rouge,max_EF_Yaourt,max_EF_citron,max_EF_Échalote,max_EF_Œuf de poule
0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [23]:
formatted_input

NameError: name 'formatted_input' is not defined

In [None]:
#score_predictor.predict(canonical_df)

In [None]:
score_predictor.predict(canonical_df)