In [1]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.impute import  SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
#from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.metrics import r2_score
from sklearn.linear_model import LinearRegression, Ridge
from src.data import preprocessing
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.decomposition import TruncatedSVD
import os
import joblib

In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
params = preprocessing.dict_params
INPUT_FILE_PATH = './../data/processed'
OUTPUT_FILE_PATH = './../models'
INDEX_KEY = [0] # the 1st column is the primary key

### Default params

In [4]:
print(params)

{'synthese': {'file_name': 'Agribalyse_Synthese.csv', 'keep_cols': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12], 'index_key': [0]}, 'ingredients': {'file_name': 'Agribalyse_Detail ingredient.csv', 'keep_cols': [0, 2, 3, 4, 5, 6, 7], 'index_key': [0], 'pivot_idx_key': [0, 1, 2, 3, 4], 'pivot_idx_col': 5, 'pivot_idx_values': 6, 'pivot_keep_cols': [0, 2, 3, 4, 5, 6, 7]}, 'etapes': {'file_name': 'Agribalyse_Detail etape.csv', 'keep_cols': [0, 8, 9, 10, 11, 12, 13], 'index_key': [0]}}


In [5]:
data_df_ori = pd.read_csv(os.path.join(INPUT_FILE_PATH, preprocessing.get_param('ingredients', 'file_name')))
data_df_ori.columns[13:]

Index(['Abats de bœuf', 'Abats de porc', 'Abats de poulet', 'Abats de veau',
       'Abricot', 'Ail', 'Amande', 'Amidon de maïs', 'Ananas', 'Anchois',
       ...
       'max_EF_Viande de moutton sans os', 'max_EF_Viande de porc maigre',
       'max_EF_Viande de poulet sans os', 'max_EF_Viande de veau sans os',
       'max_EF_Vin blanc', 'max_EF_Vin rouge', 'max_EF_Yaourt',
       'max_EF_citron', 'max_EF_Échalote', 'max_EF_Œuf de poule'],
      dtype='object', length=639)

In [6]:
def clean_data(df)->pd.DataFrame:
    # Drop columns
    drop_cols = ['Code_AGB', 'Code_CIQUAL', 'Groupe_aliment', 'Sous-groupe_aliment', 'Nom_Produit_Francais', 'LCI_Name',\
          'Saisonnalite', 'Transport_par_avion_', 'Livraison', 'Livraison', 'Materiau_emballage', 'Preparation', 'DQR_Note_qualite_la_donnee_']
    clean_df = df.drop(drop_cols, axis=1)

    # Drop lines with no ingredients (all 0)
    with_ing=clean_df.iloc[:,1:].sum(axis=1)!=0
   
    clean_df = clean_df[with_ing].copy()

    return clean_df

In [7]:
data_df = clean_data(data_df_ori)

In [8]:
print(f'Number of ingredients kept for training :{len(data_df.columns)}')
print(f'Number of products kept for training:{data_df.shape[0]}')

Number of ingredients kept for training :640
Number of products kept for training:1038


In [9]:
#data_df = data_df.dropna(axis=0)
y= data_df.loc[:,'Score_unique_EF_'].copy()
X= data_df.drop('Score_unique_EF_', axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

In [10]:
print(f'X_train:{X_train.shape}')
print(f'y_train:{y_train.shape}')

X_train:(830, 639)
y_train:(830,)


### Build Pipeline

In [11]:
from sklearn.decomposition import TruncatedSVD

In [12]:
ingred_features = X_train.columns.to_list()
num_features = X_train.select_dtypes(include=np.number)
cat_features = []

# Transformer SVD
svd_transformer = Pipeline([
    ('svd', TruncatedSVD(n_components=32)),])

# transformer for numerical features
num_transformer = Pipeline([
        ('imputer_num', SimpleImputer(strategy = 'median')),
        #('scaler', StandardScaler())
    ])
# transformer for categorical features
cat_transformer = Pipeline([
        ('imputer_cat', SimpleImputer(strategy = 'most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))])

preprocessor = ColumnTransformer([
        ('categoricals', cat_transformer, cat_features),
        ('numericals', num_transformer, num_features),
         ('dim_reduction', svd_transformer, ingred_features)
    ],
    remainder = 'drop'
)


### Train Linear Regression model and then predict on test set

In [13]:
def fit_predict(pipeline, X_train, y_train):
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_train)
    print(f'Train - R2 Score: {r2_score(y_train, y_pred)}')

In [14]:
full_pipeline = Pipeline([
        ('transformation', preprocessor),
        ('regressor', LinearRegression())
    ])
fit_predict(full_pipeline, X_train, y_train)

ValueError: No valid specification of the columns. Only a scalar, list or slice of all integers or all strings, or boolean mask is allowed

In [None]:
r2_score(y_test, full_pipeline.predict(X_test))

### Predict on test set

In [None]:
full_pipeline = Pipeline([
        ('transformation', preprocessor),
        ('random_forest', RandomForestRegressor(max_depth=10))
    ])
fit_predict(full_pipeline, X_train, y_train)

In [None]:
y_pred = full_pipeline.predict(X_test)
r2_score(y_test, y_pred)

In [None]:
MODELS_FOLDER = './../models'
joblib.dump(full_pipeline, os.path.join(MODELS_FOLDER, "score_predictor_v2.joblib"))