In [2]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import  OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.metrics import r2_score
from sklearn.linear_model import LinearRegression, Ridge
from src.data import preprocessing

In [3]:
%load_ext autoreload
%autoreload 2

In [4]:
params = preprocessing.dict_params
INPUT_FILE_PATH = './../data/interim'
OUTPUT_FILE_PATH = './../models'
INDEX_KEY = [0] # the 1st column is the primary key

### Default params

In [5]:
print(params)

{'synthese': {'file_name': 'Agribalyse_Synthese.csv', 'keep_cols': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12], 'index_key': [0]}, 'ingredients': {'file_name': 'Agribalyse_Detail ingredient.csv', 'keep_cols': [0, 2, 3, 4, 5, 6, 7], 'index_key': [0], 'pivot_idx_key': [0, 1, 2, 3, 4], 'pivot_idx_col': 5, 'pivot_idx_values': 6, 'pivot_keep_cols': [0, 2, 3, 4, 5, 6, 7]}, 'etapes': {'file_name': 'Agribalyse_Detail etape.csv', 'keep_cols': [0, 8, 9, 10, 11, 12, 13], 'index_key': [0]}}


In [52]:
data_df_ori = pd.read_csv(os.path.join(INPUT_FILE_PATH, preprocessing.get_param('ingredients', 'file_name')))
data_df_ori.columns[13:30]

Index(['Abats de bœuf', 'Abats de porc', 'Abats de poulet', 'Abats de veau',
       'Abricot', 'Ail', 'Amande', 'Amidon de maïs', 'Ananas', 'Anchois',
       'Anchois commun', 'Asperges', 'Aubergine', 'Autres étapes', 'Avocat',
       'Avoine', 'Baie'],
      dtype='object')

In [68]:
def clean_data(df)->pd.DataFrame:
    # Drop columns
    drop_cols = ['Code_AGB', 'Code_CIQUAL', 'Groupe_aliment', 'Sous-groupe_aliment', 'Nom_Produit_Francais', 'LCI_Name',\
          'Saisonnalite', 'Transport_par_avion_', 'Livraison', 'Livraison', 'Materiau_emballage', 'Preparation', 'DQR_Note_qualite_la_donnee_']
    clean_df = df.drop(drop_cols, axis=1)

    # Drop lines with no ingredients (all 0)
    with_ing=clean_df.iloc[:,1:].sum(axis=1)!=0
   
    clean_df = clean_df[with_ing].copy()

    return clean_df

In [69]:
data_df = clean_data(data_df_ori)

In [70]:
print(f'Number of ingredients kept for training :{len(data_df.columns)}')
print(f'Number of products kept for training:{data_df.shape[0]}')

Number of ingredients kept for training :214
Number of products kept for training:1038


In [71]:
#data_df = data_df.dropna(axis=0)
y= data_df.loc[:,'Score_unique_EF_'].copy()
X= data_df.drop('Score_unique_EF_', axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

In [77]:
print(f'X_train:{X_train.shape}')
print(f'y_train:{y_train.shape}')

X_train:(830, 213)
y_train:(830,)


### Build Pipeline

In [78]:
from sklearn.decomposition import TruncatedSVD

In [79]:
ingred_features = X_train.columns.to_list()
cat_features = []
num_features = []

# Transformer SVD
svd_transformer = Pipeline([
    ('svd', TruncatedSVD(n_components=37)),])

# transformer for numerical features
num_transformer = Pipeline([
        ('imputer_num', SimpleImputer(strategy = 'median')),
        ('scaler', StandardScaler())
    ])
# transformer for categorical features
cat_transformer = Pipeline([
        ('imputer_cat', SimpleImputer(strategy = 'most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))])

preprocessor = ColumnTransformer([
        ('dim_reduction', svd_transformer, ingred_features),
        #('categoricals', cat_transformer, cat_features),
        #('numericals', num_transformer, num_features)
    ],
    remainder = 'drop'
)
full_pipeline = Pipeline([
        ('transformation', preprocessor),
        ('lin_reg', LinearRegression())
    ])

### Train Linear Regression model

In [80]:
svd_model = TruncatedSVD(n_components=37)

In [81]:
X_truncated = svd_model.fit_transform(X_train, y_train)

In [82]:
full_pipeline.fit_transform(X_train, y_train)

TypeError: '>=' not supported between instances of 'TruncatedSVD' and 'int'

### Predict on test set

In [None]:
y_pred = full_pipeline.predict(X_train)

from sklearn.metrics import r2_score