In [1]:
%load_ext autoreload
%autoreload 2
    
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import cross_val_score,train_test_split
from sklearn.linear_model import Ridge
from lineartree import LinearTreeRegressor
from lib_FM import *
import random
import time
import pickle
from warnings import simplefilter

simplefilter(action="ignore", category=pd.errors.PerformanceWarning)
random.seed(10)

### READING THE DATASET

In [2]:
df=pd.read_csv('./fairlymade_products_impacts.csv')
df.columns

Index(['study_id', 'product_type', 'nb_components', 'composition',
       'raw_material_country', 'weaving_country', 'dyeing_country',
       'manufacturing_country', 'plane_in_transports', 'climate_change',
       'resource_use_fossils'],
      dtype='object')

### ENCODING CATEGORICAL VARIABLES AND BUILDING TRAIN/VALIDATION DATASETS

In [3]:
categorical_features=['product_type','nb_components']
multi_valued_features=['composition','raw_material_country','weaving_country','dyeing_country','manufacturing_country']
boolean_features=['plane_in_transports']
numerical_features=['resource_use_fossils']

FED=FeaturesEncoderDecoder(categorical_features=categorical_features,
                           multi_valued_features=multi_valued_features,
                           boolean_features=boolean_features,
                           numerical_features=numerical_features)

In [4]:
target_feature="climate_change"
df=df.dropna(subset=[target_feature]+numerical_features)
y=np.array(df[target_feature])

In [5]:
df=df.drop_duplicates()
t1=time.time()
X=FED.encode_dataframe(df,compute_labels=True)
t2=time.time()
print("encoding dataset took %f"%(t2-t1))


encoding dataset took 0.804410


In [6]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.25)
target_standard_deviation=np.std(y_train)


### TRAINING LINEAR MODEL TREES

In [7]:
best_score=None
for alpha in [0.01,1.,100.]:
    for min_samples_leaf in [0.0001,0.001,0.1]:
        for min_impurity_decrease_coeff in [0.,0.0001,0.001,0.1]:
            reg=LinearTreeRegressor(base_estimator=Ridge(alpha=alpha),
                                    min_samples_leaf=min_samples_leaf,
                                    min_impurity_decrease=min_impurity_decrease_coeff*target_standard_deviation,
                                    categorical_features=np.arange(FED.n_binary_features))
            scores=cross_val_score(reg,X_train,y_train)
            score=np.mean(scores)
            if best_score is None or best_score<score:
                best_score=score
                best_reg=reg
            print("alpha %f, min_samples_leaf %f, min_impurity_decrease_coeff %f : cv %s mean %f"%(alpha,min_samples_leaf,min_impurity_decrease_coeff,str(scores),score))
reg=best_reg

alpha 0.010000, min_samples_leaf 0.000100, min_impurity_decrease_coeff 0.000000 : cv [0.91801321 0.91351274 0.8395818  0.89297885 0.88573244] mean 0.889964
alpha 0.010000, min_samples_leaf 0.000100, min_impurity_decrease_coeff 0.000100 : cv [0.91801321 0.91351274 0.8395818  0.89297885 0.88573244] mean 0.889964
alpha 0.010000, min_samples_leaf 0.000100, min_impurity_decrease_coeff 0.001000 : cv [0.91801321 0.91351274 0.8395818  0.89297885 0.88573244] mean 0.889964
alpha 0.010000, min_samples_leaf 0.000100, min_impurity_decrease_coeff 0.100000 : cv [0.91801321 0.9135096  0.8395818  0.89297885 0.88573244] mean 0.889963
alpha 0.010000, min_samples_leaf 0.001000, min_impurity_decrease_coeff 0.000000 : cv [0.91773644 0.91405664 0.84725268 0.8930081  0.88340366] mean 0.891092
alpha 0.010000, min_samples_leaf 0.001000, min_impurity_decrease_coeff 0.000100 : cv [0.91773644 0.91405664 0.84725268 0.8930081  0.88340366] mean 0.891092
alpha 0.010000, min_samples_leaf 0.001000, min_impurity_decrease

In [9]:
reg.fit(X_train,y_train)
reg.score(X_val,y_val),reg.score(X_train,y_train)

(0.9136298727292305, 0.9153603911761565)