<a href="https://colab.research.google.com/github/mehdi-mustapha/PROJETS-JEDHA/blob/main/understand_model_with_shapash.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount the user's Google Drive at /content/drive (Colab runtime only).
# force_remount=True is passed so the mount is redone even when one already exists.
import os
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

In [None]:
# Make the project's data folder on the mounted Drive the working directory.
os.chdir('/content/drive/MyDrive/111_goldfinger/data_modeling')

In [None]:
!pip install shapash

In [None]:
!pip install matplotlib==3.4
!pip install sklearn

In [None]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore") # to avoid deprecation warnings
import sys


#Graph libraries

import matplotlib.pyplot as plt
from matplotlib.ticker import PercentFormatter
import seaborn as sns
%matplotlib inline


#Preprocessing libraries

from sklearn.model_selection import train_test_split


#Model Selection

from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestClassifier


from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score


# Register the private module under its old public path so that any
# `import sklearn.neighbors.base` resolves to sklearn.neighbors._base.
# NOTE(review): presumably needed by a dependency that still imports the old
# path — confirm it is still required.
import sklearn.neighbors._base
sys.modules['sklearn.neighbors.base'] = sklearn.neighbors._base

In [None]:
# Load the dataset (tech.csv) from the project folder on the mounted Drive.
df = pd.read_csv('/content/drive/MyDrive/111_goldfinger/data_modeling/tech.csv')

In [None]:
# Display one randomly drawn row as a quick look at the loaded columns.
df.sample()

In [None]:
# Remove the 'Unnamed: 0' column (presumably an index saved along with the CSV).
# errors='ignore' keeps the cell idempotent: re-running it once the column is
# already gone no longer raises KeyError.
df = df.drop(columns=['Unnamed: 0'], errors='ignore')

In [None]:
# Keep only the columns used for modelling: the features, the 'is_top_ten'
# target and 'ref_sn' (used for row filtering in the next cells).
modelling_columns = [
    'content_len', 'title_len_char', 'h1_len', 'nb_h3', 'nb_links',
    'has_canonical', 'is_top_ten', 'content_score', 'title_score',
    'ref_sn',
]
df = df[modelling_columns]

In [None]:
# Discard the rows that have no 'ref_sn' value.
# The result is rebound instead of using inplace=True: df comes from a column
# selection, and mutating such a derived frame in place is the pattern that
# triggers pandas' SettingWithCopyWarning.
df = df.dropna(subset=['ref_sn'])

In [None]:
# Keep only the rows whose 'ref_sn' is different from zero.
ref_sn_is_nonzero = df['ref_sn'].ne(0)
df = df.loc[ref_sn_is_nonzero]

# **Part II: Preprocessing & model selection**
## **Pipelines**
Splitting features & target

In [None]:
# Column the classifier has to predict.
target = 'is_top_ten'

# Features: every column except the target. Labels: the target column alone.
X = df.drop(columns=[target])
Y = df[target]


def _label_to_number(value):
    """Convert a string label to float after dropping its first character.

    Non-string values are returned unchanged.
    NOTE(review): presumably some labels are stored as text with a
    one-character prefix — confirm against the CSV.
    """
    if type(value) is str:
        return float(value[1:])
    return value


Y = Y.apply(_label_to_number)

# Hold out 20% of the rows for testing; the seed makes the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42
)

print("...train test split Done !")

# **Part III: Model fitting**

In [None]:
# Instantiate and fit the classifier, then evaluate it on the held-out test set.

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import recall_score

# random_state is fixed (same seed as the train/test split) so that the forest,
# the metrics printed below and any later re-fit of this estimator are
# reproducible from one run to the next.
model = RandomForestClassifier(
    max_depth=15,
    n_estimators=300,
    max_leaf_nodes=1024,
    random_state=42,
)
model.fit(X_train, Y_train)

# Predictions on both splits (the train predictions are kept for inspection).
Y_train_pred    = model.predict(X_train)
Y_test_pred     = model.predict(X_test)

# Test-set metrics. For a classifier, model.score() is the mean accuracy, so
# `score` and `accuracy_test` report the same quantity.
score           = model.score(X_test, Y_test)
accuracy_test   = accuracy_score(Y_test, Y_test_pred)
recall_test     = recall_score(Y_test, Y_test_pred)
f1_score_test   = f1_score(Y_test, Y_test_pred)
print('model : ', model, ' - score : ', score,  'accuracy_test : ', accuracy_test, 'recall : ', recall_test, 'f1_score_test' , f1_score_test)

# **Understand my model with Shapash**

Declare and Compile SmartExplainer

In [None]:
from shapash.explainer.smart_explainer import SmartExplainer

In [None]:
# Create the explainer with no arguments; the data, model and predictions are
# supplied later through xpl.compile().
xpl = SmartExplainer()

In [None]:
# Test-set predictions of the already-fitted model, as a one-column frame
# ('pred') indexed like X_test so the rows line up with the explained data.
# The model is deliberately NOT re-fitted here: another fit() would only repeat
# the training work and, unless the forest is seeded, produce a model different
# from the one evaluated above.
Y_pred = pd.DataFrame(model.predict(X_test), columns=['pred'], index=X_test.index)

In [None]:
# Hand the test features, the fitted model and its predictions to the explainer.
# The existing fitted model is passed as-is: calling fit() again here would
# retrain it and, for an unseeded forest, yield a model that no longer matches
# the predictions stored in Y_pred.
xpl.compile(
    x       = X_test,
    model   = model,
    y_pred  = Y_pred
            )

In [None]:
# Global view: plot the feature importances computed by the compiled explainer.
xpl.plot.features_importance()

**Understand how a feature contributes**

In [None]:
# Contribution plot for the 'content_len' feature.
xpl.plot.contribution_plot("content_len")

In [None]:
# Contribution plot for the 'nb_links' feature.
xpl.plot.contribution_plot('nb_links')

In [None]:
# Contribution plot for the 'ref_sn' feature.
xpl.plot.contribution_plot('ref_sn')

In [None]:
# Attach the predictions to the explainer, then draw the contribution plot for
# the 'content_score' feature.
# NOTE(review): Y_pred was already passed to xpl.compile() earlier in this
# notebook, so this add() looks redundant — confirm before removing.
xpl.add(y_pred=Y_pred)
xpl.plot.contribution_plot(col='content_score')

In [None]:
# Plot the top 5 feature interactions found by the explainer.
xpl.plot.top_interactions_plot(nb_top_interactions=5)