<a href="https://colab.research.google.com/github/mehdi-mustapha/PROJETS-JEDHA/blob/main/model_with_semantic_and_tech_attributes.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Imports**

In [None]:
# Mount Google Drive so that files stored there are reachable under /content/drive.
import os
from google.colab import drive
# force_remount=True asks Colab to mount again even if the drive is already mounted.
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
# Make the project's data folder on Drive the working directory for the following cells.
os.chdir('/content/drive/MyDrive/111_goldfinger/data_modeling')

In [None]:
# Gradient-boosting libraries used in the model comparison further down.
# NOTE(review): versions are not pinned, so results may vary with the installed release.
!pip install lightgbm
!pip install catboost

In [None]:
!pip install matplotlib==3.4
!pip install sklearn
!pip install missingpy
!pip install xgboost

In [None]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore") # to avoid deprecation warnings
import sys


#Graph libraries

import matplotlib.pyplot as plt
from matplotlib.ticker import PercentFormatter
import seaborn as sns
%matplotlib inline


#Preprocessing libraries

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import  OneHotEncoder, StandardScaler,LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import  OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

#Model Selection
import lightgbm as lgb
from xgboost import XGBRegressor
from sklearn.ensemble import (RandomForestClassifier, AdaBoostClassifier, 
                              GradientBoostingClassifier, ExtraTreesClassifier)
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
import xgboost as xgb

from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error

from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score


# Register `sklearn.neighbors._base` under the alias `sklearn.neighbors.base` before
# importing missingpy.
# NOTE(review): presumably missingpy still imports the old public module path, which
# current scikit-learn no longer provides - confirm against the installed versions.
import sklearn.neighbors._base
sys.modules['sklearn.neighbors.base'] = sklearn.neighbors._base

from missingpy import KNNImputer

In [None]:
# Load the modeling dataset (tech.csv) from the project folder on Google Drive.
df = pd.read_csv('/content/drive/MyDrive/111_goldfinger/data_modeling/tech.csv')

In [None]:
# Display one randomly drawn row as a quick look at the loaded data.
df.sample()

In [None]:
# Remove the 'Unnamed: 0' column.
# errors='ignore' keeps the cell idempotent: re-running it after the column is already
# gone no longer raises a KeyError.
df = df.drop(columns=['Unnamed: 0'], errors='ignore')

In [None]:
# Display a random row again to check the frame after dropping 'Unnamed: 0'.
df.sample()

## **Distribution plot**

In [None]:
plt.rcParams['figure.figsize'] = [6, 2]
%matplotlib inline

# sample time series data
df2 = df.copy()


# create distplots
for c in df2.columns:
    plt.figure()             # <==================== here!
    sns.distplot(df2[c])

## **Plot pairwise relationships in the dataset**

In [None]:
# Pairwise scatter plots of all columns (corner=True draws only the lower triangle),
# with a KDE curve on the diagonal. The trailing ';' hides the returned object's repr.
sns.pairplot(df,corner=True, diag_kind="kde");

# **Part II Preprocessing & model selection**

## Detection of highly correlated features

In [None]:
# Correlation matrix of the features, displayed as an annotated heatmap.
# Numeric/boolean columns are selected explicitly: since pandas 2.0, DataFrame.corr()
# raises on non-numeric columns instead of silently ignoring them.
corr = df.select_dtypes(include=['number', 'bool']).corr()
plt.figure(figsize=(16,12))
sns.heatmap(corr, annot=True, cmap='YlGnBu');

In [None]:
# Identify the most correlated feature pairs.
# Only the strict upper triangle of the (symmetric) matrix is kept, so every unordered
# pair appears exactly once and the diagonal (self-correlation) is excluded by position.
# This replaces the earlier `coeff != 1` / `drop_duplicates(subset=['coeff'])` filter,
# which also discarded perfectly correlated distinct features and any two different
# pairs that happened to share the same coefficient.
pair_mask = np.triu(np.ones(corr.shape, dtype=bool), k=1)
correlations = (
    corr.abs()
        .where(pair_mask)
        .stack()
        .rename_axis(['feature1', 'feature2'])
        .reset_index(name='coeff')
)

# Keep pairs with a high absolute correlation; `feature2` is always the column that
# comes later in the frame.
top_correlations = correlations[correlations.coeff > 0.8]
top_correlations

In [None]:
# Second member of each highly correlated pair: these are the columns to remove.
to_delete = top_correlations['feature2'].values
to_delete

In [None]:
# Conclusion: drop the features that are too strongly correlated with others.
# Re-assigning (instead of inplace=True) together with errors='ignore' makes the cell
# safe to re-run once the columns have already been removed.
df = df.drop(columns=to_delete, errors='ignore')

## Pipelines

In [None]:
# Summary of the remaining columns (dtypes and non-null counts).
df.info()

In [None]:
# Percentage of missing values per column, highest first; only affected columns are shown.
nan_counts = df.isnull().sum().sort_values(ascending=False)
nan_features = (100 * nan_counts / df.shape[0]).to_frame(name='nan_percent')
nan_features[nan_features['nan_percent'] > 0]

In [None]:
# (rows, columns) before the rows with missing values are removed.
df.shape

In [None]:
# Names of the columns that contain at least one missing value.
nan_features[nan_features.nan_percent > 0].index

In [None]:
# Remove every row that has a missing value in one of the columns flagged above.
columns_with_nan = nan_features.loc[nan_features['nan_percent'] > 0].index
df = df.dropna(subset=columns_with_nan)

In [None]:
# (rows, columns) after the rows with missing values were removed.
df.shape

Several feature selections were tested in order to find the best model.
In the end, we exclude the tech spend and the Majestic rank from the analysis (the latter only covers the top 1M websites worldwide and cannot substitute for the 'Authority Score').
One path for future improvement would be to test the integration of external data (Ahrefs, Semrush).

In [None]:
# Keep only the retained columns (features plus the `is_top_ten` column).
selected_columns = [
    'content_len',
    'title_len_char',
    'h1_len',
    'nb_h3',
    'nb_links',
    'has_canonical',
    'is_top_ten',
    'content_score',
    'title_score',
    'ref_sn',
]
df = df[selected_columns]

Splitting features & target

In [None]:
# Separate the target from the features, then hold out 20% of the rows as a test set.
target = 'is_top_ten'

X = df.drop(columns=target)
Y = df[target]


def _label_to_number(value):
    """Convert a string label to float after dropping its first character; return other values as-is."""
    if type(value) == str:
        return float(value[1:])
    return value


Y = Y.apply(_label_to_number)

# A fixed random_state keeps the train/test split identical between runs.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

# Features become numpy arrays and targets plain Python lists before reaching scikit-learn.
print("Convert pandas DataFrames to numpy arrays...")
X_train, X_test = X_train.values, X_test.values
Y_train, Y_test = Y_train.tolist(), Y_test.tolist()
print("...Numpy conversion to arrays Done !")

In [None]:
# Automatically detect the names and positions of numeric vs. categorical features.
# A column counts as numeric when its dtype name contains 'float' or 'int'; every other
# dtype is treated as categorical. Positions are recorded alongside names because the
# train/test features have already been converted to plain arrays.
numeric_features = []
numeric_indices = []
categorical_features = []
categorical_indices = []
# Series.iteritems() was removed in pandas 2.0; items() is the supported equivalent.
for idx, (i, t) in enumerate(X.dtypes.items()):
    if ('float' in str(t)) or ('int' in str(t)):
        numeric_features.append(i)
        numeric_indices.append(idx)
    else:
        categorical_features.append(i)
        categorical_indices.append(idx)

print('Found numeric features ', numeric_features,' at positions ', numeric_indices)
print('Found categorical features ', categorical_features,' at positions ', categorical_indices)

In [None]:
# Build the preprocessing pipelines and apply them to the train and test sets.
# scikit-learn provides its own KNNImputer (sklearn.impute, since 0.22), so this cell
# does not need missingpy, which can only be imported through a sys.modules alias.
from sklearn.impute import KNNImputer

# Numeric features: KNN imputation of missing values, then standardisation.
numeric_transformer = Pipeline(steps=[
    ('imputer', KNNImputer()),
    ('scaler', StandardScaler()),
])

# Categorical features: fill missing values, then one-hot encode.
categorical_transformer = Pipeline(
    steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')), # missing values will be replaced by most frequent value
    ('encoder', OneHotEncoder(drop='first')) # first column will be dropped to avoid creating correlations between features
    ])

# Use ColumnTransformer to make a preprocessor object that describes all the treatments to be done.
# Its output places the numeric columns first, followed by the encoded categorical columns.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_indices),
        ('cat', categorical_transformer, categorical_indices)
    ])

# NOTE: X_train / X_test are overwritten below, so this cell must not be re-run on its own;
# re-run the train/test split cell first to get the raw arrays back.

# Preprocessings on train set
print("Performing preprocessings on train set...")
X_train = preprocessor.fit_transform(X_train)
print('preprocessings on train set...Done.')


# Preprocessings on test set
print("Performing preprocessings on test set...")
X_test = preprocessor.transform(X_test) # Don't fit again !! The test set is used for validating decisions
# we made based on the training set, therefore we can only apply transformations that were parametered using the training set.
# Otherwise this creates what is called a leak from the test set which will introduce a bias in all your results.
print('preprocessings on test set...Done.')

# **Part III: Model fitting**

In [None]:
# Instantiate the candidate classifiers (default hyper-parameters).

from sklearn.ensemble import (RandomForestClassifier, AdaBoostClassifier, 
                              GradientBoostingClassifier, ExtraTreesClassifier)
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
import xgboost as xgb
from sklearn.tree import DecisionTreeClassifier
from catboost import CatBoostClassifier, Pool
from xgboost import XGBClassifier
import lightgbm as lgb

# Every model gets a fixed seed (the same value as the train/test split) so that the
# comparison gives the same ranking after a kernel restart.
gbm = xgb.XGBClassifier(random_state=42)
rf = RandomForestClassifier(random_state=42)
et =  ExtraTreesClassifier(random_state=42)
ada = AdaBoostClassifier(random_state=42)
gb = GradientBoostingClassifier(random_state=42)
lr = LogisticRegression(random_state=42)
dt = DecisionTreeClassifier(random_state=42)
# NOTE: this rebinds `lgb` from the lightgbm module to the estimator; the import above
# restores the module whenever the cell is re-run.
lgb = lgb.LGBMClassifier(random_state=42)
# verbose=0 silences CatBoost's per-iteration training log.
cb = CatBoostClassifier(random_seed=42, verbose=0)

In [None]:
# Fit every candidate model and collect train/test accuracy and F1 for comparison.
classifier = []
for model in [gbm, rf, et, ada, gb, lr, dt, lgb, cb]:

  model.fit(X_train, Y_train)
  Y_train_pred    = model.predict(X_train)
  Y_test_pred     = model.predict(X_test)
  score           = model.score(X_test, Y_test)

  # Classification metrics
  accuracy_train  = accuracy_score(Y_train, Y_train_pred)
  accuracy_test   = accuracy_score(Y_test, Y_test_pred)
  f1_score_train  = f1_score(Y_train, Y_train_pred)
  f1_score_test   = f1_score(Y_test, Y_test_pred)

  print()
  print('model : ', model, ' - score : ', score, 'accuracy_train : ', accuracy_train, 'accuracy_test : ', accuracy_test)

  # Store the class name rather than the estimator object: estimator objects inside a
  # DataFrame cell break or bloat the table display. The fitted models remain
  # available through their own variables (gbm, rf, ...).
  classifier.append({'classifier'     : type(model).__name__,
                     'score'          : score,
                     'accuracy_train' : accuracy_train,
                     'accuracy_test'  : accuracy_test,
                     'f1_score_train' : f1_score_train,
                     'f1_score_test'  : f1_score_test
                     })

In [None]:
# Rank the candidates by test F1: print the three best, then display the whole table.
all_scores = pd.DataFrame(classifier)
ranked_scores = all_scores.sort_values(by='f1_score_test', ascending=False)
print(ranked_scores['classifier'].values[:3])
ranked_scores

In [None]:
# There is a problem with the DataFrame display.
# We will consider the random forest to be the best model.

# Is it possible to reduce overfitting on the best model?

In [None]:
from sklearn.model_selection import cross_val_score,GridSearchCV

In [None]:
# Grid search over the forest size and depth with 5-fold cross-validation.
# random_state makes the search reproducible.
model = RandomForestClassifier(random_state=42)
param_grid = { 
    'n_estimators': [200,250, 300, 500],
    'max_depth' : [15, 20],
    # 'criterion' :['gini', 'entropy'],
}
# n_jobs=-1 evaluates the parameter/fold combinations in parallel on all available cores.
m2 = GridSearchCV(estimator=model, param_grid=param_grid, cv= 5, n_jobs=-1)
m2.fit(X_train, Y_train)

In [None]:
# Hyper-parameter combination selected by the grid search.
m2.best_params_

In [None]:
# Evaluate the random forest selected by the grid search.
# `m2.best_estimator_` is the forest refitted on the whole training set with the best
# hyper-parameters, so nothing has to be copied by hand from `m2.best_params_`.
# Requires the grid-search cell to have been run.
model = m2.best_estimator_

Y_train_pred    = model.predict(X_train)
Y_test_pred     = model.predict(X_test)
score           = model.score(X_test, Y_test)

# Classification metrics
accuracy_train  = accuracy_score(Y_train, Y_train_pred)
accuracy_test   = accuracy_score(Y_test, Y_test_pred)
f1_score_train  = f1_score(Y_train, Y_train_pred)
f1_score_test   = f1_score(Y_test, Y_test_pred)

print()
print('model : ', model, ' - score : ', score, 'accuracy_train : ', accuracy_train, 'accuracy_test : ', accuracy_test)

# Train metrics are reported next to the test metrics so the train/test gap (overfitting)
# can be read directly. The class name is stored instead of the estimator object so the
# table renders properly.
classifier = [{'classifier'     : type(model).__name__,
               'score'          : score,
               'accuracy_train' : accuracy_train,
               'accuracy_test'  : accuracy_test,
               'f1_score_train' : f1_score_train,
               'f1_score_test'  : f1_score_test
               }]
pd.DataFrame(data=classifier)

In [None]:
# Feature importances of the fitted forest, as percentages, highest first.
# The labels must follow the column order produced by the ColumnTransformer - numeric
# columns first, then the one-hot encoded categorical columns - not the original order
# of the DataFrame columns, otherwise importances are attributed to the wrong features.
feature_names = list(numeric_features)
if categorical_features:
    encoder = preprocessor.named_transformers_['cat'].named_steps['encoder']
    if hasattr(encoder, 'get_feature_names_out'):
        feature_names += list(encoder.get_feature_names_out(categorical_features))
    else:  # older scikit-learn releases only expose get_feature_names
        feature_names += list(encoder.get_feature_names(categorical_features))

assert len(feature_names) == len(model.feature_importances_), \
    "feature names do not match the number of model inputs"

feat_importances = pd.Series(model.feature_importances_, index=feature_names).to_frame(name='feature_importance')
feat_importances.sort_values(by='feature_importance', ascending=False).mul(100)

In [None]:
# Horizontal bar chart of the importances as rounded percentages, sorted ascending,
# with each value written in white at the centre of its bar.
importance_pct = feat_importances.sort_values(by='feature_importance', ascending=True).mul(100).round()
ax = importance_pct.plot(kind='barh', figsize=(12,8))
for bars in ax.containers:
    ax.bar_label(bars, label_type='center', color="white")