# Les imports

In [1]:
import pandas as pd
import numpy as np
import sklearn 
import math
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler

# Présentation du dataset

## Les colonnes

* Le dataset a 10 attributs numériques et 8 attributs catégoriels:
 * les 10 attributs numériques: 
    * `BounceRates`
    * `ExitRates`
    * `PageValues`
    * `SpecialDay`
    * `Month`
    * `OperatingSystems`
    * `Browser`
    * `Region`
    * `TrafficType`
    * `VisitorType`
  * 8 attributs catégoriels:
      * `Revenue`: Étiquette de classe
      * `Administrative` : Nombre de pages de type Administratif visitées par l'utilisateur
      * `Administrative_Duration`: Le temps total passé sur les pages Administratif
      * `Informational` : Nombre de pages de type Information visitées par l'utilisateur
      * `Informational_Duration` : Le temps total passé sur les pages Information
      * `ProductRelated`: Nombre de pages de type ProductRelated visitées par l'utilisateur
      * `ProductRelated_Duration` : Le temps total passé sur les pages ProductRelated
      * `Weekend` : Indique si la visite a eu lieu pendant le week-end

## Récupération du dataset

In [2]:
# Load the dataset from the CSV file expected in the current working directory.
csv_path = "./online_shoppers_intention.csv"
df = pd.read_csv(csv_path)

FileNotFoundError: [Errno 2] No such file or directory: './online_shoppers_intention.csv'


## Infos du dataframe


In [None]:
# Print a summary of the dataframe (columns, non-null counts, dtypes).
df.info()

## Premières lignes

In [None]:
# Display the first rows of the dataframe.
df.head()


In [None]:
# Descriptive statistics of the raw data, before any cleaning.
df.describe()

# Nettoyage des données

## Le nombre de nan par colonne

In [None]:
# Number of missing values in each column (isna is the canonical name of isnull).
df.isna().sum()

## Exemple avec la colonne Administrative

In [None]:
# Collect the missing entries of the Administrative column.
# A boolean isna() mask replaces the manual math.isnan loop, which raises
# TypeError on any non-float entry (e.g. None or a string).
administrative = df['Administrative']
val_nan = administrative[administrative.isna()].tolist()

# The original called .format() on a string without placeholder (a no-op);
# the printed text is unchanged.
print('Il y a :', len(val_nan))
val_nan

## Valeurs manquantes: `Administrative`, `Informational`, `ProductRelated`

### Processus :

* Remplacer les 0 par NaN
* Puis remplacer les NaN par la `médiane`

### Nombre de none dans les 3 colonnes

In [None]:
# The three page-count columns cleaned in the following cells.
val_nominatif = [
    'Administrative',
    'Informational',
    'ProductRelated',
]

# Missing-value count for each of those columns, in the same order.
cel_not_null = [df[column].isna().sum() for column in val_nominatif]
cel_not_null

# dtype of the Administrative column.
df['Administrative'].dtype

### Remplacement des zéro par none

In [None]:
# Turn zeros into NaN in the page-count columns so they are imputed later.
# A single assignment back into df replaces the side-effect list
# comprehension and the chained `df[col].replace(..., inplace=True)`, which
# does not modify df when pandas Copy-on-Write is active.
df[val_nominatif] = df[val_nominatif].replace(0.0, np.nan)

In [None]:
# Missing-value count per page-count column after the zero replacement.
[df[column].isna().sum() for column in val_nominatif]

### On remplace toutes les none par la médiane

In [None]:
# Fill the missing page counts with the median of their own column.
for cols in val_nominatif:
    column = df[cols]
    median_value = column.median()
    print(median_value)
    df[cols] = column.fillna(median_value)

# Missing-value count per column after the imputation.
[df[column].isna().sum() for column in val_nominatif]

### On vérifie s'il existe des valeurs -1 dans la colonne Informational_Duration

In [None]:
# Entries of Informational_Duration that are equal to -1, and how many there are.
informational_duration = df['Informational_Duration']
list_val = informational_duration[informational_duration == -1].tolist()
len(list_val)

### On remplace les valeurs -1 par none dans val_imputer

In [None]:
# Columns in which the value -1 is converted to NaN before imputation.
val_imputer = ['Administrative_Duration', 'Informational_Duration', 'ProductRelated_Duration', 'BounceRates', 'ExitRates']

# One vectorised replace over all the columns. The original looped over every
# row and re-ran a whole-column replace for each -1 found, using chained
# `inplace=True`, which does not modify df when pandas Copy-on-Write is active.
df[val_imputer] = df[val_imputer].replace(-1, np.nan)


### On teste pour Informational_Duration

In [None]:
# Display the Informational_Duration column.
df['Informational_Duration']

### On compte les none

In [None]:
# Missing-value count for each column listed in val_imputer.
[df[column].isna().sum() for column in val_imputer]

### On remplace les NaN par la moyenne des colonnes de la liste val_imputer

In [None]:
# Fill the missing values of each val_imputer column with that column's mean.
for i in val_imputer:
    mean_value = df[i].mean()
    # Fill the column being processed. The original read `df[cols]`, a stale
    # loop variable from an earlier cell, which overwrote every column in
    # val_imputer with the data of another column.
    df[i] = df[i].fillna(mean_value)

### On vérifie qu'il n'y a plus de valeur nulle.

In [None]:
# Missing-value count per val_imputer column after the imputation.
[df[column].isna().sum() for column in val_imputer]

# Description du dataframe après nettoyage

In [None]:
# Descriptive statistics again, this time on the cleaned dataframe.
df.describe()

# Visualisation

## Bounce & Exit Rates Vs Revenue

In [None]:
# Scatter plot of ExitRates against BounceRates, one panel per Revenue value;
# colour encodes Revenue and marker style encodes Weekend.
sns.relplot(
    data=df,
    x="BounceRates",
    y="ExitRates",
    hue="Revenue",
    style="Weekend",
    col="Revenue",
)

## Visitor type & Exit Rates Vs Revenue

In [None]:
# Box plots of ExitRates per VisitorType, split by Weekend, one panel per
# Revenue value. The trailing semicolon is kept from the original cell.
sns.catplot(
    data=df,
    kind="box",
    x="VisitorType",
    y="ExitRates",
    hue="Weekend",
    col="Revenue",
);

## Carte de corrélation

In [None]:
import matplotlib.pyplot as plt

# Large figure so the annotated cells of the heatmap stay readable.
fig, ax = plt.subplots(figsize=(15, 10))

# numeric_only=True restricts the correlation matrix to numeric/boolean
# columns. Month and VisitorType are only label-encoded in a later cell, so
# they are presumably still text here, and pandas >= 2.0 raises on
# non-numeric columns unless this flag is set.
sns.heatmap(df.corr(numeric_only=True), annot=True, fmt='.1g', linewidths=.5, ax=ax)

## Mise à l'échelle

In [None]:
# Columns that are left unscaled.
Categorical_variables = ['Weekend', 'Revenue', 'Administrative', 'Informational', 'ProductRelated', 'SpecialDay',
                         'OperatingSystems', 'Browser', 'Region', 'Month', 'TrafficType', 'VisitorType']

# Every remaining column is standardised.
scale = [feature for feature in df.columns if feature not in Categorical_variables]

scaler = StandardScaler()
# Fit AND apply the scaler. The original only called scaler.fit(), so the
# dataframe used for modelling was never actually scaled.
# NOTE(review): the scaler is fitted on the full dataset, before the
# train/test split; fit it on the training rows only to avoid leakage.
df[scale] = scaler.fit_transform(df[scale])

### Labelisation

In [None]:
from sklearn.preprocessing import LabelEncoder

# Columns whose values are replaced by integer codes.
encoded_features = ['Month', 'VisitorType']
label_encoder = LabelEncoder()

# The same encoder object is re-fitted on each column in turn, so after the
# loop it only holds the classes of the last column.
for col in encoded_features:
    codes = label_encoder.fit_transform(df[col])
    df[col] = codes

df.head()

### Train et test 

In [None]:
from sklearn.model_selection import train_test_split

# Revenue is the target; it and three other columns are left out of the features.
excluded_columns = ['SpecialDay', 'VisitorType', 'Weekend', 'Revenue']
y = df['Revenue']
X = df.drop(columns=excluded_columns)

# 80 % of the rows go to training; the seed is fixed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.8,
    random_state=1,
)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

### Sélection du bon modèle par cross-validation

In [None]:
from sklearn.model_selection import cross_val_score

# Candidate classifiers compared by 10-fold cross-validated accuracy.
logreg_cv = LogisticRegression(max_iter=10000, solver='lbfgs', random_state=0)
dt_cv = DecisionTreeClassifier()
rt_cv = RandomForestClassifier()
knn_cv = KNeighborsClassifier()

cv_models = {
    'Logistic_Regression': logreg_cv,
    'Decision_Tree': dt_cv,
    'RandomForest': rt_cv,
    'KNN': knn_cv,
}

# One "<name>: <mean accuracy in percent>%" string per model, in dict order.
scores = []
for model_name, model in cv_models.items():
    mean_accuracy = cross_val_score(model, X, y, cv=10, scoring='accuracy').mean()
    scores.append(model_name + ": " + str(mean_accuracy * 100) + '%')
scores

In [None]:
# Each entry of `scores` has the form "<model name>: <accuracy>%". Compare on
# the numeric accuracy: a plain max() on the strings is lexicographic and
# returns the entry whose model name sorts last, whatever its score.
max(scores, key=lambda entry: float(entry.rsplit(": ", 1)[1].rstrip("%")))