In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder


In [2]:
# Load Social_Network_Ads.csv (path is relative to the working directory)
# into a DataFrame.
dataset = pd.read_csv(filepath_or_buffer="Social_Network_Ads.csv")

In [3]:
# Preview the first rows of the freshly loaded frame to check the columns.
dataset.head()

Unnamed: 0,User ID,Gender,Age,EstimatedSalary,Purchased
0,15624510,Male,19,19000,0
1,15810944,Male,35,20000,0
2,15668575,Female,26,43000,0
3,15603246,Female,27,57000,0
4,15804002,Male,19,76000,0


In [4]:
# Remove the "User ID" column from the frame.
# errors="ignore" keeps this cell idempotent: the result is assigned back to
# `dataset`, so without it a second run of the cell raises KeyError because
# the column has already been dropped.
dataset = dataset.drop(columns=["User ID"], errors="ignore")

In [5]:
# Preview the frame again to confirm "User ID" is no longer present.
dataset.head()

Unnamed: 0,Gender,Age,EstimatedSalary,Purchased
0,Male,19,19000,0
1,Male,35,20000,0
2,Female,26,43000,0
3,Female,27,57000,0
4,Male,19,76000,0


In [6]:
# List the remaining column names; the next cell selects features by these names.
dataset.columns

Index(['Gender', 'Age', 'EstimatedSalary', 'Purchased'], dtype='object')

In [7]:
# Features: the Age, EstimatedSalary and Gender columns, kept as a DataFrame
# so later steps can address columns by name.
independent = dataset.loc[:, ['Age', 'EstimatedSalary', 'Gender']]
# Target: the Purchased column as a 1-D NumPy array.
dependent = dataset['Purchased'].to_numpy()

In [8]:
from sklearn.model_selection import train_test_split


In [9]:
# Hold out 30% of the rows for evaluation; the fixed random_state makes the
# split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    independent,
    dependent,
    test_size=0.3,
    random_state=42,
)

In [10]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV

In [11]:
# Column-wise preprocessing used as the first pipeline step:
#   - 'Gender' is one-hot encoded, dropping the first category.
#   - 'Age' and 'EstimatedSalary' are standardised.
# The numeric columns are named explicitly instead of being picked up through
# remainder=StandardScaler(). The catch-all would silently scale any column
# added to the feature frame later, and its stored column format is changing
# between scikit-learn versions. Output column order is unchanged (encoded
# Gender first, then Age, EstimatedSalary).
preprocessor = ColumnTransformer(
    transformers=[
        ('onehot', OneHotEncoder(drop='first'), ['Gender']),
        ('scale', StandardScaler(), ['Age', 'EstimatedSalary']),
    ],
    remainder='drop',
)

In [12]:
# Bundle the preprocessor and the decision tree into a single estimator.
# The step names ('preprocess', 'dt') are the prefixes used when addressing
# hyper-parameters, e.g. "dt__criterion".
pipeline = Pipeline(
    steps=[
        ('preprocess', preprocessor),
        ('dt', DecisionTreeClassifier(random_state=42)),
    ]
)

In [13]:
# Hyper-parameter grid for the 'dt' pipeline step
# (2 x 2 x 3 x 2 = 24 combinations).
param_grid = dict(
    dt__criterion=['gini', 'entropy'],
    dt__splitter=['best', 'random'],
    dt__max_features=[None, 'sqrt', 'log2'],
    dt__class_weight=[None, 'balanced'],
)

In [14]:
# Exhaustive cross-validated search over param_grid.
# refit=True retrains the best candidate so `model` can predict directly;
# n_jobs=-1 runs the fits in parallel; verbose=3 prints progress.
model = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    refit=True,
    n_jobs=-1,
    verbose=3,
)

In [15]:
# Run the grid search on the training split only; the test split stays unseen.
model.fit(X_train,y_train)

Fitting 5 folds for each of 24 candidates, totalling 120 fits


The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).



In [16]:
# Show the hyper-parameter combination selected by the search.
model.best_params_


{'dt__class_weight': None,
 'dt__criterion': 'gini',
 'dt__max_features': None,
 'dt__splitter': 'best'}

In [17]:
# Predict labels for the held-out test split with the fitted search object.
y_pred = model.predict(X_test)

In [18]:
from sklearn.metrics import confusion_matrix,classification_report

In [19]:
# Confusion matrix on the test split (rows: true labels, columns: predictions).
confusion_matrix(y_true=y_test, y_pred=y_pred)

array([[64,  9],
       [ 9, 38]], dtype=int64)

In [42]:
# Text summary of per-class precision / recall / F1 on the test split.
report = classification_report(y_true=y_test, y_pred=y_pred)

In [44]:
# `report` is a pre-formatted multi-line string, so print() is used to render
# its line breaks instead of showing the raw repr.
print(report)

              precision    recall  f1-score   support

           0       0.88      0.88      0.88        73
           1       0.81      0.81      0.81        47

    accuracy                           0.85       120
   macro avg       0.84      0.84      0.84       120
weighted avg       0.85      0.85      0.85       120



In [48]:
import json

In [60]:
# Build the same report as a nested dict (output_dict=True) and write it to
# DT_report.json in the working directory.
report_dt = classification_report(
    y_true=y_test,
    y_pred=y_pred,
    output_dict=True,
)
with open("DT_report.json", "w") as f:
    json.dump(report_dt, f)