In [37]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold, cross_val_score, StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.tree import DecisionTreeClassifier
from tqdm import tqdm
import os

In [38]:
os.chdir("D://meridianthe4//PML//Cases//Glass_Identification")

In [39]:
glass = pd.read_csv("Glass.csv")
X, y = glass.drop('Type', axis=1), glass['Type']
le = LabelEncoder()
y = le.fit_transform(y)
lr = LogisticRegression(max_iter=5000)
kfold = KFold(n_splits=5, shuffle=True, random_state=42)
results = cross_val_score(lr, X, y, cv=kfold, scoring='accuracy')

In [40]:
np.mean(results)

np.float64(0.6118493909191584)

In [41]:
sc = StandardScaler()
X = sc.fit_transform(X)

In [42]:
results = cross_val_score(lr, X, y, cv=kfold, scoring='f1_macro')
np.mean(results)

np.float64(0.5623385723670353)

In [43]:
skfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
results = cross_val_score(lr, X, y, cv=skfold, scoring='f1_macro')
np.mean(results)

np.float64(0.5419210898318403)

In [44]:
solvers = ['newton-cg', 'lbfgs', 'newton-cholesky', 'sag', 'saga']
Cs = np.linspace(0.001, 15, 20)
penalties = ['l2', None]
scores = []
for solver in tqdm(solvers):
    for C in Cs:
        for penalty in penalties:
            if penalty is None:
                lr = LogisticRegression(max_iter=50000, solver=solver, penalty=penalty)
                result = cross_val_score(lr, X, y, cv=kfold, scoring='f1_macro')
                scores.append([solver, C, penalty, np.mean(result)])
            else:
                lr = LogisticRegression(max_iter=50000, solver=solver, C=C, penalty=penalty)
                result = cross_val_score(lr, X, y, cv=kfold, scoring='f1_macro')
                scores.append([solver, C, penalty, np.mean(result)])
df_scores = pd.DataFrame(scores, columns=['solver', 'C', 'penalty', 'score'])
df_scores.sort_values(by='score', ascending=False)

  0%|          | 0/5 [00:00<?, ?it/s]

100%|██████████| 5/5 [00:59<00:00, 11.95s/it]


Unnamed: 0,solver,C,penalty,score
4,newton-cg,1.579842,l2,0.585337
44,lbfgs,1.579842,l2,0.585337
124,sag,1.579842,l2,0.585337
84,newton-cholesky,1.579842,l2,0.585337
121,sag,0.001000,,0.581631
...,...,...,...,...
0,newton-cg,0.001000,l2,0.108327
40,lbfgs,0.001000,l2,0.108327
80,newton-cholesky,0.001000,l2,0.108327
120,sag,0.001000,l2,0.108327


In [45]:
depths = [None, 3, 4, 5, 6, 7]
min_samples = [2, 10, 0.025, 0.01, 0.05, 0.1]
min_leaf = [1, 10, 0.025, 0.01, 0.05, 0.1]
scores = []
for depth in tqdm(depths):
    for min_sample in min_samples:
        for leaf in min_leaf:
            dt = DecisionTreeClassifier(max_depth=depth, min_samples_split=min_sample, min_samples_leaf=leaf)
            result = cross_val_score(dt, X, y, cv=kfold, scoring='f1_macro')
            scores.append([depth, min_sample, leaf, np.mean(result)])

100%|██████████| 6/6 [00:03<00:00,  1.83it/s]


In [46]:
df_scores = pd.DataFrame(scores, columns=['depth', 'min_samples_split', 'min_samples_leaf', 'score'])
df_scores.sort_values(by='score', ascending=False)

Unnamed: 0,depth,min_samples_split,min_samples_leaf,score
12,,0.025,1.00,0.705257
153,6.0,10.000,0.01,0.703877
24,,0.050,1.00,0.700855
27,,0.050,0.01,0.700747
144,6.0,2.000,1.00,0.699917
...,...,...,...,...
77,4.0,2.000,0.10,0.451427
101,4.0,0.050,0.10,0.451427
95,4.0,0.010,0.10,0.451427
89,4.0,0.025,0.10,0.451427


### Stratified K-Fold

In [47]:
solvers = ['newton-cg', 'lbfgs', 'sag', 'saga']
Cs = np.linspace(0.001, 15, 20)
penalties = ['l2', None]
scores = []
for solver in tqdm(solvers):
    for C in Cs:
        for penalty in penalties:
            if penalty is None:
                lr = LogisticRegression(max_iter=50000, solver=solver, penalty=penalty)
                result = cross_val_score(lr, X, y, cv=skfold, scoring='f1_macro')
                scores.append([solver, C, penalty, np.mean(result)])
            else:
                lr = LogisticRegression(max_iter=50000, solver=solver, C=C, penalty=penalty)
                result = cross_val_score(lr, X, y, cv=skfold, scoring='f1_macro')
                scores.append([solver, C, penalty, np.mean(result)])
df_scores = pd.DataFrame(scores, columns=['solver', 'C', 'penalty', 'score'])
df_scores.sort_values(by='score', ascending=False)

100%|██████████| 4/4 [01:02<00:00, 15.66s/it]


Unnamed: 0,solver,C,penalty,score
78,lbfgs,15.000000,l2,0.604381
38,newton-cg,15.000000,l2,0.604381
118,sag,15.000000,l2,0.583526
36,newton-cg,14.210579,l2,0.583526
158,saga,15.000000,l2,0.583526
...,...,...,...,...
84,sag,1.579842,l2,0.536016
0,newton-cg,0.001000,l2,0.096895
40,lbfgs,0.001000,l2,0.096895
80,sag,0.001000,l2,0.096895


In [48]:
depths = [None, 3, 4, 5, 6, 7]
min_samples = [2, 10, 0.025, 0.01, 0.05, 0.1]
min_leaf = [1, 10, 0.025, 0.01, 0.05, 0.1]
scores = []
for depth in tqdm(depths):
    for min_sample in min_samples:
        for leaf in min_leaf:
            dt = DecisionTreeClassifier(max_depth=depth, min_samples_split=min_sample, min_samples_leaf=leaf)
            result = cross_val_score(dt, X, y, cv=skfold, scoring='f1_macro')
            scores.append([depth, min_sample, leaf, np.mean(result)])
df_scores = pd.DataFrame(scores, columns=['solver', 'C', 'penalty', 'score'])
df_scores.sort_values(by='score', ascending=False)

100%|██████████| 6/6 [00:03<00:00,  1.88it/s]


Unnamed: 0,solver,C,penalty,score
9,,10.000,0.01,0.664551
192,7.0,0.025,1.00,0.660030
183,7.0,2.000,0.01,0.659790
186,7.0,10.000,1.00,0.657055
135,5.0,0.050,0.01,0.655256
...,...,...,...,...
59,3.0,0.010,0.10,0.459169
47,3.0,10.000,0.10,0.459169
41,3.0,2.000,0.10,0.459169
71,3.0,0.100,0.10,0.459169
