In [2]:
from ucimlrepo import fetch_ucirepo 
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import KFold
import numpy as np
import featurize as ft
from functools import partial

## Download data

In [8]:
# fetch dataset 
glass_identification = fetch_ucirepo(id=42) 
  
# data (as pandas dataframes) 
data = glass_identification.data.features 
target = glass_identification.data.targets["Type_of_glass"]

data.head(), data.shape

(        RI     Na    Mg    Al     Si     K    Ca   Ba   Fe
 0  1.52101  13.64  4.49  1.10  71.78  0.06  8.75  0.0  0.0
 1  1.51761  13.89  3.60  1.36  72.73  0.48  7.83  0.0  0.0
 2  1.51618  13.53  3.55  1.54  72.99  0.39  7.78  0.0  0.0
 3  1.51766  13.21  3.69  1.29  72.61  0.57  8.22  0.0  0.0
 4  1.51742  13.27  3.62  1.24  73.08  0.55  8.07  0.0  0.0,
 (214, 9))

In [9]:
target.value_counts()

Type_of_glass
2    76
1    70
7    29
3    17
5    13
6     9
Name: count, dtype: int64

## Baseline model

In [23]:
N_SPLITS = 3
strat_kf = KFold(n_splits=N_SPLITS, shuffle=True, random_state=8888)
scores = np.empty(N_SPLITS)
for idx, (train_idx, test_idx) in enumerate(strat_kf.split(data, target)):
    X_train, X_test = data.iloc[train_idx], data.iloc[test_idx]
    y_train, y_test = target[train_idx], target[test_idx]

    clf = KNeighborsClassifier(n_neighbors=5, weights='uniform', algorithm="brute")
    clf.fit(X_train, y_train)

    preds = clf.predict(X_test)
    loss = accuracy_score(y_test, preds)
    scores[idx] = loss

print(scores.mean())

0.6356937923839333


## Featurize

In [10]:
# feats = ft.featurize(
#     data,
#     target,
#     problem_type="classification",
#     feature_depth=1,
#     mrmr_k=75,
#     swarm_particles=50,
#     swarm_iters=100,
# )

f = partial(ft.cost_funcs.classification.knn_accuracy, X=data, y=target)

ga = ft.selection.GeneticAlgorithm(
    cost_func=f, num_individuals=50, num_features=data.shape[1], max_iters=100
)

cost, genome = ga.optimize()

feats = data[data.columns[genome == 1]]

print(f"Best cost: {cost}, total features: {genome.sum()}")

Optimising feature space...:   0%|          | 0/100 [00:00<?, ?it/s]

Optimising feature space...: 100%|██████████| 100/100 [00:34<00:00,  2.89it/s]

Best cost: 0.32700834637454357, total features: 4





In [12]:
N_SPLITS = 3
strat_kf = KFold(n_splits=N_SPLITS, shuffle=True, random_state=8888)
scores = np.empty(N_SPLITS)
for idx, (train_idx, test_idx) in enumerate(strat_kf.split(feats, target)):
    X_train, X_test = feats.iloc[train_idx], feats.iloc[test_idx]
    y_train, y_test = target[train_idx], target[test_idx]

    clf = KNeighborsClassifier(n_neighbors=5, weights='uniform', p=2, leaf_size=30, algorithm="brute")
    clf.fit(X_train, y_train)

    preds = clf.predict(X_test)
    loss = accuracy_score(y_test, preds)
    scores[idx] = loss

print(scores.mean())

0.6729916536254564


In [13]:
feats = ft.featurize(
    data,
    target,
    problem_type="classification",
    feature_depth=1,
    mrmr_k=50,
    swarm_particles=50,
    swarm_iters=100,
)

f = partial(ft.cost_funcs.classification.knn_accuracy, X=data, y=target)

ga = ft.selection.GeneticAlgorithm(
    cost_func=f, num_individuals=50, num_features=data.shape[1], max_iters=50
)

cost, genome = ga.optimize()

feats = data[data.columns[genome == 1]]

print(f"Best cost: {cost}, total features: {genome.sum()}")

INFO:featurize.logging:Checking arguments to featurize function are in within acceptable bounds
INFO:featurize.logging:Inferring initial dataframe schema
INFO:featurize.logging:Featurizing dataframe at depth 1
INFO:featurize.logging:Adding numerical features
INFO:featurize.logging:Removed 0 zero variance columns
INFO:featurize.logging:Adding combination features
INFO:featurize.logging:Removed 9 zero variance columns
INFO:featurize.logging:Infering schema
INFO:featurize.logging:Selecting features using MRMR algorithm
INFO:featurize.logging:Initializing MaxRelevanceMinRedundancy class
INFO:featurize.logging:Fitting and transforming the data using the selected features
INFO:featurize.logging:Fitting mrmr algorithm to the data
INFO:featurize.logging:Setting mrmr k to 50
Pruning feature space...:   0%|          | 0/50 [00:00<?, ?it/s]INFO:featurize.logging:Starting feature pruning with 6552 features
Pruning feature space...: 100%|██████████| 50/50 [00:12<00:00,  4.12it/s]INFO:featurize.logg

Best cost: 0.32700834637454357, total features: 4





In [24]:
N_SPLITS = 3
strat_kf = KFold(n_splits=N_SPLITS, shuffle=True, random_state=8888)
scores = np.empty(N_SPLITS)
for idx, (train_idx, test_idx) in enumerate(strat_kf.split(feats, target)):
    X_train, X_test = feats.iloc[train_idx], feats.iloc[test_idx]
    y_train, y_test = target[train_idx], target[test_idx]

    clf = KNeighborsClassifier(n_neighbors=5, weights='uniform', algorithm="brute")
    clf.fit(X_train, y_train)

    preds = clf.predict(X_test)
    loss = accuracy_score(y_test, preds)
    scores[idx] = loss

print(scores.mean())

0.6729916536254564


SyntaxError: invalid syntax (2444230500.py, line 1)