In [38]:
import pandas as pd
from openai.embeddings_utils import plot_multiclass_precision_recall
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.linear_model import Lasso, RidgeClassifier
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier
from ast import literal_eval
import numpy as np
from sklearn.tree import DecisionTreeClassifier

In [39]:
import sys

if not sys.warnoptions:
    import warnings

    warnings.simplefilter("ignore", category=UserWarning)
    warnings.simplefilter("ignore", category=FutureWarning)

In [40]:
data = pd.read_csv("input/gpt.csv")
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 104 entries, 0 to 103
Data columns (total 8 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   name       104 non-null    object 
 1   cluster    104 non-null    int64  
 2   number     104 non-null    float64
 3   hours      104 non-null    float64
 4   prefix     104 non-null    object 
 5   combined   104 non-null    object 
 6   n_tokens   104 non-null    int64  
 7   embedding  104 non-null    object 
dtypes: float64(2), int64(2), object(4)
memory usage: 6.6+ KB


In [41]:
data["embedding"] = data.embedding.apply(literal_eval).apply(np.array)

In [42]:
# remove all clusters that have only one member
data = data.groupby("cluster").filter(lambda x: len(x) > 1)

data.groupby("cluster").count().name.sum()

76

In [43]:
X_train, X_test, y_train, y_test = train_test_split(
    list(data.embedding.values),
    data.cluster,
    test_size=0.2,
    random_state=42,
    stratify=data.cluster.values,
)

# train random forest classifier
clf = RandomForestClassifier(n_estimators=100)
clf.fit(X_train, y_train)
preds = clf.predict(X_test)
probas = clf.predict_proba(X_test)

report = classification_report(y_test, preds, zero_division=0)
print(report)

              precision    recall  f1-score   support

           1       0.00      0.00      0.00         1
           2       1.00      1.00      1.00         1
           3       1.00      1.00      1.00         1
           4       1.00      1.00      1.00         2
           5       1.00      1.00      1.00         1
           6       0.67      1.00      0.80         2
           7       1.00      1.00      1.00         1
           8       1.00      1.00      1.00         2
           9       0.50      1.00      0.67         1
          10       1.00      0.50      0.67         2
          26       1.00      1.00      1.00         1
          43       0.00      0.00      0.00         1

    accuracy                           0.81        16
   macro avg       0.76      0.79      0.76        16
weighted avg       0.80      0.81      0.79        16



In [44]:
clf = DecisionTreeClassifier(random_state=0)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
report = classification_report(y_test, y_pred, zero_division=0)
print(report)

Accuracy: 0.25
              precision    recall  f1-score   support

           1       0.00      0.00      0.00         1
           2       0.00      0.00      0.00         1
           3       0.00      0.00      0.00         1
           4       0.67      1.00      0.80         2
           5       1.00      1.00      1.00         1
           6       1.00      0.50      0.67         2
           7       0.00      0.00      0.00         1
           8       0.00      0.00      0.00         2
           9       0.00      0.00      0.00         1
          10       0.00      0.00      0.00         2
          26       0.00      0.00      0.00         1
          43       0.00      0.00      0.00         1

    accuracy                           0.25        16
   macro avg       0.22      0.21      0.21        16
weighted avg       0.27      0.25      0.25        16



In [45]:
clf = SVC(kernel="linear", C=1.0, random_state=0)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print("Accuracy: %.2f" % accuracy_score(y_test, y_pred))
report = classification_report(y_test, y_pred, zero_division=0)
print(report)

Accuracy: 0.38
              precision    recall  f1-score   support

           1       0.00      0.00      0.00         1
           2       0.00      0.00      0.00         1
           3       0.00      0.00      0.00         1
           4       0.20      1.00      0.33         2
           5       0.00      0.00      0.00         1
           6       1.00      1.00      1.00         2
           7       0.00      0.00      0.00         1
           8       0.50      1.00      0.67         2
           9       0.00      0.00      0.00         1
          10       0.00      0.00      0.00         2
          26       0.00      0.00      0.00         1
          43       0.00      0.00      0.00         1

    accuracy                           0.38        16
   macro avg       0.14      0.25      0.17        16
weighted avg       0.21      0.38      0.25        16



In [46]:
clf = KNeighborsClassifier(n_neighbors=int(np.sqrt(len(X_train))))
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
report = classification_report(y_test, y_pred, zero_division=0)
print(report)

Accuracy: 0.75
              precision    recall  f1-score   support

           1       0.00      0.00      0.00         1
           2       0.50      1.00      0.67         1
           3       0.50      1.00      0.67         1
           4       1.00      0.50      0.67         2
           5       1.00      1.00      1.00         1
           6       1.00      1.00      1.00         2
           7       1.00      1.00      1.00         1
           8       1.00      1.00      1.00         2
           9       0.50      1.00      0.67         1
          10       1.00      0.50      0.67         2
          26       1.00      1.00      1.00         1
          43       0.00      0.00      0.00         1

    accuracy                           0.75        16
   macro avg       0.71      0.75      0.69        16
weighted avg       0.78      0.75      0.73        16



In [47]:
clf = LogisticRegression(random_state=0, solver="lbfgs", multi_class="multinomial")
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
report = classification_report(y_test, y_pred, zero_division=0)
print(report)

Accuracy: 0.375
              precision    recall  f1-score   support

           1       0.00      0.00      0.00         1
           2       0.00      0.00      0.00         1
           3       0.00      0.00      0.00         1
           4       0.40      1.00      0.57         2
           5       0.00      0.00      0.00         1
           6       0.67      1.00      0.80         2
           7       0.00      0.00      0.00         1
           8       0.25      1.00      0.40         2
           9       0.00      0.00      0.00         1
          10       0.00      0.00      0.00         2
          26       0.00      0.00      0.00         1
          43       0.00      0.00      0.00         1

    accuracy                           0.38        16
   macro avg       0.11      0.25      0.15        16
weighted avg       0.16      0.38      0.22        16



In [48]:
clf = LinearDiscriminantAnalysis()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
report = classification_report(y_test, y_pred, zero_division=0)
print(report)

Accuracy: 0.8125
              precision    recall  f1-score   support

           1       0.00      0.00      0.00         1
           2       0.50      1.00      0.67         1
           3       1.00      1.00      1.00         1
           4       1.00      1.00      1.00         2
           5       1.00      1.00      1.00         1
           6       1.00      1.00      1.00         2
           7       1.00      1.00      1.00         1
           8       1.00      0.50      0.67         2
           9       0.50      1.00      0.67         1
          10       1.00      0.50      0.67         2
          26       1.00      1.00      1.00         1
          27       0.00      0.00      0.00         0
          43       1.00      1.00      1.00         1

    accuracy                           0.81        16
   macro avg       0.77      0.77      0.74        16
weighted avg       0.88      0.81      0.81        16



In [56]:
knn_pipe = Pipeline(
    [
        ("knn", KNeighborsClassifier()),
    ]
)

ada_pipe = Pipeline([("ada", AdaBoostClassifier())])

boost_pipe = Pipeline([("boost", GradientBoostingClassifier())])

lasso_pipe = Pipeline([("lasso", LogisticRegression())])

ridge_pipe = Pipeline([("ridge", RidgeClassifier())])

linear_pipe = Pipeline(
    [
        ("linear", LinearDiscriminantAnalysis()),
    ]
)
svm_pipe = Pipeline(
    [
        ("svm", SVC()),
    ]
)
logistic_pipe = Pipeline(
    [
        ("logistic", LogisticRegression()),
    ]
)
decision_pipe = Pipeline(
    [
        ("decision", DecisionTreeClassifier()),
    ]
)
random_pipe = Pipeline([("random", RandomForestClassifier())])

knn_params = {
    "knn__n_neighbors": range(4, int(np.sqrt(len(data)))),
    "knn__weights": ["uniform", "distance"],
    "knn__leaf_size": [15, 20],
    "knn__algorithm": ["ball_tree", "kd_tree", "brute", "auto"],
}

ada_params = {
    "ada__n_estimators": range(10, 100, 10),
    "ada__learning_rate": np.linspace(0.1, 1.1, 11),
    "ada__algorithm": ["SAMME", "SAMME.R"],
}

boost_params = {
    "boost__n_estimators": range(10, 100, 10),
    "boost__learning_rate": np.linspace(0, 1, 11),
    "boost__loss": ["log_loss", "exponential"],
    "boost__criterion": ["friedman_mse", "squared_error"],
    "boost__max_features": ["sqrt", "log2"],
}

lasso_params = {
    "lasso__penalty": ["l1"],
    "lasso__C": np.linspace(0.1, 1.1, 11),
    "lasso__solver": [
        "liblinear",
        "saga",
    ],
}

ridge_params = {
    "ridge__alpha": np.linspace(0.1, 1.1, 11),
    "ridge__solver": [
        "auto",
        "svd",
        "cholesky",
        "lsqr",
        "sparse_cg",
        "sag",
        "saga",
    ],
}

linear_params = {
    "linear__solver": ["svd", "lsqr", "eigen"],
    "linear__shrinkage": ["auto", None],
}

logistic_params = {
    "logistic__penalty": ["l2", None],
    "logistic__C": np.linspace(0.1, 1.1, 11),
    "logistic__solver": [
        "liblinear",
        "newton-cg",
        "newton-cholesky",
        "sag",
        "saga",
    ],
}

svm_params = {
    "svm__kernel": ["linear", "poly", "rbf", "sigmoid"],
    "svm__degree": [2, 3, 4],
    "svm__gamma": ["scale", "auto"],
    "svm__C": [0.1, 0.5, 1.0],
    "svm__probability": [True],
}

decision_params = {
    "decision__criterion": ["gini", "entropy", "log_loss"],
    "decision__splitter": ["best", "random"],
    "decision__max_features": ["sqrt", "log2"],
}

random_params = {
    "random__n_estimators": range(10, 100, 10),
    "random__criterion": ["gini", "entropy", "log_loss"],
    "random__max_features": ["sqrt", "log2"],
}

params = [
    knn_params,
    ada_params,
    boost_params,
    linear_params,
    svm_params,
    logistic_params,
    decision_params,
    random_params,
    ridge_params,
    lasso_params,
]

pipeline = [
    knn_pipe,
    ada_pipe,
    boost_pipe,
    linear_pipe,
    svm_pipe,
    logistic_pipe,
    decision_pipe,
    random_pipe,
    ridge_pipe,
    lasso_pipe,
]

# for each pipeline, perform grid search
for pipe, param in zip(pipeline, params):
    grid = GridSearchCV(pipe, param, cv=5, n_jobs=-1, verbose=1)
    grid.fit(X_train, y_train)
    name = list(pipe.named_steps.keys())[0]
    pred = grid.predict(X_test)

    report = classification_report(y_test, pred, zero_division=0)

    with open("logs/supervised.log", "a") as f:
        f.write(
            f"\nName: {name} \nScore: {grid.score(X_test, y_test)} \nBest Params: {grid.best_params_} \n"
        )
        f.write(report)
        f.write("\n\n")

Fitting 5 folds for each of 64 candidates, totalling 320 fits
Fitting 5 folds for each of 198 candidates, totalling 990 fits
Fitting 5 folds for each of 792 candidates, totalling 3960 fits


1980 fits failed out of a total of 3960.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
1584 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/dpapp/Projects/emse-mms/emse-mms/venv/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/dpapp/Projects/emse-mms/emse-mms/venv/lib/python3.10/site-packages/sklearn/pipeline.py", line 406, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "/Users/dpapp/Projects/emse-mms/emse-mms/venv/lib/python3.10/site-packages/sklearn/ensemble/_gb.py", line 444, in fit
    self._check_params()
  File "/Users/dpapp/Projects/emse-mms/emse-mms

Fitting 5 folds for each of 6 candidates, totalling 30 fits


10 fits failed out of a total of 30.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/dpapp/Projects/emse-mms/emse-mms/venv/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/dpapp/Projects/emse-mms/emse-mms/venv/lib/python3.10/site-packages/sklearn/pipeline.py", line 406, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "/Users/dpapp/Projects/emse-mms/emse-mms/venv/lib/python3.10/site-packages/sklearn/discriminant_analysis.py", line 615, in fit
    raise NotImplementedError("shrinkage not supported with 'svd' solv

Fitting 5 folds for each of 72 candidates, totalling 360 fits
Fitting 5 folds for each of 110 candidates, totalling 550 fits


Further options are to use another solver or to avoid such situation in the first place. Possible remedies are removing collinear features of X or increasing the penalization strengths.
The original Linear Algebra message was:
Ill-conditioned matrix (rcond=3.10264e-24): result may not be accurate.
Further options are to use another solver or to avoid such situation in the first place. Possible remedies are removing collinear features of X or increasing the penalization strengths.
The original Linear Algebra message was:
Ill-conditioned matrix (rcond=9.39447e-25): result may not be accurate.
Further options are to use another solver or to avoid such situation in the first place. Possible remedies are removing collinear features of X or increasing the penalization strengths.
The original Linear Algebra message was:
Ill-conditioned matrix (rcond=4.89127e-24): result may not be accurate.
Further options are to use another solver or to avoid such situation in the first place. Possible remed

Fitting 5 folds for each of 12 candidates, totalling 60 fits
Fitting 5 folds for each of 54 candidates, totalling 270 fits
Fitting 5 folds for each of 77 candidates, totalling 385 fits
Fitting 5 folds for each of 22 candidates, totalling 110 fits
