Construct decision tree from calculations

In [3]:
# CSV inputs, one per analysis section of this notebook (DFA, AFA, RRA).
# Paths are relative to the notebook's working directory.
h_df_filename_dfa = 'h_df_dfa.csv'
h_df_filename_afa = 'h_df_afa.csv'
h_df_filename_rra = 'h_df_rra.csv'

In [4]:
import os

def load_file(filename):
    """Read a CSV file into a DataFrame.

    Parameters
    ----------
    filename : str or path-like
        Location of the CSV file.

    Returns
    -------
    pandas.DataFrame or None
        The parsed table, or ``None`` when ``filename`` does not exist.
    """
    # Guard clause: a missing path yields None rather than an exception.
    if not os.path.exists(filename):
        return None
    return pd.read_csv(filename)

In [5]:
import pandas as pd

def transform_df_features(df):
    """Flatten a wide table into single-value samples labelled by column name.

    Every cell of ``df`` becomes one sample: the cell value is the only
    feature and the name of the column it came from is the label. Samples
    are emitted column by column, top to bottom within each column.

    Parameters
    ----------
    df : pandas.DataFrame
        Table whose cells are to be turned into samples.

    Returns
    -------
    (pandas.DataFrame, pandas.Series)
        A one-column frame named ``'value'`` holding the cell values, and a
        series named ``'label'`` holding the matching column names.
    """
    # (cell, column) pairs in column-major order.
    samples = [(cell, column) for column in df.columns for cell in df[column]]

    features_df = pd.DataFrame([[cell] for cell, _ in samples], columns=['value'])
    labels_df = pd.Series([column for _, column in samples], name='label')
    return features_df, labels_df

In [6]:
from sklearn.preprocessing import LabelEncoder

def column_enc(labels_df):
    """Encode labels as integers with a freshly fitted ``LabelEncoder``.

    Parameters
    ----------
    labels_df : array-like
        Labels to encode.

    Returns
    -------
    (LabelEncoder, numpy.ndarray)
        The fitted encoder (usable later to map integers back to labels) and
        the integer-encoded labels.
    """
    encoder = LabelEncoder()
    encoded = encoder.fit_transform(labels_df)
    return encoder, encoded

In [7]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.multiclass import OneVsOneClassifier
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.svm import SVC, LinearSVC
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score

def construct_svm(features_df, numeric_labels):
    """Grid-search a one-vs-one RBF SVM and report its hold-out accuracy.

    The data is split 70/30 (``random_state=0``). Each pairwise classifier is
    a pipeline of ``StandardScaler`` followed by an RBF ``SVC``; ``C`` and
    ``gamma`` of the SVC are tuned with 5-fold cross-validation on the
    training part only. The best parameters and the test accuracy are printed.

    Parameters
    ----------
    features_df : array-like
        Feature matrix.
    numeric_labels : array-like
        Integer class labels aligned with ``features_df``.

    Returns
    -------
    tuple
        ``(best_ovo_classifier, X_test, y_test)``: the refitted best estimator
        found by the grid search and the held-out test split.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        features_df, numeric_labels, test_size=0.3, random_state=0
    )

    # Scaling lives inside the pipeline so it is re-fitted on every CV fold
    # and never sees validation data.
    pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('svc', SVC(kernel='rbf')),
    ])

    ovo_classifier = OneVsOneClassifier(pipeline)

    # 'estimator__' addresses the pipeline wrapped by OneVsOneClassifier,
    # 'svc__' the SVC step inside that pipeline.
    param_grid = {
        'estimator__svc__C': [0.01, 0.05, 0.1, 0.5, 1],
        'estimator__svc__gamma': [1e-3, 1e-2, 1e-1, 1],
    }

    grid_search = GridSearchCV(ovo_classifier, param_grid, cv=5)
    grid_search.fit(X_train, y_train)

    print(f"Best parameters: {grid_search.best_params_}")

    best_ovo_classifier = grid_search.best_estimator_
    y_pred = best_ovo_classifier.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    print(f'Accuracy: {accuracy:.2f}')

    return best_ovo_classifier, X_test, y_test

In [31]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.multiclass import OneVsOneClassifier
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.svm import SVC, LinearSVC
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score

def construct_dt(features_df, numeric_labels):
    """Grid-search a one-vs-one decision tree and report its hold-out accuracy.

    The data is split 70/30 (``random_state=0``). A ``DecisionTreeClassifier``
    wrapped in ``OneVsOneClassifier`` is tuned with 5-fold cross-validation on
    the training part only. The best parameters and the test accuracy are
    printed.

    Parameters
    ----------
    features_df : array-like
        Feature matrix.
    numeric_labels : array-like
        Integer class labels aligned with ``features_df``.

    Returns
    -------
    tuple
        ``(best_ovo_classifier, X_test, y_test)``: the refitted best estimator
        found by the grid search and the held-out test split.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        features_df, numeric_labels, test_size=0.3, random_state=0
    )

    dt = DecisionTreeClassifier(random_state=42)

    ovo_classifier = OneVsOneClassifier(dt)

    # 'estimator__' addresses the tree wrapped by OneVsOneClassifier.
    param_grid = {
        'estimator__criterion': ['gini', 'entropy'],
        'estimator__max_depth': [None, 10, 20],
        'estimator__min_samples_split': [2, 5, 10],
        'estimator__min_samples_leaf': [1, 2, 5],
        'estimator__max_features': ['sqrt', 'log2'],
    }

    grid_search = GridSearchCV(ovo_classifier, param_grid, cv=5, verbose=1)
    grid_search.fit(X_train, y_train)

    print("Best parameters found by GridSearchCV:")
    print(grid_search.best_params_)
    print()

    best_ovo_classifier = grid_search.best_estimator_
    y_pred = best_ovo_classifier.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    print(f'Accuracy: {accuracy:.2f}')

    # Return the model that was actually tuned and evaluated here. The
    # previous `ovo_clf` was never defined in this function, so it either
    # raised NameError or handed back a stale global from an earlier cell.
    return best_ovo_classifier, X_test, y_test

DFA

In [33]:
# DFA: load the table and drop rows with missing values.
df = load_file(h_df_filename_dfa)

# dropna() returns a new frame; assign it back or the NaN rows stay in `df`.
df = df.dropna()

# Reshape to (value, column-label) samples, encode the labels, fit the tree.
features_df, labels_df = transform_df_features(df)
label_encoder, numeric_labels = column_enc(labels_df)
ovo_clf, X_test, y_test = construct_dt(features_df, numeric_labels)

Fitting 5 folds for each of 108 candidates, totalling 540 fits
Best parameters found by GridSearchCV:
{'estimator__criterion': 'entropy', 'estimator__max_depth': 10, 'estimator__max_features': 'sqrt', 'estimator__min_samples_leaf': 1, 'estimator__min_samples_split': 5}

Accuracy: 0.13


AFA

In [34]:
# AFA: load the table and drop rows with missing values.
df = load_file(h_df_filename_afa)

# dropna() returns a new frame; assign it back or the NaN rows stay in `df`.
df = df.dropna()

# Reshape to (value, column-label) samples, encode the labels, fit the tree.
features_df, labels_df = transform_df_features(df)
label_encoder, numeric_labels = column_enc(labels_df)
ovo_clf, X_test, y_test = construct_dt(features_df, numeric_labels)

Fitting 5 folds for each of 108 candidates, totalling 540 fits
Best parameters found by GridSearchCV:
{'estimator__criterion': 'gini', 'estimator__max_depth': 10, 'estimator__max_features': 'sqrt', 'estimator__min_samples_leaf': 2, 'estimator__min_samples_split': 5}

Accuracy: 0.13


RRA

In [35]:
# RRA: load the table, keep only rows without missing values, and preview it.
df = load_file(h_df_filename_rra).dropna()
print(df.head())

# Reshape to (value, column-label) samples, encode the labels, fit the tree.
features_df, labels_df = transform_df_features(df)
label_encoder, numeric_labels = column_enc(labels_df)
ovo_clf, X_test, y_test = construct_dt(features_df, numeric_labels)

   pos_gpt3  wiki_pos_gpt3  pos_gpt4  wiki_pos_gpt4  pos_orca  wiki_pos_orca  \
0  0.568857       0.625814  0.648996       0.445265  0.577301       0.653786   
1  0.513762       0.454937  0.585142       0.487270  0.706383       0.619046   
2  0.625112       0.475646  0.616498       0.551816  0.608498       0.596052   
3  0.542602       0.506067  0.631895       0.608186  0.517663       0.553370   
5  0.599746       0.534298  0.529228       0.554712  0.558618       0.593527   

   pos_falcon  wiki_pos_falcon  pos_llama  wiki_pos_llama  pos_llama3  \
0    0.574087         0.670758   0.462178        0.533370    0.615364   
1    0.644570         0.590531   0.619429        0.660133    0.505824   
2    0.632380         0.579613   0.614589        0.679210    0.525707   
3    0.584186         0.622216   0.651382        0.449492    0.549626   
5    0.503855         0.580111   0.578636        0.475438    0.539931   

   wiki_pos_llama3  text_pos  bart_pos  gensim_pos  
0         0.642561  0.57415