# Classical Approach

In [22]:
import pandas as pd
from fetch import *
from collections import Counter
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.feature_extraction.text import TfidfVectorizer

## Load Data

In [7]:
df = fetch_data('function', 'mondo_names')

In [23]:
# Assuming df is your DataFrame
all_labels = [label for sublist in df['mondo_names'] for label in sublist]
label_counts = Counter(all_labels)
top_labels = [label for label, count in label_counts.most_common(10)]

mlb = MultiLabelBinarizer(classes=top_labels)
Y = mlb.fit_transform(df['mondo_names'])

# Convert 'function' column to a numeric format using TF-IDF
tfidf_vectorizer = TfidfVectorizer(max_features=1000, stop_words='english')
X = tfidf_vectorizer.fit_transform(df['function'])

# Split data into training, validation, and testing sets
X_train, X_temp, Y_train, Y_temp = train_test_split(X, Y, test_size=0.3, random_state=42)
X_val, X_test, Y_val, Y_test = train_test_split(X_temp, Y_temp, test_size=0.5, random_state=42)



## Model Training

In [26]:
# Define and train the model
model = OneVsRestClassifier(SVC(kernel='linear', probability=True, class_weight='balanced'), n_jobs=1)
model.fit(X_train, Y_train)

## Evaluation

In [27]:
# Predict the labels on the validation set
Y_pred = model.predict(X_val)

# Calculate metrics
accuracy = accuracy_score(Y_val, Y_pred)
precision = precision_score(Y_val, Y_pred, average='samples')  # 'samples' gives a measure per sample
recall = recall_score(Y_val, Y_pred, average='samples')        # 'samples' or 'micro' could be used based on your specific needs
f1 = f1_score(Y_val, Y_pred, average='samples')                # Adjust 'average' as needed

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)

Accuracy: 0.4571013590621434
Precision: 0.6012692319726244
Recall: 0.6287879349270572
F1 Score: 0.5870945248597618


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
