In [None]:
import pickle
import os
import pandas as pd
from datasets import Dataset
import json
import matplotlib.pyplot as plt
from tqdm.auto import tqdm
from torch.optim import AdamW
from transformers import get_scheduler
from sklearn.metrics import confusion_matrix, classification_report

def import_data(importDirectory, train_or_valid, spacy_tokenizer_name):
    """Load every matching pickle file found directly inside a directory.

    A file is loaded when all of the following hold:
      * the text before its first underscore equals ``train_or_valid``;
      * ``spacy_tokenizer_name`` occurs somewhere in the file name;
      * its extension is ``.pkl``.

    Args:
        importDirectory: Directory to scan (sub-directories are not searched).
        train_or_valid: Required file-name prefix, e.g. ``'train3'``.
        spacy_tokenizer_name: Substring that must appear in the file name.

    Returns:
        A list holding one unpickled object per matching file, ordered by
        file name.
    """
    export_list = []
    # os.listdir returns entries in arbitrary order; sorting keeps the order
    # of the loaded data reproducible between runs and machines.
    for file in sorted(os.listdir(importDirectory)):
        full_path = os.path.join(importDirectory, file)
        if not os.path.isfile(full_path):
            continue
        if file.split('_')[0] != train_or_valid:
            continue
        if spacy_tokenizer_name not in file:
            continue
        # Test the real extension: looking at the second dot-separated piece
        # misses names with extra dots ("x_v1.0.pkl") and wrongly accepts
        # names such as "x.pkl.bak".
        if os.path.splitext(file)[1] != '.pkl':
            continue
        # NOTE: pickle.load can execute arbitrary code; only point this at
        # directories whose contents are trusted.
        with open(full_path, 'rb') as f:
            export_list.append(pickle.load(f))
    return export_list

In [None]:
# Path to the folder containing the output of BRAT_Parser.
# TODO: replace this placeholder with the real location before running the notebook.
import_path = "path/to/BRAT_Parser_output"
# data is a list with a length equal to the number of folders that were iterated through.
# data[n] is a list of length x, where x is the number of files present in the nth folder.
# data[n][x] is a dictionary with keys 'ADE_strings', 'noADE_strings', 'num_Multi_Token_ADE_Relatons', ...


# The data will be handled via the Hugging Face Datasets library.
# It should end up as a dataset dict: {"train": ["string", "label", "idx"], "verification": ["string", "label", "idx"], "test": [...]}
# First, cast it to a pandas DataFrame of shape ["label"]["string" ...].
def load_data(import_path, train_or_valid, spacy_tokenizer_name):
    """Build a labelled sentence DataFrame from the pickles found by ``import_data``.

    Each object returned by ``import_data`` is indexed at ``[0]``; every
    element of that entry is read as a dict whose ``'ADE_strings'`` and
    ``'noADE_strings'`` values are iterables of dicts, and the ``'string'``
    value of each of those dicts is collected.

    Args:
        import_path: Directory passed through to ``import_data``.
        train_or_valid: File-name prefix passed through to ``import_data``.
        spacy_tokenizer_name: Tokenizer name passed through to ``import_data``.

    Returns:
        A DataFrame with a ``'string'`` column and an integer ``'label'``
        column (1 for ADE strings, 0 for noADE strings). All ADE rows come
        first, followed by all noADE rows.
    """
    data = import_data(import_path, train_or_valid, spacy_tokenizer_name)

    ADE_Strings = []
    noADE_Strings = []
    for file_content in data:
        for record in file_content[0]:
            ADE_Strings.extend(item.get('string') for item in record.get('ADE_strings'))
            noADE_Strings.extend(item.get('string') for item in record.get('noADE_strings'))

    # Build both columns in one go so the labels are integers from the start,
    # instead of a float column filled in through positional .loc slices.
    labels = [1] * len(ADE_Strings) + [0] * len(noADE_Strings)
    dataset = pd.DataFrame({'string': ADE_Strings + noADE_Strings, 'label': labels})
    # Explicit cast keeps the dtype stable even when no rows were found.
    return dataset.astype({'label': 'int64'})

#note: train2/dev2 names represent files that only treat the Problem events in ADE tags as ADE sentences, not both Problem AND Drug

def _print_split_summary(split_name, frame):
    """Print the total row count and the per-class row counts of one split."""
    label_counts = frame['label'].value_counts()
    print(f"Total length of {split_name} dataframe:")
    print(len(frame))
    print("Length of class 1, ADE")
    print(label_counts.get(1))
    print("Length of  class 2, noADE")
    print(label_counts.get(0))


def _first_sentence_with_label(frame, label):
    """Return the first sentence carrying ``label``, or None when there is none."""
    matches = frame.loc[frame['label'] == label, 'sentence']
    return matches.iloc[0] if len(matches) else None


# The label column is cast to int explicitly. Without the cast, training
# results in a dimensional mismatch:
# https://discuss.huggingface.co/t/valueerror-target-size-torch-size-8-must-be-the-same-as-input-size-torch-size-8-8/12133/2
train_dataframe = (
    load_data(import_path, 'train3', "en_core_sci_md")
    .astype({'label': int})
    .rename(columns={'string': 'sentence'})
)
valid_dataframe = (
    load_data(import_path, 'test3', "en_core_sci_md")
    .astype({'label': int})
    .rename(columns={'string': 'sentence'})
)

display(train_dataframe)
display(valid_dataframe)

_print_split_summary("train", train_dataframe)
_print_split_summary("dev", valid_dataframe)

# Pick the examples by label rather than by a hard-coded row number, so this
# cell keeps working whatever the size of the training set is.
print("\n\nADE Example")
print(_first_sentence_with_label(train_dataframe, 1))
print('--------')
print("noADE Example")
print(_first_sentence_with_label(train_dataframe, 0))

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

# Alias the prepared frames under the names used by the modelling cells.
train_data, test_data = train_dataframe, valid_dataframe

# Target vectors: the integer class label of every sentence.
y_train, y_test = train_data['label'], test_data['label']

# TF-IDF features: the vectorizer is fitted on the training sentences only,
# and the fitted vectorizer is then applied unchanged to the test sentences.
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(train_data['sentence'])
X_test = vectorizer.transform(test_data['sentence'])

In [None]:
# Experiment note: c = 10, balanced = default, f1 = .49
# Build the SVM and fit it on the training features in one step
# (a different kernel can be chosen depending on the data).
svm_classifier = SVC(
    kernel='sigmoid',
    C=10,
    class_weight={0: 1, 1: 5},
).fit(X_train, y_train)

# Predict labels for the held-out sentences.
predictions = svm_classifier.predict(X_test)

# Score the predictions against the true labels.
accuracy = accuracy_score(y_test, predictions)
classification_report_result = classification_report(y_test, predictions, digits=3)

# Report: kernel name, overall accuracy, then the per-class breakdown.
print(
    "SIGMOID",
    f'Accuracy: {accuracy}',
    'Classification Report:',
    classification_report_result,
    sep='\n',
)


In [None]:
def export_data(data, path):
    """Pickle ``data`` to the file ``path + ".pkl"``.

    The file is created if it does not exist and overwritten if it does.

    Args:
        data: Any picklable object.
        path: Destination path without the ``.pkl`` extension.
    """
    # Opening in "wb" mode already creates/truncates the file, and the
    # context manager closes it even if pickling fails.
    with open(path + ".pkl", "wb") as f:
        pickle.dump(data, f)

# Save the SVM predictions together with the true test labels.
export_data(
    data={'predictions': predictions, 'ground_truth': y_test},
    path="SVM_predictions",
)