In [2]:
import numpy as np
import sklearn as sk
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.datasets import load_iris
from sklearn import svm
from nltk.tokenize import sent_tokenize, word_tokenize
import gensim
from gensim.models import fasttext
import shap
import re


from src.dataset.llm_classifier_dataset import LLMClassifierDatabase

ModuleNotFoundError: No module named 'src'

In [None]:
#returns predictions on test data using LogReg model
def generateLogisticRegression(data, classes, predictMe):
    model = LogisticRegression(penalty='l2', max_iter = 250).fit(data, classes)
    return model.predict(predictMe)
    
#returns predictions on test data using Naive Bayes model
def generateNaiveBayes(data, classes, predictMe):
    model = GaussianNB().fit(data,classes)
    return model.predict(predictMe)
    
#prec, f1, recall, aucroc, returns all 4 in tuple
def runMetrics(predicted, actual):
    prec = sk.metrics.precision_score(actual, predicted)
    f1 = sk.metrics.f1_score(actual, predicted)
    recall = sk.metrics.recall_score(actual, predicted)
    aucroc = sk.metrics.roc_auc_score(actual, predicted)
    print("Precision score of the model:", prec)
    print("F1 score of the model", f1)
    print("Recall score of the model", recall)
    print("AUCROC of the model", aucroc)
    return (prec, f1, recall, aucroc)

#to test the iris data set, wont be used on real dataset
def runMetricsMulticlass(predicted, actual):
    prec = sk.metrics.precision_score(actual, predicted, average = 'micro')
    f1 = sk.metrics.f1_score(actual, predicted, average = 'micro')
    recall = sk.metrics.recall_score(actual, predicted, average = 'micro')
    print("Precision score of the model:", prec)
    print("F1 score of the model", f1)
    print("Recall score of the model", recall)
    return [prec, f1, recall]

In [3]:
model_path = 'C:/Users/alex/Documents/MLFolder/wiki-news-300d-1M.vec'
model = gensim.models.KeyedVectors.load_word2vec_format(model_path, binary=False)

#data should be [[[sent vect][sent vect]],[list of sent in response],[list of sent in response],response,response], 
# where n is number of responses, X1 is number of sentences in response, X2 is word vector
def runfasttext(data):
    fasttextout = []#will need to be an array :*(
    for i in range(len(data)):
        fasttextout.append(re.split('.?!', data[i]))#split across sentences
        for j in range(len(fasttextout[i])):
            fasttextout[i][j] = model.infer_vector(fasttextout[i][j])
    return fasttextout
    
#data should be [[[sent vect][sent vect]],[list of sent in response],[list of sent in response],response,response], 
#where n is number of responses, X1 is number of sentences in response, X2 is word vector
#result of this should hopefully standardize the number of sent vect per response, 
#so the array is (n,max_seq_length,gensim.model.vector_size)
def padInput(data):   
    max_seq_length = max(len(seq) for seq in data)
    padData = np.array([])
    for i in range(len(data)):
        #amount of missing sentence vectors
        fixedColumn = np.array(data[i]).reshape(-1,len(data[i][0]))
        
        #we are padding j times, so that len(fixedColumn) = max_seq
        for j in range(max_seq_length - len(data[i])):
            fixedColumn = np.append(np.array(data[i]),np.zeros(gensim.model.vector_size)).reshape(-1,gensim.model.vector_size)
        padData = np.append(padData, fixedColumn).reshape(-1,len(fixedColumn),gensim.model.vector_size)
    return padData


dbdata = LLMClassifierDataset(db_path="path/to/database/file.db", load_to_memory=True, vectorize=True)
dbdata = dbdata.tolist()
allData = np.array()
allLabels = np.array()
for i in range(len(dbdata)):
    allData = np.append(allData,dbdata[i][0])
    allLabels = np.append(allLabels,dbdata[i][1])
    
#this should make a marco and llm dataset
#will have N elements, each element will be the prompt string with the answer strong attatched at the end

allData = fasttext(allData)
allData = padInput(allData)
p = np.random.permutation(len(a))
allData = allData[p]
allLabels = allLabels[p]
trainData = allData[0:4*len(allData)/5]
trainLabels = allLabels[0:4*len(allData)/5]
testData = allData[4*len(allData)/5:len(allData)]
testLabels = allLabels[4*len(allData)/5:len(allData)]

predClassLog = generateLogisticRegression(trainData,trainLabels,testData)
predClassBayes = generateNaiveBayes(trainData,trainLabels,testData)
runMetrics(predClassLog,testLabels)
runMetrics(predClassBayes,testLabels)

NameError: name 'LLMClassifierDataset' is not defined

In [9]:
testing = np.array([[[2,7],[9,1],[1,3]],[[5,6],[6,1],[1,1]],[[1,3],[2,4],[4,8]],[[2,10],[9,11],[11,3]]])
besting = np.array([])
for i in range(len(testing)):
    fixedColumn = np.array(testing[i]).reshape(-1,len(testing[i][0]))
    fixedColumn = np.append(fixedColumn,np.array([0,0])).reshape(-1,len(testing[0][0]))
    # print(fixedColumn)
    # print("-------")
    besting = np.append(besting, fixedColumn).reshape(-1,len(fixedColumn),len(testing[0][0]))

print(testing[0:2])
print("")
print(testing[2:4])


[[[2 7]
  [9 1]
  [1 3]]

 [[5 6]
  [6 1]
  [1 1]]]

[[[ 1  3]
  [ 2  4]
  [ 4  8]]

 [[ 2 10]
  [ 9 11]
  [11  3]]]


In [None]:
#Iris data to make sure models do stuff
xData, yData = load_iris(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(xData, yData)#once data is built

(X_train, X_test) = padInput(X_train, X_test)

In [None]:
#predict, then run metrics
predsLogReg = generateNaiveBayes(X_train, y_train, X_test)
predsBayes = generateLogisticRegression(X_train, y_train, X_test)
print(predsLogReg)
print(predsBayes)
print(y_test)
runMetricsMulticlass(predsLogReg, y_test)
runMetricsMulticlass(predsBayes, y_test)

In [None]:
#Creating custom data set with 2 classes to test shap values. Naive bayes and log reg will struggle as 
#this dataset is completlty random :)

n = 1000
genData = np.random.rand(n, 8)
genClasses = (np.random.rand(n,)+.5).astype(int)/1

X_gen_train, X_gen_test, y_gen_train, y_gen_test = train_test_split(genData, genClasses)

predsGenLogReg = generateNaiveBayes(X_gen_train, y_gen_train, X_gen_test)
predsGenBayes = generateLogisticRegression(X_gen_train, y_gen_train, X_gen_test)

runMetrics(predsGenLogReg, y_gen_test)#should preform poorly since the data is randomized
print("")
runMetrics(predsGenBayes, y_gen_test)#should preform poorly since the data is randomized

In [None]:
logRegModel = LogisticRegression(penalty='l2', max_iter = 250).fit(X_gen_train, y_gen_train)
explainer = shap.LinearExplainer(logRegModel, X_gen_train)
shap_values = explainer.shap_values(X_gen_train)
bapValues = explainer(X_gen_test)
    
shap.initjs()
shap.plots.beeswarm(bapValues) #doesn't seem to work with multiclass data. Shouldn't be a problem for our application

In [None]:
marcoData = np.random.rand(500, 4) + .3 #placeholder. Will be replaced with vectorized marco responses
LLMData = np.random.rand(500, 4)#placeholder. Will be replaced with vectorized LLM resposnes

#------
#wrangling of MARCO data will be here

#------
#wrangling of LLM data will be here

marcoLabels = np.zeros(len(marcoData))
LLMLabels = np.ones(len(LLMData))

realModel = (np.append(marcoData, LLMData)).reshape(len(marcoData)+len(LLMData), len(marcoData[0]))
realClasses = np.append(marcoLabels, LLMLabels)

p = np.random.permutation(len(realModel)) #give it a good shuffle :D
realModel = realModel[p]
realClasses = realClasses[p]

X_response_train, X_response_test, y_origin_train, y_origin_test = train_test_split(realModel, realClasses)

predictedLogReg = generateLogisticRegression(X_response_train,y_origin_train,X_response_test)
predictedBayes = generateNaiveBayes(X_response_train,y_origin_train,X_response_test)

#should be able to distinguish them decently well. Some data overlap will prevent it from getting super high
#but it should realize higher values = fake marco dataset, lower values = fake llm dataset
print("logistic metrics on real dataset:")
runMetrics(y_origin_test,predictedLogReg)
print("\naive bayes on real dataset:")
runMetrics(y_origin_test,predictedBayes)
