# SETUP

In [1]:
from requests import get
import requests
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import datetime
from IPython.display import HTML
import pyodbc
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from pandas.io.json import json_normalize
import time

In [3]:

casebase=list()
#Function used retrieve relavant predictions for a given evaluation case. We call the prediction scores "similarity" for reusability reasons. 
#Prediction instances are filtered to ensure that they are relevant to the given case.
def retrieve_c_att22(case):
    global casebase
    cases=casebase
    cases=cases[cases["IndustrySubgroupCode"].astype(float)==float(case["IndustrySubgroupCode"])]
    cases=cases[cases["MunicipalityNumber"].astype(int)==int(case["MunicipalityNumber"])]
    results = cases.apply(pd.to_numeric, errors='coerce').fillna(cases).sort_values(by='similarity', ascending=False)
    results['similarity'] = results['similarity'].astype(float)
    results['NonCompliance'] = results['NonCompliance'].astype(int)
    return results


In [4]:
casebase=list()
initCBCopy=casebase
currentCBCopy=casebase

#Fill casebase in case its empty
dataZero=()
def readData():
    global dataZero
    cnxn = pyodbc.connect('DRIVER={SQL Server Native Client 11.0};SERVER=localhost;DATABASE=IJCAI2022;Trusted_Connection=yes;')
    cursor = cnxn.cursor()
    dataZero=pd.read_sql("SELECT  * "+"FROM [IJCAI2022].[dbo].[BayesianDynamicChecklistLocalDb]"+
                      "left join (select NewID() as new, InspectionId as inspID from [IJCAI2022].[dbo].[BayesianDynamicChecklistLocalDb] group by InspectionId) as a on a.inspId=InspectionId"+
                      " where InspectionDateId<20190601 ORDER BY new",cnxn)
    #Query selects items in random order (via the newID method), grouped by the inspectionIDs. 
    #We are doing crossvalidation with random split based on the indices of the randomly retrieved inspection ids. 
    #Thus, we are evaluating outcomes of inspections. This method also prevents data bleed/leakage.

def FindBestParameters():
    global casebase
    global data00
    # Specifying the ODBC driver, server name, database, etc. directly
    #traindateid=" where (InspectionDateId<"+str(year)+"0101 or InspectionDateId>="+str(year+1)+"0101) and InspectionDateId<20190601"
    #testdateid=" where InspectionDateId>="+str(year)+"0101 and InspectionDateId<"+str(year+1)+"0101 and InspectionDateId<20190601"
    cnxn = pyodbc.connect('DRIVER={SQL Server Native Client 11.0};SERVER=localhost;DATABASE=IJCAI2022;Trusted_Connection=yes;')
    cursor = cnxn.cursor()
    data0=pd.read_sql("SELECT  * "+"FROM [IJCAI2022].[dbo].[BayesianDynamicChecklistLocalDb]",cnxn)
    data=pd.DataFrame(data={'NonCompliance':data0['NonCompliance'],'IndustrySubgroupCode':data0['IndustrySubgroupCode'],'ControlPointText': data0["ControlPointText"],"InspectionDateId":data0["InspectionDateId"],"Municipality":data0["Municipality"]})
    #datatestout=data0.loc[round((len(data0)/8)*year):round((len(data0)/8)*(year+1))]
    #print(data)
    #cat_vars=['ControlPointText','IndustryMainAreaCode','IndustrySubgroupCode','OrganisationAge','OrganisationSize','Municipality']
    cat_vars=['ControlPointText','IndustrySubgroupCode',"Municipality"]
    for var in cat_vars:
        cat_list='var'+'_'+var
        cat_list = pd.get_dummies(data[var], prefix=var)
        #print(cat_list)
        data1=data.join(cat_list)
        data=data1
         #checklist.loc[checklist["AntallBrudd"]==-1,"AntallBrudd"]=0
    cat_vars=['ControlPointText','IndustrySubgroupCode',"Municipality"]
    data_vars=data.columns.values.tolist()
    to_keep=[i for i in data_vars if i not in cat_vars]
    #print(to_keep)
    data=data[to_keep]
    cat_vars=['NonCompliance']
    data_vars=data.columns.values.tolist()
    to_keep=[i for i in data_vars if i not in cat_vars]
    data2=data[to_keep]


    data.loc[np.isnan(data['NonCompliance']),"NonCompliance"]=0
    data.loc[data['NonCompliance']<0,"NonCompliance"]=0
    data.loc[data['NonCompliance']>1,"NonCompliance"]=1

    y=data["NonCompliance"]
    data=data.drop(columns=["NonCompliance"])

    data=data.drop(columns=["InspectionDateId"])

    mlp_gs = RandomForestClassifier()

    parameter_space = {
        'n_estimators': [2,10,20,50,100],
        'max_samples': [0.5,1.0],
        'min_samples_split':[2,10]
    }
    
    clf = GridSearchCV(mlp_gs, parameter_space, n_jobs=2, cv=4)
    clf.fit(data,y)

    return clf
 
    
def fillCaseBase(index):
    global casebase
    global dataZero
    # Specifying the ODBC driver, server name, database, etc. directly
    data0=dataZero
    data=pd.DataFrame(data={'NonCompliance':data0['NonCompliance'],'IndustrySubgroupCode':data0['IndustrySubgroupCode'],'ControlPointText': data0["ControlPointText"],"InspectionDateId":data0["InspectionDateId"],"Municipality":data0["Municipality"]})
    datatestout=data0.loc[round((len(data0)/8)*index):round((len(data0)/8)*(index+1))]
    #Select independent variable and convert to one-hot vectors
    cat_vars=['ControlPointText','IndustrySubgroupCode',"Municipality"]
    for var in cat_vars:
        cat_list='var'+'_'+var
        cat_list = pd.get_dummies(data[var], prefix=var)
        data1=data.join(cat_list)
        data=data1
    cat_vars=['ControlPointText','IndustrySubgroupCode',"Municipality"]
    data_vars=data.columns.values.tolist()
    to_keep=[i for i in data_vars if i not in cat_vars]
    data=data[to_keep]


    #Making sure that the target variable is binary
    data.loc[np.isnan(data['NonCompliance']),"NonCompliance"]=0
    data.loc[data['NonCompliance']<0,"NonCompliance"]=0
    data.loc[data['NonCompliance']>1,"NonCompliance"]=1

    #retrieving the test-fold out of the 8 crossvalidation folds (rest are training folds.)
    datatest=data.loc[round((len(data)/8)*index):round((len(data)/8)*(index+1))]
    datatesty=datatest["NonCompliance"]
    datatest=datatest.drop(columns=["NonCompliance"])
    #Retrieving the 7 training folds, leaving the test-fold out.
    datacp=data.copy()
    if index>0 and index <7:                   
        datacp=datacp.loc[0:round((len(datacp)/8)*index)-1]
        datacp=datacp.append(data.loc[round((len(data)/8)*(index+1)):len(data)])
    elif index<1:
        datacp=datacp.loc[round((len(datacp)/8))*(index+1):len(datacp)]
    else:
        datacp=datacp.loc[0:round((len(data)/8)*index)]
    
    data=datacp
    #Creating the target variable, dropping unwanted columns from the training set and training the model.    
    y=data["NonCompliance"]
    data=data.drop(columns=["NonCompliance"])
    data=data.drop(columns=["InspectionDateId"])
    datatest=datatest.drop(columns=["InspectionDateId"])
    clf = RandomForestClassifier(max_samples=0.5, min_samples_split=10,n_estimators=50)
    t=clf.fit(data,y)
    #generate prediction for the test data set. The test data set is used to create and evaluate checklists below.
    tes=clf.predict_proba(datatest)
    tes=(datatestout).assign(similarity=tes[:,1]) #We named the prediction score "similarity" for easy reuse in other scripts.
    tes=tes.assign(NonCompliance=datatesty)
    casebase=tes #Test data set is assigned to "casebase"

# Find the best parameters

In [17]:
start_time = time.time()
clf=FindBestParameters()
traintime=(time.time()-start_time)
print("Training time: "+ str(traintime))

print(clf.best_params_)
print(clf.best_estimator_)
fh2=open("Log_" + "RFChecklistsBestParameters"+ '.txt', 'w+')
fh2.write("Training time: "+ str(traintime))
fh2.write("Best parameters:"+ str(clf.best_params_))
fh2.write("Best estimator:"+ str(clf.best_estimator_))
fh2.close()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)


Training time: 18633.6580452919
{'max_samples': 0.5, 'min_samples_split': 10, 'n_estimators': 50}
RandomForestClassifier(max_samples=0.5, min_samples_split=10, n_estimators=50)


# EXPERIMENT, STATISTICS FOR GENERATED CHECKLISTS

In [5]:

currentCBCopy=initCBCopy
readData()
timetotal=0
traintime=0
traintimetot=0
precctot=0
accctot=0
preccgttot=0
recctot=0
fh2=open("Log_" + "RFChecklists" + '.txt', 'w+')
#for-loop to perform 8-fold crossvalidation. ik is used as an index for the validation folds.
for ik in range(0,8):
    
    start_time = time.time()
    fillCaseBase(ik)
    initCBCopy=casebase
    currentCBCopy=initCBCopy

    traintime=(time.time()-start_time)
    traintimetot=traintimetot+(time.time()-start_time)
    print("Train time: "+str(traintime))
    noncompliance=0 #true positive(for ground truth labels only)
    controlpointsgtcount=0 
    noncompliancengt=0 #true positive (for statistical estimates, see main paper)
    controlpointscount=0 
    truepositiveval=0
    truepositive=0
    falsepositivengt=0
    recallval=0
    accuracyval=0
    precision=0
    precisiongt=0
    lengthprecgt=0
    recall=0
    accuracy=0
    lengthprec=0.00001
    lengthvalprec=0
    lengthrec=0.00001
    lengthvalrec=0
    lengthacc=0.00001
    lengthvalacc=0
    truenegative=0
    falsenegative=0
    similarity=0
    counter=0
    Kcp=15
    pd.set_option('display.max_rows', None)
    pd.set_option('display.max_columns', None)
    pd.set_option('display.width', None)
    pd.set_option('display.max_colwidth', -1)
    noncompliantInspection=0
    inspections=casebase.drop_duplicates(subset = ["InspectionId"])
    cases=casebase
    cases["NonCompliance"]=cases["NonCompliance"].astype(int)
    casestest=len(cases.drop_duplicates(subset = ["ControlPointText"]))
    instanceMatches=list()
    negatives=list()
    inspectionstot=0
    precchecklists=0
    recchecklists=0
    accchecklists=0
    precchecklistsval=0
    recchecklistsval=0
    accchecklistsval=0
    print(casestest)
    #Iterate through all the past inspections in the data set. 
    #"case" contains a single inspection (which in practice consists of a set of instances in the data set).
    for ind, case in inspections.iterrows():
   
        inspectionstot+=1
        uniquechecklistlengthval=0
        instanceMatches=list()
        counter+=1
        if counter>500:
            counter=0
            print(ind)
            


        #Retrieve the existing checklist. ExistingChecklist contains the checklist used in the current inspection.
        existingChecklist=cases[cases['InspectionId']==case['InspectionId']]
        #filter validation instances (cases) so that they match the industry code and municipality of the current inspection.
        filteredCases=cases[cases["IndustrySubgroupCode"].astype(float)==float(case["IndustrySubgroupCode"])]
        filteredCases=filteredCases[filteredCases["MunicipalityNumber"].astype(int)==int(case["MunicipalityNumber"])]
        filteredCases["NonCompliance"]=filteredCases["NonCompliance"].astype(float)
        #Temporary assigning the filtered cases. Another filter is applied further down to filter out positive cases.
        negatives=filteredCases
        if len(filteredCases)<=0:#Fail-safe mechanism. filteredCases should in practice never be 0.
            continue
        
        
        #Retrieve a constructed checklist
        checklist=retrieve_c_att22(case)#Retrieve control points that matches each inspection (input case). checklist=CL from paper

        #Drop any retrieved duplicates and remove select the top (Kcp) items with highest prediction score
        uniqueChecklist=checklist.drop_duplicates(subset = ["ControlPointText"])#Find all unique control points by removing duplicates
        uniqueChecklist=uniqueChecklist.head(Kcp)
        if len(uniqueChecklist)>(0):#Safe check to make sure that somethingis on the checklist.
            truepositive=0
            truepositiveval=0
            similarity+=uniqueChecklist["similarity"].sum() #Sums prediction score (prediction score=similarity)
            #Record statistics. sums up the ratio/fraction of non-compliance for each of the (unique) item in the checklist.
            precpercl=0
            lengthorgcl=0
            #Iterate through all the items on a constructed checklist
            for ind2, generatedChecklistControlpoint in uniqueChecklist.iterrows():#Find overlap between the existing and new generated checklist
                #find the predicted negative records for the constructed checklist. uniqueChecklist contains all the predicted positives.
                negatives=negatives[negatives["ControlPointText"]!=generatedChecklistControlpoint["ControlPointText"]]
                #excp contains the items that can be found on both the existing checklist and the generated checklist
                excp=existingChecklist[existingChecklist["ControlPointText"]==generatedChecklistControlpoint["ControlPointText"]]
                #calculate ground truth precision for if excp contains an checklist item.
                if len(excp)>0:
                    precpercl+=((excp["NonCompliance"].sum())/len(excp["NonCompliance"])) #used to calculate average ground truth precision
                    lengthorgcl+=1 #used to calculate average ground truth precision 
                    controlpointsgtcount+=1 #Used to find average number of items per checklist
                controlpointscount+=1
                #groups instances from the vaildation set by items and selects instances that matches the current item (generatedChecklistControlPoint)
                instancesMatchingCurrentGenClItem=filteredCases.groupby(["ControlPointText"],as_index=False).mean()
                instancesMatchingCurrentGenClItem=instancesMatchingCurrentGenClItem[instancesMatchingCurrentGenClItem["ControlPointText"]==generatedChecklistControlpoint["ControlPointText"]]
                #Finds the total number of instances in the validation set with positive labels.
                summ2=instancesMatchingCurrentGenClItem["NonCompliance"].sum() 
                #Calculate the number of positive instances in the validation set (that matches the item), divided by the total number of instances.
                #the expression summ2/len(instancesMatchingCurrentGenClItem["NonCompliance"]) is a float-number between [0,1]
                if len(instancesMatchingCurrentGenClItem["NonCompliance"])>0:
                    noncompliance+=(summ2/len(instancesMatchingCurrentGenClItem["NonCompliance"]))
                    truepositiveval+=(summ2/len(instancesMatchingCurrentGenClItem["NonCompliance"]))
                #test if there are at least 1 instance in the validation set that matches the current inspection and checklist item.
                matchlen=len(instancesMatchingCurrentGenClItem["NonCompliance"])
                if matchlen>0:
                    lengthvalprec+=1
                    uniquechecklistlengthval+=1
                
                if matchlen==0:#expression is true if validation set record of checklist item does not exist. This expression never comes true for any method other than BCBR/CBCBR.
                    instanceMatches=cases[cases["IndustrySubgroupCode"].astype(float)==float(generatedChecklistControlpoint["IndustrySubgroupCode"])]
                    instanceMatches=instanceMatches[instanceMatches["MunicipalityNumber"].astype(int)==int(generatedChecklistControlpoint["MunicipalityNumber"])]
                    instanceMatches=instanceMatches[instanceMatches["ControlPointText"]==generatedChecklistControlpoint["ControlPointText"]]
                    instanceMatches["NonCompliance"]=instanceMatches["NonCompliance"].astype(float)

                    matchesSum=instanceMatches["NonCompliance"].sum()


                    falsepositivengt+=(len(instanceMatches["NonCompliance"])-matchesSum)
                    if len(instanceMatches["NonCompliance"])>0:
                        noncompliancengt+=(matchesSum/len(instanceMatches["NonCompliance"]))
                        truepositive+=(matchesSum/len(instanceMatches["NonCompliance"]))
                        precision+=(matchesSum/len(instanceMatches["NonCompliance"]))
                        lengthprec+=1
                #End of for-loop
            if lengthorgcl>0:
                precisiongt+=(precpercl/lengthorgcl)
                lengthprecgt+=1      

            #Calculate statistics on checklists level
            uniquenegatives=len(negatives.drop_duplicates(subset = ["ControlPointText"]))
            negativecpy=negatives.copy()
            negativecpy["NonCompliance"]=negativecpy["NonCompliance"].astype(float)
            groupedby=negativecpy.groupby(["ControlPointText"],as_index=False).count()
            groupedbys=negativecpy.groupby(["ControlPointText"],as_index=False).sum()
            groupedby["NonCompliance"]=groupedbys["NonCompliance"]/groupedby["NonCompliance"]
            uniquecalcnegatives=groupedby.drop_duplicates(subset = ["ControlPointText"])
            noncompliancenegatives=uniquecalcnegatives["NonCompliance"].sum()

            falsenegativeelement=0
            if uniquenegatives>0:
                falsenegativeelement=noncompliancenegatives/uniquenegatives #To avoid selection bias effects
            truenegativeelement=1-falsenegativeelement

            truepositivesprchecklistval=(truepositiveval)
            falsepositivesprchecklistval=(uniquechecklistlengthval-truepositivesprchecklistval)

            truepositivesprchecklist=(truepositive+truepositiveval)
            falsepositivesprchecklist=(len(uniqueChecklist)-(truepositive+truepositiveval))
            truenegativesprchecklist=truenegativeelement*uniquenegatives
            falsenegativesprchecklist=falsenegativeelement*uniquenegatives


            precprchecklist=truepositivesprchecklist/len(uniqueChecklist)
            precchecklists+=precprchecklist
            precprchecklistval=0
            if uniquechecklistlengthval>0:
                precprchecklistval=truepositivesprchecklistval/uniquechecklistlengthval
            precchecklistsval+=precprchecklistval

            recprchecklist=0
            if (truepositivesprchecklist+falsenegativesprchecklist)>0:
                recprchecklist=truepositivesprchecklist/(truepositivesprchecklist+falsenegativesprchecklist)
            recchecklists+=recprchecklist

            recprchecklistval=0
            if (truepositivesprchecklistval+falsenegativesprchecklist)>0:
                recprchecklistval=truepositivesprchecklistval/(truepositivesprchecklistval+falsenegativesprchecklist)
            recchecklistsval+=recprchecklistval

            accprchecklist=0
            if (len(uniqueChecklist)+uniquenegatives)>0:
                accprchecklist=(truepositivesprchecklist+truenegativesprchecklist)/(len(uniqueChecklist)+uniquenegatives)
            accchecklists+=accprchecklist

            accprchecklistval=0
            if (uniquechecklistlengthval+uniquenegatives)>0:
                accprchecklistval=(truepositivesprchecklistval+truenegativesprchecklist)/(uniquechecklistlengthval+uniquenegatives)
            accchecklistsval+=accprchecklistval

            #|true positives for each checklist|=sum(true positive elements in checklist)
            #|false positives for each checklist|=sum(false positive elements in checklist)
            #|true negatives for each checklist|=|true unique negative elements not in checklist|
            #|false negatives for each checklist|=|false unique negative elements not in checklist|
        #End of inner for loop
    #Recording statistics for the current cross-validation fold.
    if inspectionstot>0:
        precctot+=precchecklists/inspectionstot
        accctot+=accchecklists/inspectionstot
        preccgttot+=precisiongt/lengthprecgt
        recctot+=recchecklists/inspectionstot
    timetotal=timetotal+(time.time()-start_time)
    print("Current avg time:"+ str(timetotal/(ik+1)))
    fh2.write("Current avg time:"+ str(timetotal/(ik+1)))
    
    print("Precision (gt): "+str(precisiongt/lengthprecgt))
    fh2.write("\nPrecision (gt): "+str(precisiongt/lengthprecgt))
    print("Precision(val): "+str((precchecklistsval)/(inspectionstot)))
    fh2.write("\nPrecision(val): "+str((precchecklistsval)/(inspectionstot)))
    print("Precision: "+str((precchecklists)/(inspectionstot)))
    fh2.write("\nPrecision: "+str((precchecklists)/(inspectionstot)))

    print("Recall(val): "+str((recchecklistsval)/(inspectionstot)))
    fh2.write("\nRecall(val): "+str((recchecklistsval)/(inspectionstot)))
    print("Recall: "+str((recchecklists)/(inspectionstot)))
    fh2.write("\nRecall: "+str((recchecklists)/(inspectionstot)))

    print("Accuracy(val): "+str((accchecklistsval)/(inspectionstot)))
    fh2.write("\nAccuracy(val): "+str((accchecklistsval)/(inspectionstot)))
    print("Accuracy: "+str((accchecklists)/(inspectionstot)))
    fh2.write("\nAccuracy: "+str((accchecklists)/(inspectionstot)))
    
    #Additional information
    print("Average number of items per checklist(gt): "+str(controlpointsgtcount/inspectionstot))
    fh2.write("\nAverage number of items per checklist(gt): "+str(controlpointsgtcount/inspectionstot))
    print("Average number of items per checklist: "+str((controlpointscount)/inspectionstot))
    fh2.write("\nAverage number of items per checklist: "+str((controlpointscount)/inspectionstot))
    print("Average number of non-compliant items per checklist(gt): "+str(precisiongt/inspectionstot))
    fh2.write("\nAverage number of non-compliant items per checklist(gt): "+str(precisiongt/inspectionstot))
    print("Average number of non-compliant items per checklist(val): "+str(noncompliance/inspectionstot))
    fh2.write("\nAverage number of non-compliant items per checklist(val): "+str(noncompliance/inspectionstot))
    print("Average number of non-compliant items per checklist: "+str((noncompliance+noncompliancengt)/inspectionstot))
    fh2.write("\nAverage number of non-compliant items per checklist: "+str((noncompliance+noncompliancengt)/inspectionstot))
    #number of common control points between CBR generated and original lists divided by the number of control points in the CBR generated list

    print("Average similarity: "+str(similarity/(lengthprecgt+lengthvalprec)))
    fh2.write("\nAverage similarity: "+str(similarity/(lengthprecgt+lengthvalprec)))
    print("\nTrain time: "+str(traintime))
    fh2.write("\nTrain time: "+str(traintime))


fh2.write("\nAverage Traintime: "+str(traintimetot/8))                                
fh2.write("\nAverage Accuracy: "+str(accctot/8))
fh2.write("\nAverage Prec: "+str(precctot/8))
fh2.write("\nAverage Rec: "+str(recctot/8))
fh2.write("\nAverage Precgt: "+str(preccgttot/8))
fh2.close()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)


0
138939
RandomForestClassifier(max_samples=0.5, min_samples_split=10)
Train time: 3221.3368487358093
950


  pd.set_option('display.max_colwidth', -1)


9402
18474
27802
36929
45999
55258
64426
73762
83036
92396
101655
111012
120358
129621
138871
Current avg time:3750.791635274887
Precision (gt): 0.2524728141399486
Precision(val): 0.28477504637769424
Precision: 0.28477504637769424
Recall(val): 0.6687721054679276
Recall: 0.6687721054679276
Accuracy(val): 0.5170015019465504
Accuracy: 0.5170015019465504
Average number of control points per checklist(gt): 9.705013964622955
Average number of control points per checklist: 14.171166378507781
Average number of non-compliant control points per checklist(gt): 0.23988107251174265
Average number of non-compliant control points per checklist(val): 4.096491175754277
Average number of non-compliant control points per checklist: 4.096491175754277
Percentage of control points in the intersection between the CBR generated and the original checklists: 0.6848422850600171
Average similarity: 0.27870879527539444

Train time: 3221.3368487358093


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)


138939
277878
RandomForestClassifier(max_samples=0.5, min_samples_split=10)
Train time: 3856.0079085826874
944


  pd.set_option('display.max_colwidth', -1)


148275
157653
166916
176347
185651
195077
204390
213630
222789
232301
241662
250952
260461
269461
Current avg time:4061.4484125375748
Precision (gt): 0.25328777025356874
Precision(val): 0.2844388423769513
Precision: 0.2844388423769513
Recall(val): 0.6708493564897535
Recall: 0.6708493564897535
Accuracy(val): 0.5205134145498695
Accuracy: 0.5205134145498695
Average number of control points per checklist(gt): 9.626423690205012
Average number of control points per checklist: 14.200723569610076
Average number of non-compliant control points per checklist(gt): 0.24062847261125586
Average number of non-compliant control points per checklist(val): 4.097817398702641
Average number of non-compliant control points per checklist: 4.097817398702641
Percentage of control points in the intersection between the CBR generated and the original checklists: 0.6778826193621438
Average similarity: 0.28434979273958844

Train time: 3856.0079085826874


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)


277878
416816
RandomForestClassifier(max_samples=0.5, min_samples_split=10)
Train time: 3533.9720079898834
975


  pd.set_option('display.max_colwidth', -1)


286779
295678
305029
314469
323734
333186
342468
351386
360284
369642
379019
388320
397664
406836
415926
Current avg time:4034.105592330297
Precision (gt): 0.24608725965789208
Precision(val): 0.27187574459387326
Precision: 0.27187574459387326
Recall(val): 0.6686879606189056
Recall: 0.6686879606189056
Accuracy(val): 0.5139251523697452
Accuracy: 0.5139251523697452
Average number of control points per checklist(gt): 9.67919036909644
Average number of control points per checklist: 14.153591744939806
Average number of non-compliant control points per checklist(gt): 0.23299993615181025
Average number of non-compliant control points per checklist(val): 3.907280340210538
Average number of non-compliant control points per checklist: 3.907280340210538
Percentage of control points in the intersection between the CBR generated and the original checklists: 0.6838681335115482
Average similarity: 0.27735777878913315

Train time: 3533.9720079898834


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)


416816
555755
RandomForestClassifier(max_samples=0.5, min_samples_split=10)
Train time: 3137.1482152938843
972


  pd.set_option('display.max_colwidth', -1)


425937
435274
444401
453957
463672
473010
482002
491263
500515
509906
519076
528398
537840
547065
Current avg time:3919.8549568653107
Precision (gt): 0.25509335738351757
Precision(val): 0.2935643785203567
Precision: 0.2935643785203567
Recall(val): 0.681155679359142
Recall: 0.681155679359142
Accuracy(val): 0.5219878795273742
Accuracy: 0.5219878795273742
Average number of control points per checklist(gt): 9.677553205728818
Average number of control points per checklist: 14.226609556953553
Average number of non-compliant control points per checklist(gt): 0.24041123401651016
Average number of non-compliant control points per checklist(val): 4.23785724327369
Average number of non-compliant control points per checklist: 4.23785724327369
Percentage of control points in the intersection between the CBR generated and the original checklists: 0.6802431153386586
Average similarity: 0.28509493876398667

Train time: 3137.1482152938843


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)


555755
694694
RandomForestClassifier(max_samples=0.5, min_samples_split=10)
Train time: 3090.356469631195
987


  pd.set_option('display.max_colwidth', -1)


564823
574435
583934
592929
602296
611679
621030
630222
639466
648570
657753
667351
676527
685626
Current avg time:3843.968590927124
Precision (gt): 0.25337519698909855
Precision(val): 0.28745805926082824
Precision: 0.28745805926082824
Recall(val): 0.6794347232367588
Recall: 0.6794347232367588
Accuracy(val): 0.5184253202753029
Accuracy: 0.5184253202753029
Average number of control points per checklist(gt): 9.695512820512821
Average number of control points per checklist: 14.200320512820513
Average number of non-compliant control points per checklist(gt): 0.23929879715637084
Average number of non-compliant control points per checklist(val): 4.1492924761837156
Average number of non-compliant control points per checklist: 4.1492924761837156
Percentage of control points in the intersection between the CBR generated and the original checklists: 0.6827671820336305
Average similarity: 0.2810288930625874

Train time: 3090.356469631195


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)


694694
833632
RandomForestClassifier(max_samples=0.5, min_samples_split=10)
Train time: 3179.7451939582825
983


  pd.set_option('display.max_colwidth', -1)


703753
712801
722349
731522
740577
749984
759472
768609
778012
787452
796955
805819
815221
824673
Current avg time:3810.0900650819144
Precision (gt): 0.25222771563232343
Precision(val): 0.2896636742494623
Precision: 0.2896636742494623
Recall(val): 0.6764638897491205
Recall: 0.6764638897491205
Accuracy(val): 0.5172320687856878
Accuracy: 0.5172320687856878
Average number of control points per checklist(gt): 9.733582487987187
Average number of control points per checklist: 14.173918846769888
Average number of non-compliant control points per checklist(gt): 0.24051185270652947
Average number of non-compliant control points per checklist(val): 4.173438435234422
Average number of non-compliant control points per checklist: 4.173438435234422
Percentage of control points in the intersection between the CBR generated and the original checklists: 0.6867248636890132
Average similarity: 0.28145134528670135

Train time: 3179.7451939582825


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)


833632
972571
RandomForestClassifier(max_samples=0.5, min_samples_split=10)
Train time: 3425.900997400284
965


  pd.set_option('display.max_colwidth', -1)


843011
852366
861286
870733
880211
889490
898795
907930
917304
926496
935659
945051
954428
963705
Current avg time:3821.1916993345535
Precision (gt): 0.25523670480330213
Precision(val): 0.27937758464197243
Precision: 0.27937758464197243
Recall(val): 0.6746786130334028
Recall: 0.6746786130334028
Accuracy(val): 0.5198082544975724
Accuracy: 0.5198082544975724
Average number of control points per checklist(gt): 9.728804493179995
Average number of control points per checklist: 14.236159400909335
Average number of non-compliant control points per checklist(gt): 0.24216427127299125
Average number of non-compliant control points per checklist(val): 4.033593417319791
Average number of non-compliant control points per checklist: 4.033593417319791
Percentage of control points in the intersection between the CBR generated and the original checklists: 0.6833868755753443
Average similarity: 0.28293250244138624

Train time: 3425.900997400284


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)


972571
1111510
RandomForestClassifier(max_samples=0.5, min_samples_split=10)
Train time: 10335.810318946838
974


  pd.set_option('display.max_colwidth', -1)


981586
990693
999982
1009342
1018848
1028415
1037882
1047118
1056411
1065641
1074493
1083679
1092817
1102285
1111323
Current avg time:4703.863602668047
Precision (gt): 0.25549876364836144
Precision(val): 0.2816447579078633
Precision: 0.2816447579078633
Recall(val): 0.6780733288299267
Recall: 0.6780733288299267
Accuracy(val): 0.5196740058655847
Accuracy: 0.5196740058655847
Average number of control points per checklist(gt): 9.607494020728142
Average number of control points per checklist: 14.140446452298697
Average number of non-compliant control points per checklist(gt): 0.24327719111136833
Average number of non-compliant control points per checklist(val): 4.02591654031217
Average number of non-compliant control points per checklist: 4.02591654031217
Percentage of control points in the intersection between the CBR generated and the original checklists: 0.6794335704419241
Average similarity: 0.2809945752586866

Train time: 10335.810318946838
