# SETUP

In [1]:
from requests import get
import requests
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import datetime
from IPython.display import HTML
import pyodbc
import time
from sklearn.tree import DecisionTreeClassifier

from pandas.io.json import json_normalize

In [3]:


casebase=list()
#Function used retrieve relavant predictions for a given evaluation case. We call the prediction scores "similarity" for reusability reasons. 
#Prediction instances are filtered to ensure that they are relevant to the given case.
def retrieve_c_att22(case):
    global casebase
    cases=casebase
    cases=cases[cases["IndustrySubgroupCode"].astype(float)==float(case["IndustrySubgroupCode"])]
    cases=cases[cases["MunicipalityNumber"].astype(int)==int(case["MunicipalityNumber"])]
    results = cases.apply(pd.to_numeric, errors='coerce').fillna(cases).sort_values(by='similarity', ascending=False)
    results['similarity'] = results['similarity'].astype(float)
    results['NonCompliance'] = results['NonCompliance'].astype(int)
    return results


In [4]:
initCBCopy=casebase
currentCBCopy=casebase

#Fill casebase in case its empty
dataZero=()
def readData():
    global dataZero
    cnxn = pyodbc.connect('DRIVER={SQL Server Native Client 11.0};SERVER=localhost;DATABASE=IJCAI2022;Trusted_Connection=yes;')
    cursor = cnxn.cursor()
    dataZero=pd.read_sql("SELECT  * "+"FROM [IJCAI2022].[dbo].[BayesianDynamicChecklistLocalDb]"+
                      "left join (select NewID() as new, InspectionId as inspID from [IJCAI2022].[dbo].[BayesianDynamicChecklistLocalDb] group by InspectionId) as a on a.inspId=InspectionId"+
                      " where InspectionDateId<20190601 ORDER BY new",cnxn)
    #Query selects items in random order (via the newID method), grouped by the inspectionIDs. 
    #We are doing crossvalidation with random split based on the indices of the randomly retrieved inspection ids. 
    #Thus, we are evaluating outcomes of inspections. This method also prevents data bleed/leakage.

def fillCaseBase(index):
    global casebase
    global dataZero
    # Specifying the ODBC driver, server name, database, etc. directly
    data0=dataZero
    data=pd.DataFrame(data={'NonCompliance':data0['NonCompliance'],'IndustrySubgroupCode':data0['IndustrySubgroupCode'],'ControlPointText': data0["ControlPointText"],"InspectionDateId":data0["InspectionDateId"],"Municipality":data0["Municipality"]})
    datatestout=data0.loc[round((len(data0)/8)*index):round((len(data0)/8)*(index+1))]
    #Select independent variable and convert to one-hot vectors
    cat_vars=['ControlPointText','IndustrySubgroupCode',"Municipality"]
    for var in cat_vars:
        cat_list='var'+'_'+var
        cat_list = pd.get_dummies(data[var], prefix=var)
        data1=data.join(cat_list)
        data=data1
    cat_vars=['ControlPointText','IndustrySubgroupCode',"Municipality"]
    data_vars=data.columns.values.tolist()
    to_keep=[i for i in data_vars if i not in cat_vars]
    data=data[to_keep]


    #Making sure that the target variable is binary
    data.loc[np.isnan(data['NonCompliance']),"NonCompliance"]=0
    data.loc[data['NonCompliance']<0,"NonCompliance"]=0
    data.loc[data['NonCompliance']>1,"NonCompliance"]=1

    #retrieving the test-fold out of the 8 crossvalidation folds (rest are training folds.)
    datatest=data.loc[round((len(data)/8)*index):round((len(data)/8)*(index+1))]
    datatesty=datatest["NonCompliance"]
    datatest=datatest.drop(columns=["NonCompliance"])
    #Retrieving the 7 training folds, leaving the test-fold out.
    datacp=data.copy()
    if index>0 and index <7:                   
        datacp=datacp.loc[0:round((len(datacp)/8)*index)-1]
        datacp=datacp.append(data.loc[round((len(data)/8)*(index+1)):len(data)])
    elif index<1:
        datacp=datacp.loc[round((len(datacp)/8))*(index+1):len(datacp)]
    else:
        datacp=datacp.loc[0:round((len(data)/8)*index)]
    
    data=datacp
    #Creating the target variable, dropping unwanted columns from the training set and training the model.    
    y=data["NonCompliance"]
    data=data.drop(columns=["NonCompliance"])
    data=data.drop(columns=["InspectionDateId"])
    datatest=datatest.drop(columns=["InspectionDateId"])
    clf = DecisionTreeClassifier()
    t=clf.fit(data,y)
    #generate prediction for the test data set. The test data set is used to create and evaluate checklists below.
    tes=clf.predict_proba(datatest)
    tes=(datatestout).assign(similarity=tes[:,1]) #We named the prediction score "similarity" for easy reuse in other scripts.
    tes=tes.assign(NonCompliance=datatesty)
    casebase=tes #Test data set is assigned to "casebase"



# EXPERIMENT, STATISTICS FOR GENERATED CHECKLISTS

In [5]:

currentCBCopy=initCBCopy
readData()
timetotal=0
traintime=0
traintimetot=0
precctot=0
accctot=0
preccgttot=0
recctot=0
fh2=open("Log_" + "DTChecklists" + '.txt', 'w+')
#for-loop to perform 8-fold crossvalidation. ik is used as an index for the validation folds.
for ik in range(0,8):
    
    start_time = time.time()
    fillCaseBase(ik)
    initCBCopy=casebase
    currentCBCopy=initCBCopy

    traintime=(time.time()-start_time)
    traintimetot=traintimetot+(time.time()-start_time)
    print("Train time: "+str(traintime))
    noncompliance=0 #true positive(for ground truth labels only)
    controlpointsgtcount=0 
    noncompliancengt=0 #true positive (for statistical estimates, see main paper)
    controlpointscount=0 
    truepositiveval=0
    truepositive=0
    falsepositivengt=0
    recallval=0
    accuracyval=0
    precision=0
    precisiongt=0
    lengthprecgt=0
    recall=0
    accuracy=0
    lengthprec=0.00001
    lengthvalprec=0
    lengthrec=0.00001
    lengthvalrec=0
    lengthacc=0.00001
    lengthvalacc=0
    truenegative=0
    falsenegative=0
    similarity=0
    counter=0
    Kcp=15
    pd.set_option('display.max_rows', None)
    pd.set_option('display.max_columns', None)
    pd.set_option('display.width', None)
    pd.set_option('display.max_colwidth', -1)
    noncompliantInspection=0
    inspections=casebase.drop_duplicates(subset = ["InspectionId"])
    cases=casebase
    cases["NonCompliance"]=cases["NonCompliance"].astype(int)
    casestest=len(cases.drop_duplicates(subset = ["ControlPointText"]))
    instanceMatches=list()
    negatives=list()
    inspectionstot=0
    precchecklists=0
    recchecklists=0
    accchecklists=0
    precchecklistsval=0
    recchecklistsval=0
    accchecklistsval=0
    print(casestest)
    #Iterate through all the past inspections in the data set. 
    #"case" contains a single inspection (which in practice consists of a set of instances in the data set).
    for ind, case in inspections.iterrows():
   
        inspectionstot+=1
        uniquechecklistlengthval=0
        instanceMatches=list()
        counter+=1
        if counter>500:
            counter=0
            print(ind)
            


        #Retrieve the existing checklist. ExistingChecklist contains the checklist used in the current inspection.
        existingChecklist=cases[cases['InspectionId']==case['InspectionId']]
        #filter validation instances (cases) so that they match the industry code and municipality of the current inspection.
        filteredCases=cases[cases["IndustrySubgroupCode"].astype(float)==float(case["IndustrySubgroupCode"])]
        filteredCases=filteredCases[filteredCases["MunicipalityNumber"].astype(int)==int(case["MunicipalityNumber"])]
        filteredCases["NonCompliance"]=filteredCases["NonCompliance"].astype(float)
        #Temporary assigning the filtered cases. Another filter is applied further down to filter out positive cases.
        negatives=filteredCases
        if len(filteredCases)<=0:#Fail-safe mechanism. filteredCases should in practice never be 0.
            continue
        
        
        #Retrieve a constructed checklist
        checklist=retrieve_c_att22(case)#Retrieve control points that matches each inspection (input case). checklist=CL from paper

        #Drop any retrieved duplicates and remove select the top (Kcp) items with highest prediction score
        uniqueChecklist=checklist.drop_duplicates(subset = ["ControlPointText"])#Find all unique control points by removing duplicates
        uniqueChecklist=uniqueChecklist.head(Kcp)
        if len(uniqueChecklist)>(0):#Safe check to make sure that somethingis on the checklist.
            truepositive=0
            truepositiveval=0
            similarity+=uniqueChecklist["similarity"].sum() #Sums prediction score (prediction score=similarity)
            #Record statistics. sums up the ratio/fraction of non-compliance for each of the (unique) item in the checklist.
            precpercl=0
            lengthorgcl=0
            #Iterate through all the items on a constructed checklist
            for ind2, generatedChecklistControlpoint in uniqueChecklist.iterrows():#Find overlap between the existing and new generated checklist
                #find the predicted negative records for the constructed checklist. uniqueChecklist contains all the predicted positives.
                negatives=negatives[negatives["ControlPointText"]!=generatedChecklistControlpoint["ControlPointText"]]
                #excp contains the items that can be found on both the existing checklist and the generated checklist
                excp=existingChecklist[existingChecklist["ControlPointText"]==generatedChecklistControlpoint["ControlPointText"]]
                #calculate ground truth precision for if excp contains an checklist item.
                if len(excp)>0:
                    precpercl+=((excp["NonCompliance"].sum())/len(excp["NonCompliance"])) #used to calculate average ground truth precision
                    lengthorgcl+=1 #used to calculate average ground truth precision 
                    controlpointsgtcount+=1 #Used to find average number of items per checklist
                controlpointscount+=1
                #groups instances from the vaildation set by items and selects instances that matches the current item (generatedChecklistControlPoint)
                instancesMatchingCurrentGenClItem=filteredCases.groupby(["ControlPointText"],as_index=False).mean()
                instancesMatchingCurrentGenClItem=instancesMatchingCurrentGenClItem[instancesMatchingCurrentGenClItem["ControlPointText"]==generatedChecklistControlpoint["ControlPointText"]]
                #Finds the total number of instances in the validation set with positive labels.
                summ2=instancesMatchingCurrentGenClItem["NonCompliance"].sum() 
                #Calculate the number of positive instances in the validation set (that matches the item), divided by the total number of instances.
                #the expression summ2/len(instancesMatchingCurrentGenClItem["NonCompliance"]) is a float-number between [0,1]
                if len(instancesMatchingCurrentGenClItem["NonCompliance"])>0:
                    noncompliance+=(summ2/len(instancesMatchingCurrentGenClItem["NonCompliance"]))
                    truepositiveval+=(summ2/len(instancesMatchingCurrentGenClItem["NonCompliance"]))
                #test if there are at least 1 instance in the validation set that matches the current inspection and checklist item.
                matchlen=len(instancesMatchingCurrentGenClItem["NonCompliance"])
                if matchlen>0:
                    lengthvalprec+=1
                    uniquechecklistlengthval+=1
                
                if matchlen==0:#expression is true if validation set record of checklist item does not exist. This expression never comes true for any method other than BCBR/CBCBR.
                    instanceMatches=cases[cases["IndustrySubgroupCode"].astype(float)==float(generatedChecklistControlpoint["IndustrySubgroupCode"])]
                    instanceMatches=instanceMatches[instanceMatches["MunicipalityNumber"].astype(int)==int(generatedChecklistControlpoint["MunicipalityNumber"])]
                    instanceMatches=instanceMatches[instanceMatches["ControlPointText"]==generatedChecklistControlpoint["ControlPointText"]]
                    instanceMatches["NonCompliance"]=instanceMatches["NonCompliance"].astype(float)

                    matchesSum=instanceMatches["NonCompliance"].sum()


                    falsepositivengt+=(len(instanceMatches["NonCompliance"])-matchesSum)
                    if len(instanceMatches["NonCompliance"])>0:
                        noncompliancengt+=(matchesSum/len(instanceMatches["NonCompliance"]))
                        truepositive+=(matchesSum/len(instanceMatches["NonCompliance"]))
                        precision+=(matchesSum/len(instanceMatches["NonCompliance"]))
                        lengthprec+=1
                #End of for-loop
            if lengthorgcl>0:
                precisiongt+=(precpercl/lengthorgcl)
                lengthprecgt+=1      

            #Calculate statistics on checklists level
            uniquenegatives=len(negatives.drop_duplicates(subset = ["ControlPointText"]))
            negativecpy=negatives.copy()
            negativecpy["NonCompliance"]=negativecpy["NonCompliance"].astype(float)
            groupedby=negativecpy.groupby(["ControlPointText"],as_index=False).count()
            groupedbys=negativecpy.groupby(["ControlPointText"],as_index=False).sum()
            groupedby["NonCompliance"]=groupedbys["NonCompliance"]/groupedby["NonCompliance"]
            uniquecalcnegatives=groupedby.drop_duplicates(subset = ["ControlPointText"])
            noncompliancenegatives=uniquecalcnegatives["NonCompliance"].sum()

            falsenegativeelement=0
            if uniquenegatives>0:
                falsenegativeelement=noncompliancenegatives/uniquenegatives #To avoid selection bias effects
            truenegativeelement=1-falsenegativeelement

            truepositivesprchecklistval=(truepositiveval)
            falsepositivesprchecklistval=(uniquechecklistlengthval-truepositivesprchecklistval)

            truepositivesprchecklist=(truepositive+truepositiveval)
            falsepositivesprchecklist=(len(uniqueChecklist)-(truepositive+truepositiveval))
            truenegativesprchecklist=truenegativeelement*uniquenegatives
            falsenegativesprchecklist=falsenegativeelement*uniquenegatives


            precprchecklist=truepositivesprchecklist/len(uniqueChecklist)
            precchecklists+=precprchecklist
            precprchecklistval=0
            if uniquechecklistlengthval>0:
                precprchecklistval=truepositivesprchecklistval/uniquechecklistlengthval
            precchecklistsval+=precprchecklistval

            recprchecklist=0
            if (truepositivesprchecklist+falsenegativesprchecklist)>0:
                recprchecklist=truepositivesprchecklist/(truepositivesprchecklist+falsenegativesprchecklist)
            recchecklists+=recprchecklist

            recprchecklistval=0
            if (truepositivesprchecklistval+falsenegativesprchecklist)>0:
                recprchecklistval=truepositivesprchecklistval/(truepositivesprchecklistval+falsenegativesprchecklist)
            recchecklistsval+=recprchecklistval

            accprchecklist=0
            if (len(uniqueChecklist)+uniquenegatives)>0:
                accprchecklist=(truepositivesprchecklist+truenegativesprchecklist)/(len(uniqueChecklist)+uniquenegatives)
            accchecklists+=accprchecklist

            accprchecklistval=0
            if (uniquechecklistlengthval+uniquenegatives)>0:
                accprchecklistval=(truepositivesprchecklistval+truenegativesprchecklist)/(uniquechecklistlengthval+uniquenegatives)
            accchecklistsval+=accprchecklistval

            #|true positives for each checklist|=sum(true positive elements in checklist)
            #|false positives for each checklist|=sum(false positive elements in checklist)
            #|true negatives for each checklist|=|true unique negative elements not in checklist|
            #|false negatives for each checklist|=|false unique negative elements not in checklist|
        #End of inner for loop
    #Recording statistics for the current cross-validation fold.
    if inspectionstot>0:
        precctot+=precchecklists/inspectionstot
        accctot+=accchecklists/inspectionstot
        preccgttot+=precisiongt/lengthprecgt
        recctot+=recchecklists/inspectionstot
    timetotal=timetotal+(time.time()-start_time)
    print("Current avg time:"+ str(timetotal/(ik+1)))
    fh2.write("Current avg time:"+ str(timetotal/(ik+1)))
    
    print("Precision (gt): "+str(precisiongt/lengthprecgt))
    fh2.write("\nPrecision (gt): "+str(precisiongt/lengthprecgt))
    print("Precision(val): "+str((precchecklistsval)/(inspectionstot)))
    fh2.write("\nPrecision(val): "+str((precchecklistsval)/(inspectionstot)))
    print("Precision: "+str((precchecklists)/(inspectionstot)))
    fh2.write("\nPrecision: "+str((precchecklists)/(inspectionstot)))

    print("Recall(val): "+str((recchecklistsval)/(inspectionstot)))
    fh2.write("\nRecall(val): "+str((recchecklistsval)/(inspectionstot)))
    print("Recall: "+str((recchecklists)/(inspectionstot)))
    fh2.write("\nRecall: "+str((recchecklists)/(inspectionstot)))

    print("Accuracy(val): "+str((accchecklistsval)/(inspectionstot)))
    fh2.write("\nAccuracy(val): "+str((accchecklistsval)/(inspectionstot)))
    print("Accuracy: "+str((accchecklists)/(inspectionstot)))
    fh2.write("\nAccuracy: "+str((accchecklists)/(inspectionstot)))
    
    #Additional information
    print("Average number of items per checklist(gt): "+str(controlpointsgtcount/inspectionstot))
    fh2.write("\nAverage number of items per checklist(gt): "+str(controlpointsgtcount/inspectionstot))
    print("Average number of items per checklist: "+str((controlpointscount)/inspectionstot))
    fh2.write("\nAverage number of items per checklist: "+str((controlpointscount)/inspectionstot))
    print("Average number of non-compliant items per checklist(gt): "+str(precisiongt/inspectionstot))
    fh2.write("\nAverage number of non-compliant items per checklist(gt): "+str(precisiongt/inspectionstot))
    print("Average number of non-compliant items per checklist(val): "+str(noncompliance/inspectionstot))
    fh2.write("\nAverage number of non-compliant items per checklist(val): "+str(noncompliance/inspectionstot))
    print("Average number of non-compliant items per checklist: "+str((noncompliance+noncompliancengt)/inspectionstot))
    fh2.write("\nAverage number of non-compliant items per checklist: "+str((noncompliance+noncompliancengt)/inspectionstot))
    #number of common control points between CBR generated and original lists divided by the number of control points in the CBR generated list

    print("Average similarity: "+str(similarity/(lengthprecgt+lengthvalprec)))
    fh2.write("\nAverage similarity: "+str(similarity/(lengthprecgt+lengthvalprec)))
    print("\nTrain time: "+str(traintime))
    fh2.write("\nTrain time: "+str(traintime))


fh2.write("\nAverage Traintime: "+str(traintimetot/8))                                
fh2.write("\nAverage Accuracy: "+str(accctot/8))
fh2.write("\nAverage Prec: "+str(precctot/8))
fh2.write("\nAverage Rec: "+str(recctot/8))
fh2.write("\nAverage Precgt: "+str(preccgttot/8))
fh2.close()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)


0
138939
Train time: 2960.0005571842194
946


  pd.set_option('display.max_colwidth', -1)


9245
18732
28102
37191
46565
55672
65057
74204
83520
92872
102193
111279
120701
130303
Current avg time:3437.4648926258087
Precision (gt): 0.22942345748371404
Precision(val): 0.2562756447137184
Precision: 0.2562756447137184
Recall(val): 0.6041306597098799
Recall: 0.6041306597098799
Accuracy(val): 0.4923342977285862
Accuracy: 0.4923342977285862
Average number of control points per checklist(gt): 9.975401069518716
Average number of control points per checklist: 14.219652406417111
Average number of non-compliant control points per checklist(gt): 0.2176148971720523
Average number of non-compliant control points per checklist(val): 3.693339743707616
Average number of non-compliant control points per checklist: 3.693339743707616
Percentage of control points in the intersection between the CBR generated and the original checklists: 0.7015221458589923
Average similarity: 0.2895237424617927

Train time: 2960.0005571842194


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)


138939
277878
Train time: 3071.0042572021484
958


  pd.set_option('display.max_colwidth', -1)


148510
158081
167234
176384
185761
194934
204603
213936
223214
232484
241946
251384
260759
270267
Current avg time:3476.3128772974014
Precision (gt): 0.229251309377786
Precision(val): 0.25978847568847474
Precision: 0.25978847568847474
Recall(val): 0.6158652268244006
Recall: 0.6158652268244006
Accuracy(val): 0.49532060153755686
Accuracy: 0.49532060153755686
Average number of control points per checklist(gt): 10.032937365010799
Average number of control points per checklist: 14.208693304535638
Average number of non-compliant control points per checklist(gt): 0.21777017603826676
Average number of non-compliant control points per checklist(val): 3.7255095670062457
Average number of non-compliant control points per checklist: 3.7255095670062457
Percentage of control points in the intersection between the CBR generated and the original checklists: 0.7061125995173764
Average similarity: 0.28919068416344285

Train time: 3071.0042572021484


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)


277878
416816
Train time: 3075.627120733261
989


  pd.set_option('display.max_colwidth', -1)


287297
296714
305711
314975
323928
332970
342271
351540
360412
369904
379184
388319
397299
406451
415755
Current avg time:3493.062388976415
Precision (gt): 0.23367308796240885
Precision(val): 0.2692444564125803
Precision: 0.2692444564125803
Recall(val): 0.6149945901986369
Recall: 0.6149945901986369
Accuracy(val): 0.4972237739361635
Accuracy: 0.4972237739361635
Average number of control points per checklist(gt): 9.827827695560254
Average number of control points per checklist: 14.14759513742072
Average number of non-compliant control points per checklist(gt): 0.22027270210409944
Average number of non-compliant control points per checklist(val): 3.8348533994610183
Average number of non-compliant control points per checklist: 3.8348533994610183
Percentage of control points in the intersection between the CBR generated and the original checklists: 0.694664188513949
Average similarity: 0.2975112934054197

Train time: 3075.627120733261


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)


416816
555755
Train time: 3080.0578367710114
958


  pd.set_option('display.max_colwidth', -1)


425858
435350
444750
454046
463444
472925
482277
491966
501233
510500
519734
528837
538001
547275
Current avg time:3500.862952411175
Precision (gt): 0.22427578854475955
Precision(val): 0.25463429619358063
Precision: 0.25463429619358063
Recall(val): 0.5989034174768023
Recall: 0.5989034174768023
Accuracy(val): 0.49457401834525333
Accuracy: 0.49457401834525333
Average number of control points per checklist(gt): 9.901032586831166
Average number of control points per checklist: 14.18626793616736
Average number of non-compliant control points per checklist(gt): 0.21167399755639235
Average number of non-compliant control points per checklist(val): 3.6513145378960603
Average number of non-compliant control points per checklist: 3.6513145378960603
Percentage of control points in the intersection between the CBR generated and the original checklists: 0.6979307476343974
Average similarity: 0.293328968575932

Train time: 3080.0578367710114


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)


555755
694694
Train time: 3052.863732814789
995


  pd.set_option('display.max_colwidth', -1)


565054
573975
583222
592616
601959
611105
620362
629525
638848
648179
657421
666940
675942
685092
694267
Current avg time:3501.8211533546446
Precision (gt): 0.22662165422129166
Precision(val): 0.25719047951633894
Precision: 0.25719047951633894
Recall(val): 0.614588989869497
Recall: 0.614588989869497
Accuracy(val): 0.49420751551036574
Accuracy: 0.49420751551036574
Average number of control points per checklist(gt): 9.846623324930343
Average number of control points per checklist: 14.124054663659281
Average number of non-compliant control points per checklist(gt): 0.2142637532149296
Average number of non-compliant control points per checklist(val): 3.668999027337877
Average number of non-compliant control points per checklist: 3.668999027337877
Percentage of control points in the intersection between the CBR generated and the original checklists: 0.6971527340704349
Average similarity: 0.2966134635601451

Train time: 3052.863732814789


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)


694694
833632
Train time: 3035.2731277942657
970


  pd.set_option('display.max_colwidth', -1)


703733
712566
722052
731487
740868
750174
759492
769154
778571
787584
797104
806213
815462
824668
Current avg time:3499.2718510627747
Precision (gt): 0.23197925212864012
Precision(val): 0.259603648079615
Precision: 0.259603648079615
Recall(val): 0.6185358644123081
Recall: 0.6185358644123081
Accuracy(val): 0.49569073393434504
Accuracy: 0.49569073393434504
Average number of control points per checklist(gt): 9.91587661904126
Average number of control points per checklist: 14.179596741888103
Average number of non-compliant control points per checklist(gt): 0.21927909277856103
Average number of non-compliant control points per checklist(val): 3.7235316047503573
Average number of non-compliant control points per checklist: 3.7235316047503573
Percentage of control points in the intersection between the CBR generated and the original checklists: 0.699305967549039
Average similarity: 0.29348381398905543

Train time: 3035.2731277942657


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)


833632
972571
Train time: 7232.006285667419
987


  pd.set_option('display.max_colwidth', -1)


842736
852231
861724
870888
880384
889671
898988
908263
917429
926497
936006
945020
954287
963349
972344
Current avg time:4150.586717912129
Precision (gt): 0.22602199632726175
Precision(val): 0.25909865444418195
Precision: 0.25909865444418195
Recall(val): 0.6093883243677517
Recall: 0.6093883243677517
Accuracy(val): 0.4919698255004813
Accuracy: 0.4919698255004813
Average number of control points per checklist(gt): 9.930897009966777
Average number of control points per checklist: 14.205714285714286
Average number of non-compliant control points per checklist(gt): 0.21400753805072956
Average number of non-compliant control points per checklist(val): 3.7250834389540186
Average number of non-compliant control points per checklist: 3.7250834389540186
Percentage of control points in the intersection between the CBR generated and the original checklists: 0.6990776253999139
Average similarity: 0.2910620826122341

Train time: 7232.006285667419


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)


972571
1111510
Train time: 3628.9952256679535
965


  pd.set_option('display.max_colwidth', -1)


981516
990663
1000046
1009261
1018469
1027978
1037156
1046326
1055428
1064514
1073857
1082936
1092261
1101870
1111235
Current avg time:4165.938957542181
Precision (gt): 0.2279488896423275
Precision(val): 0.24791263192982924
Precision: 0.24791263192982924
Recall(val): 0.601074379924672
Recall: 0.601074379924672
Accuracy(val): 0.48635328954712503
Accuracy: 0.48635328954712503
Average number of control points per checklist(gt): 9.964285714285714
Average number of control points per checklist: 14.143786510886883
Average number of non-compliant control points per checklist(gt): 0.2166906598299343
Average number of non-compliant control points per checklist(val): 3.5419869329538085
Average number of non-compliant control points per checklist: 3.5419869329538085
Percentage of control points in the intersection between the CBR generated and the original checklists: 0.7044991598689583
Average similarity: 0.28838477837560134

Train time: 3628.9952256679535
