# SETUP

In [1]:
from requests import get
import requests
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import datetime
from IPython.display import HTML
import pyodbc
import time
from sklearn.linear_model import LogisticRegression
from pandas.io.json import json_normalize

In [2]:


casebase=list()
#Function used retrieve relavant predictions for a given evaluation case. We call the prediction scores "similarity" for reusability reasons. 
#Prediction instances are filtered to ensure that they are relevant to the given case.
def retrieve_c_att22(case):
    global casebase
    cases=casebase
    cases=cases[cases["IndustrySubgroupCode"].astype(float)==float(case["IndustrySubgroupCode"])]
    cases=cases[cases["MunicipalityNumber"].astype(int)==int(case["MunicipalityNumber"])]
    results = cases.apply(pd.to_numeric, errors='coerce').fillna(cases).sort_values(by='similarity', ascending=False)
    results['similarity'] = results['similarity'].astype(float)
    results['NonCompliance'] = results['NonCompliance'].astype(int)
    return results


In [3]:
initCBCopy=casebase
currentCBCopy=casebase

#Fill casebase in case its empty
dataZero=()
def readData():
    global dataZero
    cnxn = pyodbc.connect('DRIVER={SQL Server Native Client 11.0};SERVER=localhost;DATABASE=IJCAI2022;Trusted_Connection=yes;')
    cursor = cnxn.cursor()
    dataZero=pd.read_sql("SELECT  * "+"FROM [IJCAI2022].[dbo].[BayesianDynamicChecklistLocalDb]"+
                      "left join (select NewID() as new, InspectionId as inspID from [IJCAI2022].[dbo].[BayesianDynamicChecklistLocalDb] group by InspectionId) as a on a.inspId=InspectionId"+
                      " where InspectionDateId<20190601 ORDER BY new",cnxn)
    #Query selects items in random order (via the newID method), grouped by the inspectionIDs. 
    #We are doing crossvalidation with random split based on the indices of the randomly retrieved inspection ids. 
    #Thus, we are evaluating outcomes of inspections. This method also prevents data bleed/leakage.

def fillCaseBase(index):
    global casebase
    global dataZero
    # Specifying the ODBC driver, server name, database, etc. directly
    data0=dataZero
    data=pd.DataFrame(data={'NonCompliance':data0['NonCompliance'],'IndustrySubgroupCode':data0['IndustrySubgroupCode'],'ControlPointText': data0["ControlPointText"],"InspectionDateId":data0["InspectionDateId"],"Municipality":data0["Municipality"]})
    datatestout=data0.loc[round((len(data0)/8)*index):round((len(data0)/8)*(index+1))]
    #Select independent variable and convert to one-hot vectors
    cat_vars=['ControlPointText','IndustrySubgroupCode',"Municipality"]
    for var in cat_vars:
        cat_list='var'+'_'+var
        cat_list = pd.get_dummies(data[var], prefix=var)
        data1=data.join(cat_list)
        data=data1
    cat_vars=['ControlPointText','IndustrySubgroupCode',"Municipality"]
    data_vars=data.columns.values.tolist()
    to_keep=[i for i in data_vars if i not in cat_vars]
    data=data[to_keep]


    #Making sure that the target variable is binary
    data.loc[np.isnan(data['NonCompliance']),"NonCompliance"]=0
    data.loc[data['NonCompliance']<0,"NonCompliance"]=0
    data.loc[data['NonCompliance']>1,"NonCompliance"]=1

    #retrieving the test-fold out of the 8 crossvalidation folds (rest are training folds.)
    datatest=data.loc[round((len(data)/8)*index):round((len(data)/8)*(index+1))]
    datatesty=datatest["NonCompliance"]
    datatest=datatest.drop(columns=["NonCompliance"])
    #Retrieving the 7 training folds, leaving the test-fold out.
    datacp=data.copy()
    if index>0 and index <7:                   
        datacp=datacp.loc[0:round((len(datacp)/8)*index)-1]
        datacp=datacp.append(data.loc[round((len(data)/8)*(index+1)):len(data)])
    elif index<1:
        datacp=datacp.loc[round((len(datacp)/8))*(index+1):len(datacp)]
    else:
        datacp=datacp.loc[0:round((len(data)/8)*index)]
    
    data=datacp
    #Creating the target variable, dropping unwanted columns from the training set and training the model.    
    y=data["NonCompliance"]
    data=data.drop(columns=["NonCompliance"])
    data=data.drop(columns=["InspectionDateId"])
    datatest=datatest.drop(columns=["InspectionDateId"])
    clf = LogisticRegression()
    t=clf.fit(data,y)
    #generate prediction for the test data set. The test data set is used to create and evaluate checklists below.
    tes=clf.predict_proba(datatest)
    tes=(datatestout).assign(similarity=tes[:,1]) #We named the prediction score "similarity" for easy reuse in other scripts.
    tes=tes.assign(NonCompliance=datatesty)
    casebase=tes #Test data set is assigned to "casebase"



# EXPERIMENT, STATISTICS FOR GENERATED CHECKLISTS

In [4]:

currentCBCopy=initCBCopy
readData()
timetotal=0
traintime=0
traintimetot=0
precctot=0
accctot=0
preccgttot=0
recctot=0
fh2=open("Log_" + "LRChecklists" + '.txt', 'w+')
#for-loop to perform 8-fold crossvalidation. ik is used as an index for the validation folds.
for ik in range(0,8):
    
    start_time = time.time()
    fillCaseBase(ik)
    initCBCopy=casebase
    currentCBCopy=initCBCopy

    traintime=(time.time()-start_time)
    traintimetot=traintimetot+(time.time()-start_time)
    print("Train time: "+str(traintime))
    noncompliance=0 #true positive(for ground truth labels only)
    controlpointsgtcount=0 
    noncompliancengt=0 #true positive (for statistical estimates, see main paper)
    controlpointscount=0 
    truepositiveval=0
    truepositive=0
    falsepositivengt=0
    recallval=0
    accuracyval=0
    precision=0
    precisiongt=0
    lengthprecgt=0
    recall=0
    accuracy=0
    lengthprec=0.00001
    lengthvalprec=0
    lengthrec=0.00001
    lengthvalrec=0
    lengthacc=0.00001
    lengthvalacc=0
    truenegative=0
    falsenegative=0
    similarity=0
    counter=0
    Kcp=15
    pd.set_option('display.max_rows', None)
    pd.set_option('display.max_columns', None)
    pd.set_option('display.width', None)
    pd.set_option('display.max_colwidth', -1)
    noncompliantInspection=0
    inspections=casebase.drop_duplicates(subset = ["InspectionId"])
    cases=casebase
    cases["NonCompliance"]=cases["NonCompliance"].astype(int)
    casestest=len(cases.drop_duplicates(subset = ["ControlPointText"]))
    instanceMatches=list()
    negatives=list()
    inspectionstot=0
    precchecklists=0
    recchecklists=0
    accchecklists=0
    precchecklistsval=0
    recchecklistsval=0
    accchecklistsval=0
    print(casestest)
    #Iterate through all the past inspections in the data set. 
    #"case" contains a single inspection (which in practice consists of a set of instances in the data set).
    for ind, case in inspections.iterrows():
   
        inspectionstot+=1
        uniquechecklistlengthval=0
        instanceMatches=list()
        counter+=1
        if counter>500:
            counter=0
            print(ind)
            


        #Retrieve the existing checklist. ExistingChecklist contains the checklist used in the current inspection.
        existingChecklist=cases[cases['InspectionId']==case['InspectionId']]
        #filter validation instances (cases) so that they match the industry code and municipality of the current inspection.
        filteredCases=cases[cases["IndustrySubgroupCode"].astype(float)==float(case["IndustrySubgroupCode"])]
        filteredCases=filteredCases[filteredCases["MunicipalityNumber"].astype(int)==int(case["MunicipalityNumber"])]
        filteredCases["NonCompliance"]=filteredCases["NonCompliance"].astype(float)
        #Temporary assigning the filtered cases. Another filter is applied further down to filter out positive cases.
        negatives=filteredCases
        if len(filteredCases)<=0:#Fail-safe mechanism. filteredCases should in practice never be 0.
            continue
        
        
        #Retrieve a constructed checklist
        checklist=retrieve_c_att22(case)#Retrieve control points that matches each inspection (input case). checklist=CL from paper

        #Drop any retrieved duplicates and remove select the top (Kcp) items with highest prediction score
        uniqueChecklist=checklist.drop_duplicates(subset = ["ControlPointText"])#Find all unique control points by removing duplicates
        uniqueChecklist=uniqueChecklist.head(Kcp)
        if len(uniqueChecklist)>(0):#Safe check to make sure that somethingis on the checklist.
            truepositive=0
            truepositiveval=0
            similarity+=uniqueChecklist["similarity"].sum() #Sums prediction score (prediction score=similarity)
            #Record statistics. sums up the ratio/fraction of non-compliance for each of the (unique) item in the checklist.
            precpercl=0
            lengthorgcl=0
            #Iterate through all the items on a constructed checklist
            for ind2, generatedChecklistControlpoint in uniqueChecklist.iterrows():#Find overlap between the existing and new generated checklist
                #find the predicted negative records for the constructed checklist. uniqueChecklist contains all the predicted positives.
                negatives=negatives[negatives["ControlPointText"]!=generatedChecklistControlpoint["ControlPointText"]]
                #excp contains the items that can be found on both the existing checklist and the generated checklist
                excp=existingChecklist[existingChecklist["ControlPointText"]==generatedChecklistControlpoint["ControlPointText"]]
                #calculate ground truth precision for if excp contains an checklist item.
                if len(excp)>0:
                    precpercl+=((excp["NonCompliance"].sum())/len(excp["NonCompliance"])) #used to calculate average ground truth precision
                    lengthorgcl+=1 #used to calculate average ground truth precision 
                    controlpointsgtcount+=1 #Used to find average number of items per checklist
                controlpointscount+=1
                #groups instances from the vaildation set by items and selects instances that matches the current item (generatedChecklistControlPoint)
                instancesMatchingCurrentGenClItem=filteredCases.groupby(["ControlPointText"],as_index=False).mean()
                instancesMatchingCurrentGenClItem=instancesMatchingCurrentGenClItem[instancesMatchingCurrentGenClItem["ControlPointText"]==generatedChecklistControlpoint["ControlPointText"]]
                #Finds the total number of instances in the validation set with positive labels.
                summ2=instancesMatchingCurrentGenClItem["NonCompliance"].sum() 
                #Calculate the number of positive instances in the validation set (that matches the item), divided by the total number of instances.
                #the expression summ2/len(instancesMatchingCurrentGenClItem["NonCompliance"]) is a float-number between [0,1]
                if len(instancesMatchingCurrentGenClItem["NonCompliance"])>0:
                    noncompliance+=(summ2/len(instancesMatchingCurrentGenClItem["NonCompliance"]))
                    truepositiveval+=(summ2/len(instancesMatchingCurrentGenClItem["NonCompliance"]))
                #test if there are at least 1 instance in the validation set that matches the current inspection and checklist item.
                matchlen=len(instancesMatchingCurrentGenClItem["NonCompliance"])
                if matchlen>0:
                    lengthvalprec+=1
                    uniquechecklistlengthval+=1
                
                if matchlen==0:#expression is true if validation set record of checklist item does not exist. This expression never comes true for any method other than BCBR/CBCBR.
                    instanceMatches=cases[cases["IndustrySubgroupCode"].astype(float)==float(generatedChecklistControlpoint["IndustrySubgroupCode"])]
                    instanceMatches=instanceMatches[instanceMatches["MunicipalityNumber"].astype(int)==int(generatedChecklistControlpoint["MunicipalityNumber"])]
                    instanceMatches=instanceMatches[instanceMatches["ControlPointText"]==generatedChecklistControlpoint["ControlPointText"]]
                    instanceMatches["NonCompliance"]=instanceMatches["NonCompliance"].astype(float)

                    matchesSum=instanceMatches["NonCompliance"].sum()


                    falsepositivengt+=(len(instanceMatches["NonCompliance"])-matchesSum)
                    if len(instanceMatches["NonCompliance"])>0:
                        noncompliancengt+=(matchesSum/len(instanceMatches["NonCompliance"]))
                        truepositive+=(matchesSum/len(instanceMatches["NonCompliance"]))
                        precision+=(matchesSum/len(instanceMatches["NonCompliance"]))
                        lengthprec+=1
                #End of for-loop
            if lengthorgcl>0:
                precisiongt+=(precpercl/lengthorgcl)
                lengthprecgt+=1      

            #Calculate statistics on checklists level
            uniquenegatives=len(negatives.drop_duplicates(subset = ["ControlPointText"]))
            negativecpy=negatives.copy()
            negativecpy["NonCompliance"]=negativecpy["NonCompliance"].astype(float)
            groupedby=negativecpy.groupby(["ControlPointText"],as_index=False).count()
            groupedbys=negativecpy.groupby(["ControlPointText"],as_index=False).sum()
            groupedby["NonCompliance"]=groupedbys["NonCompliance"]/groupedby["NonCompliance"]
            uniquecalcnegatives=groupedby.drop_duplicates(subset = ["ControlPointText"])
            noncompliancenegatives=uniquecalcnegatives["NonCompliance"].sum()

            falsenegativeelement=0
            if uniquenegatives>0:
                falsenegativeelement=noncompliancenegatives/uniquenegatives #To avoid selection bias effects
            truenegativeelement=1-falsenegativeelement

            truepositivesprchecklistval=(truepositiveval)
            falsepositivesprchecklistval=(uniquechecklistlengthval-truepositivesprchecklistval)

            truepositivesprchecklist=(truepositive+truepositiveval)
            falsepositivesprchecklist=(len(uniqueChecklist)-(truepositive+truepositiveval))
            truenegativesprchecklist=truenegativeelement*uniquenegatives
            falsenegativesprchecklist=falsenegativeelement*uniquenegatives


            precprchecklist=truepositivesprchecklist/len(uniqueChecklist)
            precchecklists+=precprchecklist
            precprchecklistval=0
            if uniquechecklistlengthval>0:
                precprchecklistval=truepositivesprchecklistval/uniquechecklistlengthval
            precchecklistsval+=precprchecklistval

            recprchecklist=0
            if (truepositivesprchecklist+falsenegativesprchecklist)>0:
                recprchecklist=truepositivesprchecklist/(truepositivesprchecklist+falsenegativesprchecklist)
            recchecklists+=recprchecklist

            recprchecklistval=0
            if (truepositivesprchecklistval+falsenegativesprchecklist)>0:
                recprchecklistval=truepositivesprchecklistval/(truepositivesprchecklistval+falsenegativesprchecklist)
            recchecklistsval+=recprchecklistval

            accprchecklist=0
            if (len(uniqueChecklist)+uniquenegatives)>0:
                accprchecklist=(truepositivesprchecklist+truenegativesprchecklist)/(len(uniqueChecklist)+uniquenegatives)
            accchecklists+=accprchecklist

            accprchecklistval=0
            if (uniquechecklistlengthval+uniquenegatives)>0:
                accprchecklistval=(truepositivesprchecklistval+truenegativesprchecklist)/(uniquechecklistlengthval+uniquenegatives)
            accchecklistsval+=accprchecklistval

            #|true positives for each checklist|=sum(true positive elements in checklist)
            #|false positives for each checklist|=sum(false positive elements in checklist)
            #|true negatives for each checklist|=|true unique negative elements not in checklist|
            #|false negatives for each checklist|=|false unique negative elements not in checklist|
        #End of inner for loop
    #Recording statistics for the current cross-validation fold.
    if inspectionstot>0:
        precctot+=precchecklists/inspectionstot
        accctot+=accchecklists/inspectionstot
        preccgttot+=precisiongt/lengthprecgt
        recctot+=recchecklists/inspectionstot
    timetotal=timetotal+(time.time()-start_time)
    print("Current avg time:"+ str(timetotal/(ik+1)))
    fh2.write("Current avg time:"+ str(timetotal/(ik+1)))
    
    print("Precision (gt): "+str(precisiongt/lengthprecgt))
    fh2.write("\nPrecision (gt): "+str(precisiongt/lengthprecgt))
    print("Precision(val): "+str((precchecklistsval)/(inspectionstot)))
    fh2.write("\nPrecision(val): "+str((precchecklistsval)/(inspectionstot)))
    print("Precision: "+str((precchecklists)/(inspectionstot)))
    fh2.write("\nPrecision: "+str((precchecklists)/(inspectionstot)))

    print("Recall(val): "+str((recchecklistsval)/(inspectionstot)))
    fh2.write("\nRecall(val): "+str((recchecklistsval)/(inspectionstot)))
    print("Recall: "+str((recchecklists)/(inspectionstot)))
    fh2.write("\nRecall: "+str((recchecklists)/(inspectionstot)))

    print("Accuracy(val): "+str((accchecklistsval)/(inspectionstot)))
    fh2.write("\nAccuracy(val): "+str((accchecklistsval)/(inspectionstot)))
    print("Accuracy: "+str((accchecklists)/(inspectionstot)))
    fh2.write("\nAccuracy: "+str((accchecklists)/(inspectionstot)))
    
    #Additional information
    print("Average number of items per checklist(gt): "+str(controlpointsgtcount/inspectionstot))
    fh2.write("\nAverage number of items per checklist(gt): "+str(controlpointsgtcount/inspectionstot))
    print("Average number of items per checklist: "+str((controlpointscount)/inspectionstot))
    fh2.write("\nAverage number of items per checklist: "+str((controlpointscount)/inspectionstot))
    print("Average number of non-compliant items per checklist(gt): "+str(precisiongt/inspectionstot))
    fh2.write("\nAverage number of non-compliant items per checklist(gt): "+str(precisiongt/inspectionstot))
    print("Average number of non-compliant items per checklist(val): "+str(noncompliance/inspectionstot))
    fh2.write("\nAverage number of non-compliant items per checklist(val): "+str(noncompliance/inspectionstot))
    print("Average number of non-compliant items per checklist: "+str((noncompliance+noncompliancengt)/inspectionstot))
    fh2.write("\nAverage number of non-compliant items per checklist: "+str((noncompliance+noncompliancengt)/inspectionstot))
    #number of common control points between CBR generated and original lists divided by the number of control points in the CBR generated list

    print("Average similarity: "+str(similarity/(lengthprecgt+lengthvalprec)))
    fh2.write("\nAverage similarity: "+str(similarity/(lengthprecgt+lengthvalprec)))
    print("\nTrain time: "+str(traintime))
    fh2.write("\nTrain time: "+str(traintime))


fh2.write("\nAverage Traintime: "+str(traintimetot/8))                                
fh2.write("\nAverage Accuracy: "+str(accctot/8))
fh2.write("\nAverage Prec: "+str(precctot/8))
fh2.write("\nAverage Rec: "+str(recctot/8))
fh2.write("\nAverage Precgt: "+str(preccgttot/8))
fh2.close()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iter

Train time: 185.17639255523682
932


  pd.set_option('display.max_colwidth', -1)


8777
17857
26811
35553
44520
53440
62526
71493
80670
89625
98574
107668
116610
125224
134046
Current avg time:751.0902471542358
Precision (gt): 0.24605117143982905
Precision(val): 0.26846067133809176
Precision: 0.26846067133809176
Recall(val): 0.6675941862034626
Recall: 0.6675941862034626
Accuracy(val): 0.5068443621536176
Accuracy: 0.5068443621536176
Average number of items per checklist(gt): 9.532855436081242
Average number of items per checklist: 14.088012743926722
Average number of non-compliant items per checklist(gt): 0.23321457109788654
Average number of non-compliant items per checklist(val): 3.82370839026668
Average number of non-compliant items per checklist: 3.82370839026668
Average similarity: 0.2616551104480351

Train time: 185.17639255523682


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iter

Train time: 187.58375191688538
951


  pd.set_option('display.max_colwidth', -1)


143329
152596
161776
170461
179679
188588
197987
207111
215916
224743
233932
242954
251941
261191
Current avg time:781.0392172336578
Precision (gt): 0.25006847758579165
Precision(val): 0.27495199833298267
Precision: 0.27495199833298267
Recall(val): 0.6865909770550614
Recall: 0.6865909770550614
Accuracy(val): 0.5087183561620886
Accuracy: 0.5087183561620886
Average number of items per checklist(gt): 9.698455339153794
Average number of items per checklist: 14.143183344526529
Average number of non-compliant items per checklist(gt): 0.23767421724607946
Average number of non-compliant items per checklist(val): 3.941200683008367
Average number of non-compliant items per checklist: 3.941200683008367
Average similarity: 0.25562962919862775

Train time: 187.58375191688538


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iter

Train time: 208.89551377296448
952


  pd.set_option('display.max_colwidth', -1)


277567
286753
295708
305027
313720
322491
331733
340696
349724
358561
367553
376343
385445
394523
Current avg time:805.2878717581431
Precision (gt): 0.2489411624946241
Precision(val): 0.27439096029102006
Precision: 0.27439096029102006
Recall(val): 0.6836328508425119
Recall: 0.6836328508425119
Accuracy(val): 0.5043866347557607
Accuracy: 0.5043866347557607
Average number of items per checklist(gt): 9.708266666666667
Average number of items per checklist: 14.115866666666667
Average number of non-compliant items per checklist(gt): 0.2374898690198714
Average number of non-compliant items per checklist(val): 3.9346856109364694
Average number of non-compliant items per checklist: 3.9346856109364694
Average similarity: 0.25600199659992184

Train time: 208.89551377296448


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iter

Train time: 201.0228989124298
941


  pd.set_option('display.max_colwidth', -1)


412089
421035
429868
439261
448197
457029
466255
475008
483842
492811
501901
511047
519736
528546
537291
Current avg time:817.253004014492
Precision (gt): 0.24661060640918064
Precision(val): 0.2655970051027596
Precision: 0.2655970051027596
Recall(val): 0.6796510715386731
Recall: 0.6796510715386731
Accuracy(val): 0.5084913069784369
Accuracy: 0.5084913069784369
Average number of items per checklist(gt): 9.577878103837472
Average number of items per checklist: 14.113530739609613
Average number of non-compliant items per checklist(gt): 0.23410161004106125
Average number of non-compliant items per checklist(val): 3.7720140995791165
Average number of non-compliant items per checklist: 3.7720140995791165
Average similarity: 0.25688676577906294

Train time: 201.0228989124298


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iter

Train time: 200.4536647796631
930


  pd.set_option('display.max_colwidth', -1)


546686
555453
564468
573587
582473
591410
600242
608966
617736
626661
635610
644692
654037
663122
Current avg time:822.4914518356323
Precision (gt): 0.24503376621510214
Precision(val): 0.27233245215351337
Precision: 0.27233245215351337
Recall(val): 0.6851105146546308
Recall: 0.6851105146546308
Accuracy(val): 0.5072169392223947
Accuracy: 0.5072169392223947
Average number of items per checklist(gt): 9.621066666666668
Average number of items per checklist: 14.1104
Average number of non-compliant items per checklist(gt): 0.23314146076146253
Average number of non-compliant items per checklist(val): 3.88938829190416
Average number of non-compliant items per checklist: 3.88938829190416
Average similarity: 0.2562171542410899

Train time: 200.4536647796631


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iter

Train time: 207.33230757713318
931


  pd.set_option('display.max_colwidth', -1)


680813
690204
699349
708463
717703
726521
735490
744417
753485
762475
771523
780351
789386
798207
Current avg time:827.4251304864883
Precision (gt): 0.24346487969571076
Precision(val): 0.2700770243278543
Precision: 0.2700770243278543
Recall(val): 0.6713935935586727
Recall: 0.6713935935586727
Accuracy(val): 0.5129521990668883
Accuracy: 0.5129521990668883
Average number of items per checklist(gt): 9.5139911634757
Average number of items per checklist: 14.127192395233632
Average number of non-compliant items per checklist(gt): 0.22954608151254455
Average number of non-compliant items per checklist(val): 3.851662351125676
Average number of non-compliant items per checklist: 3.851662351125676
Average similarity: 0.26170531808383546

Train time: 207.33230757713318


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iter

Train time: 206.58186197280884
906


  pd.set_option('display.max_colwidth', -1)


815123
824181
832970
842109
851296
860704
869486
878298
887209
896226
905257
913908
922950
931899
Current avg time:828.4693909032004
Precision (gt): 0.24320442385356597
Precision(val): 0.2654662115282671
Precision: 0.2654662115282671
Recall(val): 0.6698700103335592
Recall: 0.6698700103335592
Accuracy(val): 0.5053627126994931
Accuracy: 0.5053627126994931
Average number of items per checklist(gt): 9.572075672795098
Average number of items per checklist: 14.108180122568612
Average number of non-compliant items per checklist(gt): 0.22985507365003957
Average number of non-compliant items per checklist(val): 3.796588621662148
Average number of non-compliant items per checklist: 3.796588621662148
Average similarity: 0.2604757819586085

Train time: 206.58186197280884


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iter

Train time: 206.0774383544922
955


  pd.set_option('display.max_colwidth', -1)


949531
958733
967780
976761
985896
995124
1003849
1012945
1021966
1030801
1039556
1048662
1057370
1066227
Current avg time:832.5038402378559
Precision (gt): 0.24580134586431077
Precision(val): 0.27071919215597595
Precision: 0.27071919215597595
Recall(val): 0.6845816302659589
Recall: 0.6845816302659589
Accuracy(val): 0.5073213923621522
Accuracy: 0.5073213923621522
Average number of items per checklist(gt): 9.645719611236853
Average number of items per checklist: 14.14165890027959
Average number of non-compliant items per checklist(gt): 0.23388925827349608
Average number of non-compliant items per checklist(val): 3.8815911360246087
Average number of non-compliant items per checklist: 3.8815911360246087
Average similarity: 0.25585517094324456

Train time: 206.0774383544922
