# AppliedML Initial Project solution file reader

This notebook is used for reading solutions to the initial project and checking that they are valid.

Note: It will only print the first 5 error messages of each check.

We start by defining the folder holding the solutions.

In [10]:
# Folder holding the solution CSV files; a relative path, so it is resolved
# against the notebook's current working directory.
directory = 'solutions'

Then we read all files in the folder whose names match the expected format (`TypeOfProblemSolved_FirstnameLastname_SolutionName(_VariableList).csv`) and collect the prediction/VariableList pairs.

In [None]:
import os

def init_entry():
    """Return a fresh per-student record: one empty dict for each project part.

    The keys are inserted in the order Classification, Regression, Clustering,
    and every call returns new, independent dictionaries.
    """
    return {part: {} for part in ('Classification', 'Regression', 'Clustering')}

def read_filenames(directory):
    """Index the solution files found in ``directory``.

    Filenames are expected to look like
    ``TypeOfProblemSolved_FirstnameLastname_SolutionName(_VariableList).csv``.
    Entries that are not regular files ending in ``.csv`` (case-insensitive)
    are skipped with a printed notice. Files whose names do not match the
    expected format are reported through ``write_error`` and left out.

    Parameters
    ----------
    directory : str
        Path of the folder to scan (not recursive).

    Returns
    -------
    dict
        ``{student_name: {project_part: {solution_name: {'preds': path, 'vars': path}}}}``
        where ``project_part`` is one of Classification / Regression /
        Clustering, and ``'preds'`` / ``'vars'`` are only present when the
        corresponding file was found.
    """
    tmp = {}
    # os.listdir returns entries in arbitrary order; sort so that the printed
    # structure and the (capped) error messages are reproducible.
    for filename in sorted(os.listdir(directory)):
        full_path = os.path.join(directory, filename)
        if not os.path.isfile(full_path) or not filename.lower().endswith('.csv'):
            print(f"Skipping {filename} as it is not a CSV file.")
            continue
        splitted = filename.split('_')
        if len(splitted) != 3 and len(splitted) != 4:
            write_error(f"Filename '{filename}' does not match expected format \n 'TypeOfProblemSolved_FirstnameLastname_SolutionName(_VariableList).csv! \n Your solution includes {len(splitted)} parts instead of 3 or 4. \n Please check the filename and try again.")
            continue

        is_varlist = splitted[-1].lower() == 'variablelist.csv'
        if len(splitted) == 4 and not is_varlist:
            write_error(f" Your filename '{filename}' includes 4 parts, but the last part is not VariableList.csv. \n Please check the filename and try again.")
            # Do not register a malformed name as a prediction file.
            continue

        project_part = splitted[0]
        student_name = splitted[1]
        # The extension check above is case-insensitive, so strip the last four
        # characters instead of splitting on a lowercase '.csv'; otherwise
        # 'Name.CSV' would never pair up with 'Name_VariableList.csv'.
        implementation = splitted[-2] if is_varlist else splitted[-1][:-len('.csv')]

        if project_part not in ['Classification', 'Regression', 'Clustering']:
            write_error(f"Filename '{filename}' does not match expected format \n 'TypeOfProblemSolved_FirstnameLastname_SolutionName(_VariableList).csv! \n Your solution includes {project_part} instead of Classification, Regression or Clustering. \n Please check the filename and try again.")
            continue

        if student_name not in tmp:
            tmp[student_name] = init_entry()

        # Create the per-solution record on first sight, then file the path
        # under 'vars' (variable list) or 'preds' (predictions).
        files = tmp[student_name][project_part].setdefault(implementation, {})
        files['vars' if is_varlist else 'preds'] = full_path

    return tmp

# Running total over all checks, and the counter for the check currently running.
all_errors = 0
errors = 0
def write_error(msg, cap=5):
    """Print ``msg``, count it in the global ``errors``, and stop at ``cap`` errors.

    Parameters
    ----------
    msg : str
        The message to show.
    cap : int, optional
        Number of errors after which the current check is aborted (default 5).

    Raises
    ------
    SystemExit
        With exit code 1 once ``errors`` has reached ``cap``.
    """
    global errors
    if errors < cap:
        print (msg)
    errors += 1
    if errors >= cap:
        print("Too many errors, stopping further checks.")
        # `exit` is an interactive-shell helper that is not guaranteed to exist
        # (and is a different object under IPython); raising SystemExit is what
        # the plain-Python helper does and works the same everywhere.
        raise SystemExit(1)

# Index every solution file in `directory`; the cells below iterate over `names`.
names = read_filenames(directory)

Then we can print the structure:

In [12]:
# Fold the error count of the previous check into the running total and start
# counting afresh for this check.
all_errors += errors
errors = 0

# Print the student -> part -> solution tree and flag anything incomplete.
for name, parts in names.items():
    print (f'{name}:')
    for part, implementations in parts.items():
        print (f'    {part}:')
        if len(implementations) == 0:
            write_error(f'        {part} does not have any files')
        else:
            for implementation, files in implementations.items():
                # A solution needs BOTH files. An entry is only created when one
                # of them is found, so "both missing" can never happen; testing
                # with `and` let incomplete pairs through to a KeyError below.
                if ('vars' not in files) or ('preds' not in files):
                    write_error(f"            {implementation} should have at least a 'VariableList' file and a prediction file")
                else:
                    print (f'        {implementation}:')
                    print(files.keys())
                    print (f'            preds: {files["preds"]}')
                    print (f'             vars:  {files["vars"]}')

if errors == 0:
    print ('Files read succesfully')
else:
    print (f'Reading files gave {errors} errors')

JohannNikolaides:
    Classification:
        LightGBM:
dict_keys(['vars', 'preds'])
            preds: solutions/Classification_JohannNikolaides_LightGBM.csv
             vars:  solutions/Classification_JohannNikolaides_LightGBM_VariableList.csv
    Regression:
        LightGBM:
dict_keys(['vars', 'preds'])
            preds: solutions/Regression_JohannNikolaides_LightGBM.csv
             vars:  solutions/Regression_JohannNikolaides_LightGBM_VariableList.csv
    Clustering:
        HDBSCAN:
dict_keys(['vars', 'preds'])
            preds: solutions/Clustering_JohannNikolaides_HDBSCAN.csv
             vars:  solutions/Clustering_JohannNikolaides_HDBSCAN_VariableList.csv
Files read succesfully


Then we verify the VariableList files

In [13]:
# Variable names accepted in a Clustering VariableList file.
clustering_vars = [
    'Unnamed: 0', 'J', 'H', 'K', 'C_FE', 'N_FE', 'O_FE', 'NA_FE', 'MG_FE', 'AL_FE', 'SI_FE', 'CA_FE', 'S_FE', 'V_FE', 'CR_FE', 'FE_H', 'CO_FE', 'NI_FE', 'E', 'Energy', 'Lz'
    ]

# Variable names accepted in a Classification / Regression VariableList file.
# One name per string literal: a literal must never be wrapped across lines,
# otherwise the name is broken (or the cell does not parse at all).
class_reg_vars = [
    "averageInteractionsPerCrossing", "p_Rhad1", "p_Rhad", "p_f3", "p_weta2",
    "p_Rphi", "p_Reta", "p_Eratio", "p_f1", "p_TRTPID",
    "p_numberOfInnermostPixelHits", "p_numberOfPixelHits", "p_numberOfSCTHits", "p_numberOfTRTHits", "p_TRTTrackOccupancy",
    "p_numberOfTRTXenonHits", "p_z0", "p_d0", "p_sigmad0", "p_dPOverP",
    "p_deltaEta1", "p_deltaPhiRescaled2", "p_etcone20", "p_etcone30", "p_etcone40",
    "p_ptcone20", "p_ptcone30", "p_ptcone40", "p_ptPU30", "p_vertex",
    "pX_E7x7_Lr2", "pX_E7x7_Lr3", "pX_E_Lr0_HiG", "pX_E_Lr0_MedG", "pX_E_Lr1_HiG",
    "pX_E_Lr1_LowG", "pX_E_Lr1_MedG", "pX_E_Lr2_HiG", "pX_E_Lr2_LowG", "pX_E_Lr2_MedG",
    "pX_E_Lr3_HiG", "pX_E_Lr3_MedG", "pX_MultiLepton", "pX_OQ", "pX_ambiguityType",
    "pX_asy1", "pX_author", "pX_barys1", "pX_core57cellsEnergyCorrection", "pX_deltaEta0",
    "pX_deltaEta1", "pX_deltaEta2", "pX_deltaEta3", "pX_deltaPhi0", "pX_deltaPhi1",
    "pX_deltaPhi2", "pX_deltaPhi3", "pX_deltaPhiFromLastMeasurement", "pX_deltaPhiRescaled0", "pX_deltaPhiRescaled1",
    "pX_deltaPhiRescaled3", "pX_e1152", "pX_e132", "pX_e235", "pX_e255",
    "pX_e2ts1", "pX_ecore", "pX_emins1", "pX_etcone20", "pX_etcone30",
    "pX_etcone40", "pX_f1core", "pX_f3core", "pX_maxEcell_energy", "pX_maxEcell_gain",
    "pX_maxEcell_time", "pX_maxEcell_x", "pX_maxEcell_y", "pX_maxEcell_z", "pX_nCells_Lr0_HiG",
    "pX_nCells_Lr0_MedG", "pX_nCells_Lr1_HiG", "pX_nCells_Lr1_LowG", "pX_nCells_Lr1_MedG", "pX_nCells_Lr2_HiG",
    "pX_nCells_Lr2_LowG", "pX_nCells_Lr2_MedG", "pX_nCells_Lr3_HiG", "pX_nCells_Lr3_MedG", "pX_neflowisol20",
    "pX_neflowisol30", "pX_neflowisol40", "pX_neflowisolcoreConeEnergyCorrection", "pX_pos", "pX_pos7",
    "pX_poscs1", "pX_poscs2", "pX_ptcone20", "pX_ptcone30", "pX_ptcone40",
    "pX_ptconecoreTrackPtrCorrection", "pX_ptvarcone20", "pX_ptvarcone30", "pX_ptvarcone40", "pX_r33over37allcalo",
    "pX_topoetcone20", "pX_topoetcone20ptCorrection", "pX_topoetcone30", "pX_topoetcone30ptCorrection", "pX_topoetcone40",
    "pX_topoetcone40ptCorrection", "pX_topoetconecoreConeEnergyCorrection", "pX_weta1", "pX_widths1", "pX_wtots1",
    "pX_e233", "pX_e237", "pX_e2tsts1", "pX_ehad1", "pX_emaxs1",
    "pX_fracs1", "pX_DeltaE", "pX_E3x5_Lr0", "pX_E3x5_Lr1", "pX_E3x5_Lr2",
    "pX_E3x5_Lr3", "pX_E5x7_Lr0", "pX_E5x7_Lr1", "pX_E5x7_Lr2", "pX_E5x7_Lr3",
    "pX_E7x11_Lr0", "pX_E7x11_Lr1", "pX_E7x11_Lr2", "pX_E7x11_Lr3", "pX_E7x7_Lr0",
    "pX_E7x7_Lr1", "p_pt_track", "p_eta", "p_phi", "p_charge",
]

# Every name that may appear in any VariableList file.
all_variables = class_reg_vars + clustering_vars

# Upper limit on the number of listed variables, per project part.
max_variables = {
    'Classification': 25,
    'Regression': 12,
    'Clustering':  7,
}

# Fold the error count of the previous check into the running total and start
# counting afresh for this check.
all_errors += errors
errors = 0

# Set for constant-time membership tests.
known_variables = set(all_variables)

for student_name, parts in names.items():
    for part, implementations in parts.items():
        for implementation, files in implementations.items():
            # A solution may have been submitted without its VariableList file;
            # report that instead of crashing with a KeyError.
            file = files.get('vars')
            if file is None:
                write_error(f"No VariableList file found for {part} solution '{implementation}' ({student_name})")
                continue
            count = 0
            with open(file, 'r') as f:
                for line in f:
                    var_name = line.rstrip()
                    if var_name == "":
                        print(f" Skipping empty line in {part} file: {file}")
                        continue
                    # Tolerate a single trailing comma after the variable name.
                    if var_name.endswith(","):
                        var_name = var_name[:-1]
                    if var_name not in known_variables:
                        write_error(f"Variable '{var_name}' not in the given variable list. Check file: {file}")
                    else:
                        # Only recognised names count towards the limit.
                        count += 1
            if count > max_variables[part]:
                write_error(f'Used too many variables ({count}/{max_variables[part]}) for {part}: {file}')

if errors == 0:
    print ('Variables parsed without error')
else:
    print (f'Variables had {errors} errors')

 Skipping empty line in Clustering file: solutions/Clustering_JohannNikolaides_HDBSCAN_VariableList.csv
Variables parsed without error


Then we can verify the solution files

In [None]:
# Number of lines each prediction file must contain, per project part.
test_entries_class = 60000
test_entries_regre = 40000
test_entries_clust = 5950

expected_entries = {
    'Classification': test_entries_class,
    'Regression': test_entries_regre,
    'Clustering': test_entries_clust,
}

# Inclusive (min, max) range a prediction value must lie in, per project part.
prediction_range = {
    'Classification': (0.0, 1.0),
    'Regression': (-float('inf'), float('inf')),
    'Clustering': (-float('inf'), float('inf')),
}

# Fold the error count of the previous check into the running total and start
# counting afresh for this check.
all_errors += errors
errors = 0
for student_name, parts in names.items():
    for part, implementations in parts.items():
        for implementation, files in implementations.items():
            # A VariableList may have been submitted without its prediction
            # file; report that instead of crashing with a KeyError.
            file = files.get('preds')
            if file is None:
                write_error(f"No prediction file found for {part} solution '{implementation}' ({student_name})")
                continue
            with open(file, 'r') as f:
                lines = [line for line in f]
            for i, raw_line in enumerate(lines):
                # A line is either "index,value" or just "value".
                if ',' in raw_line:
                    try:
                        index, value = raw_line.lstrip().rstrip().split(',')
                    except ValueError:
                        write_error(f"Expected an index and a value but got more at line {i+1}: {raw_line} in {part} file: {file}")
                        continue
                    try:
                        # Indices are expected to equal the 0-based line position.
                        if int(index) != i:
                            write_error(f'Index at line {i+1} does not have correct index: {index}. Check {part} file: {file}')
                    except ValueError:
                        write_error(f'Unable to cast the index to an integer: {index} in {file}')
                else:
                    value = raw_line.lstrip().rstrip()

                try:
                    value = float(value)
                except ValueError:
                    write_error(f"Value at line {i+1} is not a valid floating point number: {value} in {part} file: {file}")
                    continue

                if part == 'Clustering':
                    # Clustering values must be whole numbers.
                    if value.is_integer():
                        value = int(value)
                    else:
                        write_error(f'Clustering value at {i} is not an integer: {value} in {file}')
                        continue
                mi, ma = prediction_range[part]
                # Written as "not (in range)" so that NaN, which fails every
                # comparison, is reported as well.
                if not (value >= mi and value <= ma):
                    write_error(f'Value at {i} is not in the permitted range of ({mi},{ma}): {value} in {part} {file}')
            if len(lines) != expected_entries[part]:
                write_error(f'Not correct number of predictions for {part.lower()}. Got {len(lines)}, expected {expected_entries[part]}')

if errors == 0:
    print ('Solutions parsed without error')
else:
    print (f'Solutions had {errors} errors')


Solutions parsed without error


Finally, we check if all of the steps completed without error:

In [23]:
# The last check leaves its count in `errors`; fold it into the running total
# before judging the whole submission, otherwise errors found in the prediction
# files would be silently ignored here. Resetting `errors` keeps this cell safe
# to re-run.
all_errors += errors
errors = 0

if all_errors == 0:
    print ('All parts of this submission had no errors')
else:
    print (f'This submission had {all_errors} errors')

All parts of this submission had no errors


You are done now! If you want some fireworks to celebrate, run the cell below; this is entirely optional.

In [None]:
from IPython.display import HTML, display

# Celebrate only a clean run; otherwise repeat the error summary.
if all_errors != 0:
    print(f'This submission had {all_errors} errors')
else:
    # Rendered as rich HTML output: a green headline above a fireworks GIF.
    display(HTML("""
    <div style="text-align: center;">
        <h1 style="color: green;">🎉 All parts of this submission had no errors! 🎉</h1>
        <img src="https://media.giphy.com/media/peAFQfg7Ol6IE/giphy.gif" alt="Fireworks" style="width: 50%; height: auto;">
    </div>
    """))