# Validation - AUC

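Average use-case size (AUC) is the simplest prediction model: every use case is predicted to have the mean size (`Cfp`) of the use cases in the training folds.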

## Imports

In [2]:
import pandas as pd

from paths import input_folder, output_folder

import numpy as np
from numpy.random import seed as np_seed

from sklearn.dummy import DummyRegressor

## Load data

In [2]:
# Read the use-case table; the first CSV column is used as the row index.
use_cases_path = f"{input_folder}use-cases.csv"
use_cases_df = pd.read_csv(use_cases_path, index_col=0)
use_cases_df.head(5)

Unnamed: 0,ProjectID,UC,TransTypes,UCType,Cfp,TitleTokens
0,P01,UC2-1-1,C|D|R|U,C|D|R|U,16,manage faculties crud
1,P01,UC2-1-10,DL|L|R,L,27,assign science olympiads major specialty edit ...
2,P01,UC2-1-11,CS|R,CS,7,manage ranking algorithms
3,P01,UC2-1-13,C|D|R|U,C|D|R|U,17,manage exams crud
4,P01,UC2-1-14,DL|L|R,L,27,manage assignments exams majors specialties


## Validation framework

In [3]:
# Seed kept for reproducibility of stochastic steps.
# NOTE(review): no cell visible in this notebook passes it to a random number
# generator (np_seed is imported but never called here) -- confirm it is needed.
random_seed = 100239

In [3]:
# Fold definitions: one row per (run, k) pair, followed by the row positions
# of the use cases that belong to that fold.
train_ids_path = f"{input_folder}10-fold-train-full.csv"
val_ids_path = f"{input_folder}10-fold-val-full.csv"

train_ids = pd.read_csv(train_ids_path, index_col=0)
val_ids = pd.read_csv(val_ids_path, index_col=0)

In [5]:
val_ids

Unnamed: 0,run,k,0,1,2,3,4,5,6,7,...,34,35,36,37,38,39,40,41,42,43
0,0,0,4,13,14,15,17,23,52,54,...,322,359,368,369,372,380,386,418,422,425.0
1,0,1,18,27,46,56,62,76,83,86,...,334,338,346,358,360,366,378,393,398,400.0
2,0,2,12,35,41,42,44,49,53,74,...,345,354,363,397,405,410,411,412,424,431.0
3,0,3,1,5,43,71,72,79,80,95,...,365,371,381,420,427,428,429,430,434,436.0
4,0,4,6,22,24,28,31,34,113,141,...,351,379,396,401,408,414,419,421,432,435.0
5,0,5,3,10,38,45,50,58,73,75,...,353,361,373,374,388,394,403,404,415,416.0
6,0,6,0,7,16,29,60,97,99,115,...,302,328,341,342,364,370,390,402,409,423.0
7,0,7,11,19,21,26,33,36,37,40,...,356,375,377,382,383,389,395,413,433,
8,0,8,20,25,30,32,39,69,70,81,...,355,362,367,384,387,392,399,417,426,
9,0,9,2,8,9,55,63,66,92,93,...,323,327,332,337,376,385,391,406,407,


In [5]:
# Run and fold labels are zero-based, so each count is the largest label plus one.
runs = train_ids["run"].max() + 1
k = train_ids["k"].max() + 1

In [6]:
def _fold_positions(fold_ids, run, ki):
    """Return the integer row positions stored for fold ``ki`` of run ``run``."""
    # The first two columns are expected to be 'run' and 'k'; everything after
    # them holds the row positions of the fold members.
    fold_row = fold_ids[(fold_ids['run'] == run) & (fold_ids['k'] == ki)].iloc[0, 2:]
    # Shorter folds are padded with NaN, which turns the surviving values into
    # floats; cast them back to integers before they are used with .iloc.
    return fold_row.dropna().astype(int).tolist()


def _summarise_absolute_residuals(abs_residuals):
    """Build the MAR / MdAR summary dictionary for a series of absolute residuals."""
    spread = np.std(abs_residuals)
    return dict(MAR=np.mean(abs_residuals),
                MAR_SD=spread,
                MdAR=np.median(abs_residuals),
                # NOTE(review): MdAR_SD has always been the same standard
                # deviation as MAR_SD; kept unchanged so reported numbers and
                # the returned keys stay backward-compatible.
                MdAR_SD=spread)


def validate_model_per_use_case(model, use_cases_df, X_data, y_data, runs, k, train_ids, val_ids):
    """Run repeated k-fold validation and collect one prediction per validated use case.

    For every run and fold the model is re-fitted on the training rows and
    used to predict the validation rows. Rows are selected by position, so
    ``use_cases_df``, ``X_data`` and ``y_data`` must be positionally aligned.

    Parameters
    ----------
    model : object
        Estimator exposing ``fit(X, y)`` and ``predict(X)``; the same instance
        is re-fitted for every fold.
    use_cases_df : pandas.DataFrame
        Use-case table providing the ``ProjectID`` and ``UC`` columns copied
        into the results.
    X_data : pandas.DataFrame
        Feature table, indexed by position with ``.iloc``.
    y_data : pandas.Series
        Target values, indexed by position with ``.iloc``.
    runs : int
        Number of repetitions (run labels ``0 .. runs-1``).
    k : int
        Number of folds per run (fold labels ``0 .. k-1``).
    train_ids, val_ids : pandas.DataFrame
        Fold definitions with ``run`` and ``k`` as the first two columns,
        followed by NaN-padded row positions.

    Returns
    -------
    tuple of (pandas.DataFrame, dict)
        The per-use-case results (``run``, ``k``, ``ProjectID``, ``UC``,
        ``y_true``, ``y_pred``, absolute residual ``ar`` and relative error
        ``re``) for all runs, and a dictionary with ``MAR``, ``MAR_SD``,
        ``MdAR`` and ``MdAR_SD`` computed over all of them.

    Raises
    ------
    ValueError
        If ``runs`` is smaller than one, because there is nothing to aggregate.
    """
    if runs < 1:
        raise ValueError("runs must be at least 1")

    print("Starting validation...")
    per_run_frames = []

    for run in range(runs):
        print(f"Starting run {run+1}...")
        run_results = dict(run=[], k=[], ProjectID=[], UC=[], y_true=[], y_pred=[])
        for ki in range(k):
            print(f"Staring fold {ki+1} (run={run+1})...")
            print("Preparing training data...")
            train_index = _fold_positions(train_ids, run, ki)
            X_train = X_data.iloc[train_index, :]
            y_train = y_data.iloc[train_index]
            print(f'Training dataset shape: {X_train.shape}')

            print('Fitting the model...')
            model.fit(X_train, y_train)

            val_index = _fold_positions(val_ids, run, ki)
            X_val = X_data.iloc[val_index, :]
            y_true = y_data.iloc[val_index]
            pred_use_cases = use_cases_df.iloc[val_index, :].loc[:, ['ProjectID', "UC"]]
            print(f'Validation dataset shape: {X_val.shape}')

            print("Predicting size...")
            y_pred = model.predict(X_val)

            run_results['run'] += [run] * len(val_index)
            run_results['k'] += [ki] * len(val_index)
            run_results['ProjectID'] += pred_use_cases['ProjectID'].tolist()
            run_results['UC'] += pred_use_cases['UC'].tolist()
            run_results['y_true'] += y_true.tolist()
            run_results['y_pred'] += y_pred.tolist()

        run_results_df = pd.DataFrame(run_results)
        # Vectorised residuals: same values as a row-wise apply, without the
        # per-row Python overhead.
        abs_residuals = (run_results_df['y_true'] - run_results_df['y_pred']).abs()
        run_results_df['ar'] = abs_residuals
        run_results_df['re'] = abs_residuals / run_results_df['y_true']
        print("Run results...")
        print(_summarise_absolute_residuals(run_results_df['ar']))

        per_run_frames.append(run_results_df)

    # Concatenate once instead of growing the frame inside the loop; the
    # per-run row labels are kept, exactly as before.
    results_df = pd.concat(per_run_frames, axis=0)
    results_agg = _summarise_absolute_residuals(results_df['ar'])

    return results_df, results_agg
             

## AUC - Average Use-case size

In [7]:
# Placeholder features: a two-column frame of zeros with one row per use case.
# The 'mean' strategy predicts from the training targets only.
n_use_cases = use_cases_df.shape[0]
dummy_series = pd.DataFrame(np.zeros((n_use_cases, 2)))

model = DummyRegressor(strategy='mean')
auc_res, auc_agg = validate_model_per_use_case(
    model,
    use_cases_df,
    X_data=dummy_series,
    y_data=use_cases_df['Cfp'],
    runs=runs,
    k=k,
    train_ids=train_ids,
    val_ids=val_ids,
)

auc_agg

Starting validation...
Starting run 1...
Staring fold 1 (run=1)...
Preparing training data...
Training dataset shape: (393, 2)
Fitting the model...
Validation dataset shape: (44, 2)
Predicting size...
Staring fold 2 (run=1)...
Preparing training data...
Training dataset shape: (393, 2)
Fitting the model...
Validation dataset shape: (44, 2)
Predicting size...
Staring fold 3 (run=1)...
Preparing training data...
Training dataset shape: (393, 2)
Fitting the model...
Validation dataset shape: (44, 2)
Predicting size...
Staring fold 4 (run=1)...
Preparing training data...
Training dataset shape: (393, 2)
Fitting the model...
Validation dataset shape: (44, 2)
Predicting size...
Staring fold 5 (run=1)...
Preparing training data...
Training dataset shape: (393, 2)
Fitting the model...
Validation dataset shape: (44, 2)
Predicting size...
Staring fold 6 (run=1)...
Preparing training data...
Training dataset shape: (393, 2)
Fitting the model...
Validation dataset shape: (44, 2)
Predicting size...

Starting run 6...
Staring fold 1 (run=6)...
Preparing training data...
Training dataset shape: (393, 2)
Fitting the model...
Validation dataset shape: (44, 2)
Predicting size...
Staring fold 2 (run=6)...
Preparing training data...
Training dataset shape: (393, 2)
Fitting the model...
Validation dataset shape: (44, 2)
Predicting size...
Staring fold 3 (run=6)...
Preparing training data...
Training dataset shape: (393, 2)
Fitting the model...
Validation dataset shape: (44, 2)
Predicting size...
Staring fold 4 (run=6)...
Preparing training data...
Training dataset shape: (393, 2)
Fitting the model...
Validation dataset shape: (44, 2)
Predicting size...
Staring fold 5 (run=6)...
Preparing training data...
Training dataset shape: (393, 2)
Fitting the model...
Validation dataset shape: (44, 2)
Predicting size...
Staring fold 6 (run=6)...
Preparing training data...
Training dataset shape: (393, 2)
Fitting the model...
Validation dataset shape: (44, 2)
Predicting size...
Staring fold 7 (run=6)

{'MAR': 4.732388432741227,
 'MAR_SD': 4.147566186992142,
 'MdAR': 3.793893129770993,
 'MdAR_SD': 4.147566186992142}

In [8]:
auc_agg

{'MAR': 4.732388432741227,
 'MAR_SD': 4.147566186992142,
 'MdAR': 3.793893129770993,
 'MdAR_SD': 4.147566186992142}

In [14]:
# MARP0 is read from a workbook in the output folder.
# NOTE(review): no cell in this notebook writes that file -- presumably another
# notebook has to be run first; confirm.
marp0_table = pd.read_excel(f'{output_folder}validation-marp0.xlsx')
marp0 = marp0_table['MARP0'].tolist()[0]

# Share of the MARP0 error that the AUC model removes, as a percentage.
mar_to_marp0 = auc_agg['MAR'] / marp0
SA_auc = (1 - mar_to_marp0) * 100
f"SA={SA_auc:.2f}%"

'SA=25.64%'

In [18]:
# Persist the per-use-case validation results.
auc_results_path = f'{output_folder}validation-auc.xlsx'
auc_res.to_excel(auc_results_path)