# Compute performance metrics for the given Y and Y_score without sklearn

In [304]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'none'

In [305]:
import numpy as np
import pandas as pd

In [306]:
print (pd.options.display.max_columns)

2855


<pre>
<font color='red'><b>A.</b></font> Compute performance metrics for the given data <strong>5_a.csv</strong>
   <b>Note 1:</b> in this data you can see number of positive points >> number of negatives points
   <b>Note 2:</b> use pandas or numpy to read the data from <b>5_a.csv</b>
   <b>Note 3:</b> you need to derive the class labels from given score</pre> $y^{pred}= \text{[0 if y_score < 0.5 else 1]}$

<pre>
<ol>
<li> Compute Confusion Matrix </li>
<li> Compute F1 Score </li>
<li> Compute AUC Score, you need to compute different thresholds and for each threshold compute tpr,fpr and then use               numpy.trapz(tpr_array, fpr_array) <a href='https://stackoverflow.com/q/53603376/4084039'>https://stackoverflow.com/q/53603376/4084039</a>, <a href='https://stackoverflow.com/a/39678975/4084039'>https://stackoverflow.com/a/39678975/4084039</a> Note: it should be numpy.trapz(tpr_array, fpr_array) not numpy.trapz(fpr_array, tpr_array)</li>
<li> Compute Accuracy Score </li>
</ol>
</pre>

In [307]:
!ls

5_a.csv
5_b.csv
5_c.csv
5_d.csv
5_Performance_metrics_Instructions.ipynb


In [308]:
df = pd.read_csv('5_a.csv')
print(df.shape)
print(df.head())

(10100, 2)
     y     proba
0  1.0  0.637387
1  1.0  0.635165
2  1.0  0.766586
3  1.0  0.724564
4  1.0  0.889199


In [309]:
"""
Given DataFrame column `y' contains float value, here we convert it to int
"""
def convert_to_int(df, col):
    
    df[col] = df[col].map(lambda y: int(y))
    
    return

In [310]:
convert_to_int(df, 'y')

df_top_k_rows = df.head(n = 3)

print(df_top_k_rows)

   y     proba
0  1  0.637387
1  1  0.635165
2  1  0.766586


> 1.1 Derive the class labels from given score

In [311]:
def derive_class_label(df, ref_col_name, new_col_name, thld):
    
    df[new_col_name] = df[ref_col_name].map(lambda prob: 1 if prob > thld else 0)
    
    return

In [312]:
derive_class_label(df, 'proba', 'y_hat', 0.5)
df_top_k_rows = df.head(n = 3)

print(df_top_k_rows)

   y     proba  y_hat
0  1  0.637387      1
1  1  0.635165      1
2  1  0.766586      1


> 1.2 Compute Confusion Matrix

In [313]:
InteractiveShell.ast_node_interactivity = 'none'

In [314]:
"""
confusion_mtx_ref: 
    key: tuple(original output, predicted output)
    
    value: dict({})
        name: element notation ( 'tp', 'fp', 'fn', 'tn')
        mtx_idx: Contains matrix indices, to get confusion matx element as per notation
"""

confusion_mtx_ref = {
    (1, 1): {'name': 'tp', 'mtx_idx': (0, 0)},
    (0, 1): {'name': 'fp', 'mtx_idx': (0, 1)},
    (1, 0): {'name': 'fn', 'mtx_idx': (1, 0)},
    (0, 0): {'name': 'tn', 'mtx_idx': (1, 1)}
}

print(confusion_mtx_ref)

{(1, 1): {'name': 'tp', 'mtx_idx': (0, 0)}, (0, 1): {'name': 'fp', 'mtx_idx': (0, 1)}, (1, 0): {'name': 'fn', 'mtx_idx': (1, 0)}, (0, 0): {'name': 'tn', 'mtx_idx': (1, 1)}}


In [315]:
def get_confusion_mtx_element_ref(confusion_mtx_ref):
    
    cf_mtx_elem_ref = dict()
    for (row, col) in confusion_mtx_ref:

        temp = confusion_mtx_ref.get((row, col))

        """
        """
        conf_mtx_elem_not = temp.get('name')

        """
        """
        conf_mtx_elem_index = temp.get('mtx_idx')

        cf_mtx_elem_ref.update({conf_mtx_elem_not: conf_mtx_elem_index})

    return cf_mtx_elem_ref

cf_mtx_elem_ref = get_confusion_mtx_element_ref(confusion_mtx_ref)
print(cf_mtx_elem_ref)

{'tp': (0, 0), 'fp': (0, 1), 'fn': (1, 0), 'tn': (1, 1)}


In [316]:
cf_mtx_elem_to_idx_mapping = {elem_nota_numeric: confusion_mtx_ref.get(elem_nota_numeric).get('mtx_idx') for elem_nota_numeric in confusion_mtx_ref}
print(cf_mtx_elem_to_idx_mapping)

{(1, 1): (0, 0), (0, 1): (0, 1), (1, 0): (1, 0), (0, 0): (1, 1)}


In [317]:
def get_confusion_mtx(series_i, series_ii, cf_mtx_elem_to_idx_mapping):
    
    confusion_mtx = np.zeros((2, 2), dtype = int)
    
    for index, item_y in enumerate(series_i):

        item_y_hat = series_ii.get(index)

        x, y = cf_mtx_elem_to_idx_mapping.get((item_y, item_y_hat))

        cf_item_count = confusion_mtx[x, y]

        confusion_mtx[x, y] = cf_item_count + 1

    return confusion_mtx

In [318]:
confusion_mtx = get_confusion_mtx(df['y'], df['y_hat'], cf_mtx_elem_to_idx_mapping)
print(confusion_mtx)

[[10000   100]
 [    0     0]]


> 1.3 Compute F1 Score

In [319]:
def compute_confusion_mtx_elements(confusion_mtx, cf_mtx_elem):
    """Compute confusin matrix elements i.e 
        {'tp', 'fp', 'fn', 'tn'}

    Parameters
    ----------
    confusion_mtx: numpy 2-D array
    cf_mtx_elem: dict()

    Return
    ------
    cf_mtx_elem_values: dict
        Contains mapping of confusion matrix element notation to their value (in confusion matrix itself)
        Structure: {
        'tp': 4, 'fp': 8, 'fn':5, 'tn': 7
        }
    """

    cf_mtx_elem_values = dict()
    for cf_mtx_elem_notation in cf_mtx_elem:
        """
        cf_mtx_elem_notation: string
            Confusion matrix element notation 
            Exp: {'tp', 'fp', 'fn', 'tn'}
        """

        x, y = cf_mtx_elem.get(cf_mtx_elem_notation)
        val = confusion_mtx[x, y]

        cf_mtx_elem_values.update({cf_mtx_elem_notation: val})


    return cf_mtx_elem_values

In [320]:
"""
precision = tp / (tp + fp)

recall = tp / (tp + fn)

F1 score = 2 * precision * recall / (precision + recall)
"""
def compute_tpr_fpr(cf_mtx_elem_values):
    
    tp = cf_mtx_elem_values.get('tp')
    fp = cf_mtx_elem_values.get('fp')
    fn = cf_mtx_elem_values.get('fn')
    tn = cf_mtx_elem_values.get('tn')
    
    tpr = tp / (tp + fn)
    
    fpr = fp / (fp + tn)
    
    return tpr, fpr

In [321]:
cf_mtx_elem_values = compute_confusion_mtx_elements(confusion_mtx, cf_mtx_elem_ref)

In [322]:
#def compute_precision_recall_f1(cf_mtx_elem_values):
def compute_auc_terms(cf_mtx_elem_values, auc_terms):
    
    auc_terms_ret = dict()
    auc_term_dict = dict()
    
    tp = cf_mtx_elem_values.get('tp')
    fp = cf_mtx_elem_values.get('fp')
    fn = cf_mtx_elem_values.get('fn')
    tn = cf_mtx_elem_values.get('tn')
    
    precision = tp / (tp + fp)
    
    recall = tp / (tp + fn)
    
    f1_score = 2 * precision * recall / (precision + recall)
    
    tpr = tp / (tp + fn)
    
    fpr = fp / (fp + tn)
    
    auc_term_dict.update({
        'tp': tp,
        'fp': fp,
        'fn': fn,
        'tn': tn,
        'precision': precision,
        'recall': recall,
        'f1_score': f1_score,
        'tpr': tpr,
        'fpr': fpr
    })
    
    auc_terms_ret = {term: auc_term_dict.get(term) for term in auc_terms}
    
    return auc_terms_ret


auc_term_obj = compute_auc_terms(cf_mtx_elem_values, ['precision', 'recall', 'f1_score'])
print(auc_term_obj['precision'])
print(auc_term_obj['recall'])
print(auc_term_obj['f1_score'])

0.9900990099009901
1.0
0.9950248756218906


In [323]:
"""

auc_terms = compute_auc_terms(cf_mtx_elem_values)

print(auc_terms)
"""

> 1.3 AUC score

In [324]:
def get_unique_values(df, col_name, order = 'DESC'):
    
    unique_proba = df[col_name].unique()
    unique_proba.sort()
    
    if order == 'DESC':
        unique_proba = unique_proba[::-1]
        
    return unique_proba

In [325]:
unique_proba = get_unique_values(df, 'proba')

unique_proba_shape = unique_proba.shape

print(type(unique_proba))
print(unique_proba_shape)
print(unique_proba[:5])

<class 'numpy.ndarray'>
(10100,)
[0.89996535 0.89982831 0.89982485 0.89981181 0.89976788]


In [326]:
pd.set_option('display.max_columns', unique_proba_shape[0] + 3)

In [327]:
def compute_auc_prop(df, output_prob_col, unique_proba, auc_props_list):

    auc_props = dict()
    tpr, fpr = list(), list()
    for index, thld in enumerate(unique_proba):

        col_name, confusion_mtx_ii = '', ''
        cf_mtx_elem_values_ii, auc_term_obj = dict(), dict()

        col_name = "thld_{}".format(index)
        df[(col_name, thld)] = df[output_prob_col].map(lambda proba_val: 1 if proba_val > thld else 0)

        confusion_mtx_ii = get_confusion_mtx(df['y'], df[(col_name, thld)], cf_mtx_elem_to_idx_mapping)

        cf_mtx_elem_values_ii = compute_confusion_mtx_elements(confusion_mtx_ii, cf_mtx_elem_ref)

        auc_term_obj = compute_auc_terms(cf_mtx_elem_values_ii, auc_props_list)

        auc_props.update({(col_name, thld): auc_term_obj})

    return auc_props

In [328]:
def get_auc_curve_lists(auc_props, auc_props_list):
    
    prop_i_list, prop_ii_list = list(), list()
    for col_name in auc_props:
        
        prop_i = auc_props.get(col_name).get(auc_props_list[0])
        prop_ii = auc_props.get(col_name).get(auc_props_list[1])
        
        
        prop_i_list.append(prop_i)
        prop_ii_list.append(prop_ii)
        
    return prop_i_list, prop_ii_list

In [329]:
#auc_props = compute_auc_prop(df, 'proba', unique_proba, ['tpr', 'fpr'])

In [330]:
"""
tpr, fpr = get_auc_curve_lists(auc_props, ['tpr', 'fpr'])
print(tpr)
print(fpr)
"""

In [331]:
"""
auc_score = np.trapz(tpr, fpr)
print(auc_score)
"""

<pre>
<font color='red'><b>B.</b></font> Compute performance metrics for the given data <strong>5_b.csv</strong>
   <b>Note 1:</b> in this data you can see number of positive points << number of negatives points
   <b>Note 2:</b> use pandas or numpy to read the data from <b>5_b.csv</b>
   <b>Note 3:</b> you need to derive the class labels from given score</pre> $y^{pred}= \text{[0 if y_score < 0.5 else 1]}$

<pre>
<ol>
<li> Compute Confusion Matrix </li>
<li> Compute F1 Score </li>
<li> Compute AUC Score, you need to compute different thresholds and for each threshold compute tpr,fpr and then use               numpy.trapz(tpr_array, fpr_array) <a href='https://stackoverflow.com/q/53603376/4084039'>https://stackoverflow.com/q/53603376/4084039</a>, <a href='https://stackoverflow.com/a/39678975/4084039'>https://stackoverflow.com/a/39678975/4084039</a></li>
<li> Compute Accuracy Score </li>
</ol>
</pre>

<font color='red'><b>C.</b></font> Compute the best threshold (similarly to ROC curve computation) of probability which gives lowest values of metric <b>A</b> for the given data <strong>5_c.csv</strong>
<br>

you will be predicting label of a data points like this: $y^{pred}= \text{[0 if y_score < threshold  else 1]}$

$ A = 500 \times \text{number of false negative} + 100 \times \text{numebr of false positive}$

<pre>
   <b>Note 1:</b> in this data you can see number of negative points > number of positive points
   <b>Note 2:</b> use pandas or numpy to read the data from <b>5_c.csv</b>
</pre>

In [332]:
# write your code
df_iii = pd.read_csv("5_c.csv")

print(df_iii.shape)
print(df_iii.head())

(2852, 2)
   y      prob
0  0  0.458521
1  0  0.505037
2  0  0.418652
3  0  0.412057
4  0  0.375579


In [333]:
convert_to_int(df_iii, 'y')

In [334]:
unique_proba_iii = get_unique_values(df_iii, 'prob')

print(unique_proba_iii.shape)
print(unique_proba_iii[:5])

(2791,)
[0.9577468  0.95143692 0.94863779 0.94409361 0.94111318]


In [335]:
pd.set_option('display.max_columns', df_iii.shape[0] + 3)

In [336]:
auc_props_iii = compute_auc_prop(df_iii, 'prob', unique_proba_iii, ['fn', 'fp'])

  if sys.path[0] == '':


In [337]:
print(auc_props_iii)

{('thld_0', 0.9577467989277196): {'fn': 1047, 'fp': 0}, ('thld_1', 0.9514369163158778): {'fn': 1046, 'fp': 0}, ('thld_2', 0.9486377939984604): {'fn': 1045, 'fp': 0}, ('thld_3', 0.9440936134070964): {'fn': 1044, 'fp': 0}, ('thld_4', 0.9411131844327256): {'fn': 1043, 'fp': 0}, ('thld_5', 0.9216107669714336): {'fn': 1042, 'fp': 0}, ('thld_6', 0.918113140842399): {'fn': 1041, 'fp': 0}, ('thld_7', 0.9163642710427174): {'fn': 1040, 'fp': 0}, ('thld_8', 0.9133748371858854): {'fn': 1039, 'fp': 0}, ('thld_9', 0.9132795051948444): {'fn': 1038, 'fp': 0}, ('thld_10', 0.9124637897554282): {'fn': 1037, 'fp': 0}, ('thld_11', 0.9106220246569532): {'fn': 1036, 'fp': 0}, ('thld_12', 0.9093609705970094): {'fn': 1035, 'fp': 0}, ('thld_13', 0.9080672803762796): {'fn': 1034, 'fp': 0}, ('thld_14', 0.9076528961996776): {'fn': 1032, 'fp': 0}, ('thld_15', 0.9015598740126636): {'fn': 1031, 'fp': 0}, ('thld_16', 0.9001968241221197): {'fn': 1030, 'fp': 0}, ('thld_17', 0.8999374043699857): {'fn': 1029, 'fp': 0}, ('

In [230]:
#print(df_iii.shape)
#('thld_0', 0.9577467989277196)

(2852, 2793)


In [233]:
#print(df_iii.iloc[:,0])

0       0
1       0
2       0
3       0
4       0
5       0
6       0
7       0
8       0
9       0
10      0
11      0
12      0
13      0
14      0
15      0
16      0
17      0
18      0
19      0
20      0
21      0
22      0
23      0
24      0
25      0
26      0
27      0
28      0
29      0
       ..
2822    1
2823    1
2824    1
2825    1
2826    1
2827    1
2828    1
2829    1
2830    1
2831    1
2832    1
2833    1
2834    1
2835    1
2836    1
2837    1
2838    1
2839    1
2840    1
2841    1
2842    1
2843    1
2844    1
2845    1
2846    1
2847    1
2848    1
2849    1
2850    1
2851    1
Name: y, Length: 2852, dtype: int64


In [235]:
#print(df_iii[('thld_0', 0.9577467989277196)])
#a_iii = get_confusion_mtx(df_iii['y'], df_iii[('thld_0', 0.9577467989277196)], cf_mtx_elem_to_idx_mapping)
#print(a_iii)

0       0
1       0
2       0
3       0
4       0
5       0
6       0
7       0
8       0
9       0
10      0
11      0
12      0
13      0
14      0
15      0
16      0
17      0
18      0
19      0
20      0
21      0
22      0
23      0
24      0
25      0
26      0
27      0
28      0
29      0
       ..
2822    0
2823    0
2824    0
2825    0
2826    0
2827    0
2828    0
2829    0
2830    0
2831    0
2832    0
2833    0
2834    0
2835    0
2836    0
2837    0
2838    0
2839    0
2840    0
2841    0
2842    0
2843    0
2844    0
2845    0
2846    0
2847    0
2848    0
2849    0
2850    0
2851    0
Name: (thld_0, 0.9577467989277196), Length: 2852, dtype: int64
[[   0    0]
 [1047 1805]]


In [236]:
#answer = compute_confusion_mtx_elements(a_iii, cf_mtx_elem_ref)
#print(answer)

{'tp': 0, 'fp': 0, 'fn': 1047, 'tn': 1805}


In [344]:
print(auc_props_iii)

{('thld_0', 0.9577467989277196): {'fn': 1047, 'fp': 0}, ('thld_1', 0.9514369163158778): {'fn': 1046, 'fp': 0}, ('thld_2', 0.9486377939984604): {'fn': 1045, 'fp': 0}, ('thld_3', 0.9440936134070964): {'fn': 1044, 'fp': 0}, ('thld_4', 0.9411131844327256): {'fn': 1043, 'fp': 0}, ('thld_5', 0.9216107669714336): {'fn': 1042, 'fp': 0}, ('thld_6', 0.918113140842399): {'fn': 1041, 'fp': 0}, ('thld_7', 0.9163642710427174): {'fn': 1040, 'fp': 0}, ('thld_8', 0.9133748371858854): {'fn': 1039, 'fp': 0}, ('thld_9', 0.9132795051948444): {'fn': 1038, 'fp': 0}, ('thld_10', 0.9124637897554282): {'fn': 1037, 'fp': 0}, ('thld_11', 0.9106220246569532): {'fn': 1036, 'fp': 0}, ('thld_12', 0.9093609705970094): {'fn': 1035, 'fp': 0}, ('thld_13', 0.9080672803762796): {'fn': 1034, 'fp': 0}, ('thld_14', 0.9076528961996776): {'fn': 1032, 'fp': 0}, ('thld_15', 0.9015598740126636): {'fn': 1031, 'fp': 0}, ('thld_16', 0.9001968241221197): {'fn': 1030, 'fp': 0}, ('thld_17', 0.8999374043699857): {'fn': 1029, 'fp': 0}, ('

In [364]:
def get_lowest_mtx_param(auc_props):
    
    mtx_score = dict()
    for thld_col in auc_props:

        fn = auc_props.get(thld_col).get('fn')
        fp = auc_props.get(thld_col).get('fp')
        
        score = 500 * fn + 100 * fp
        
        mtx_score.update({thld_col: score})
    
    return mtx_score

In [365]:
mtx_score = get_lowest_mtx_param(auc_props_iii)

In [366]:
key_min = min(mtx_score.keys(), key=(lambda k: mtx_score[k]))
print(mtx_score[key_min])

141000


In [342]:
 # write your code

<pre>
<font color='red'><b>D.</b></font> Compute performance metrics(for regression) for the given data <strong>5_d.csv</strong>
    <b>Note 2:</b> use pandas or numpy to read the data from <b>5_d.csv</b>
    <b>Note 1:</b> <b>5_d.csv</b> will having two columns Y and predicted_Y both are real valued features
<ol>
<li> Compute Mean Square Error </li>
<li> Compute MAPE: https://www.youtube.com/watch?v=ly6ztgIkUxk</li>
<li> Compute R^2 error: https://en.wikipedia.org/wiki/Coefficient_of_determination#Definitions </li>
</ol>
</pre>