# Compute performance metrics for the given Y and Y_score without sklearn

In [1]:
from IPython.core.interactiveshell import InteractiveShell
# 'none' tells IPython not to auto-display the value of any bare expression,
# so cells in this notebook show results only through explicit print() calls.
InteractiveShell.ast_node_interactivity = 'none'

In [2]:
import numpy as np
import pandas as pd

In [3]:
# Show how many columns pandas will currently render before truncating.
print(pd.get_option('display.max_columns'))

20


<pre>
<font color='red'><b>A.</b></font> Compute performance metrics for the given data <strong>5_a.csv</strong>
   <b>Note 1:</b> in this data you can see number of positive points >> number of negative points
   <b>Note 2:</b> use pandas or numpy to read the data from <b>5_a.csv</b>
   <b>Note 3:</b> you need to derive the class labels from given score</pre> $y^{pred}= \text{[0 if y_score < 0.5 else 1]}$

<pre>
<ol>
<li> Compute Confusion Matrix </li>
<li> Compute F1 Score </li>
<li> Compute AUC Score, you need to compute different thresholds and for each threshold compute tpr,fpr and then use               numpy.trapz(tpr_array, fpr_array) <a href='https://stackoverflow.com/q/53603376/4084039'>https://stackoverflow.com/q/53603376/4084039</a>, <a href='https://stackoverflow.com/a/39678975/4084039'>https://stackoverflow.com/a/39678975/4084039</a> Note: it should be numpy.trapz(tpr_array, fpr_array) not numpy.trapz(fpr_array, tpr_array)</li>
<li> Compute Accuracy Score </li>
</ol>
</pre>

In [4]:
# Shell escape: list the working directory to check which data files are present.
!ls

5_a.csv
5_b.csv
5_c.csv
5_d.csv
5_Performance_metrics_Instructions.ipynb


In [5]:
# Load dataset A, then show its (rows, columns) shape followed by the first five rows.
df = pd.read_csv('5_a.csv')
print(df.shape, df.head(), sep='\n')

(10100, 2)
     y     proba
0  1.0  0.637387
1  1.0  0.635165
2  1.0  0.766586
3  1.0  0.724564
4  1.0  0.889199


In [6]:
# The `y` column is read from the CSV as float; convert every label to int.
df['y'] = df['y'].map(int)
df_top_k_rows = df.iloc[:3]

print(df_top_k_rows)

   y     proba
0  1  0.637387
1  1  0.635165
2  1  0.766586


> 1.1 Derive the class labels from given score

In [7]:
# Assignment rule: y_pred = 0 if y_score < 0.5 else 1.
# The comparison must therefore be `>=`: a score of exactly 0.5 belongs to
# class 1, whereas a strict `>` would send it to class 0.
df['y_hat'] = df['proba'].map(lambda prob: 1 if prob >= 0.5 else 0)
df_top_k_rows = df.head(n = 3)

print(df_top_k_rows)

   y     proba  y_hat
0  1  0.637387      1
1  1  0.635165      1
2  1  0.766586      1


> 1.2 Compute Confusion Matrix

In [8]:
# Same value the first cell of the notebook already assigns; re-applied here
# ahead of the confusion-matrix cells.
InteractiveShell.ast_node_interactivity = 'none'

In [9]:
"""
confusion_mtx_ref: lookup table describing the cells of the 2x2 confusion matrix.

    key: tuple (original label, predicted label)

    value: dict with two entries
        name: cell notation, one of 'tp', 'fp', 'fn', 'tn'
        mtx_idx: (row, column) indices of that cell inside the confusion matrix
"""

confusion_mtx_ref = {
    label_pair: {'name': notation, 'mtx_idx': position}
    for label_pair, notation, position in (
        ((1, 1), 'tp', (0, 0)),
        ((0, 1), 'fp', (0, 1)),
        ((1, 0), 'fn', (1, 0)),
        ((0, 0), 'tn', (1, 1)),
    )
}

print(confusion_mtx_ref)

{(1, 1): {'name': 'tp', 'mtx_idx': (0, 0)}, (0, 1): {'name': 'fp', 'mtx_idx': (0, 1)}, (1, 0): {'name': 'fn', 'mtx_idx': (1, 0)}, (0, 0): {'name': 'tn', 'mtx_idx': (1, 1)}}


In [10]:
def get_confusion_mtx_element_ref(confusion_mtx_ref):
    """Map each confusion-matrix cell notation to its matrix indices.

    Parameters
    ----------
    confusion_mtx_ref: dict
        Keyed by 2-tuples; every value is a dict carrying a 'name' entry
        (cell notation such as 'tp') and a 'mtx_idx' entry (matrix indices).

    Return
    ------
    dict
        {notation: mtx_idx}, in the iteration order of `confusion_mtx_ref`.
        Exp: {'tp': (0, 0), 'fp': (0, 1), 'fn': (1, 0), 'tn': (1, 1)}
    """
    return {
        entry.get('name'): entry.get('mtx_idx')
        for (_row, _col), entry in confusion_mtx_ref.items()
    }

# Build the notation -> matrix-indices lookup ('tp' -> (0, 0), ...) from the reference table.
cf_mtx_elem_ref = get_confusion_mtx_element_ref(confusion_mtx_ref)
print(cf_mtx_elem_ref)

{'tp': (0, 0), 'fp': (0, 1), 'fn': (1, 0), 'tn': (1, 1)}


In [11]:
# (original label, predicted label) pair -> indices of the confusion-matrix cell it counts towards.
cf_mtx_elem_to_idx_mapping = {
    label_pair: entry.get('mtx_idx')
    for label_pair, entry in confusion_mtx_ref.items()
}
print(cf_mtx_elem_to_idx_mapping)

{(1, 1): (0, 0), (0, 1): (0, 1), (1, 0): (1, 0), (0, 0): (1, 1)}


In [12]:
def get_confusion_mtx(series_i, series_ii, cf_mtx_elem_to_idx_mapping):
    """Count (actual, predicted) label pairs into a 2x2 confusion matrix.

    Parameters
    ----------
    series_i: pandas Series (or any sized iterable) of actual labels
    series_ii: pandas Series (or any sized iterable) of predicted labels,
        same length as `series_i`; the two are paired by position
    cf_mtx_elem_to_idx_mapping: dict
        Maps an (actual, predicted) pair to the (row, col) cell to increment.
        Exp: {(1, 1): (0, 0), (0, 1): (0, 1), (1, 0): (1, 0), (0, 0): (1, 1)}

    Return
    ------
    confusion_mtx: numpy 2-D int array of shape (2, 2)

    Raises
    ------
    ValueError
        If the two inputs do not have the same length.
    TypeError
        If a label pair has no entry in `cf_mtx_elem_to_idx_mapping`
        (the lookup yields None, which cannot be unpacked).
    """
    if len(series_i) != len(series_ii):
        raise ValueError(
            "series_i and series_ii must have the same length, got {} and {}".format(
                len(series_i), len(series_ii)
            )
        )

    confusion_mtx = np.zeros((2, 2), dtype = int)

    # Pair the labels strictly by position. Fetching the prediction with
    # `series_ii.get(position)` is a label-based lookup, so it returns None for
    # any Series whose index is not the default 0..n-1 range (e.g. after
    # filtering or shuffling rows).
    for item_y, item_y_hat in zip(series_i, series_ii):
        x, y = cf_mtx_elem_to_idx_mapping.get((item_y, item_y_hat))
        confusion_mtx[x, y] += 1

    return confusion_mtx

In [13]:
# Confusion matrix of the actual labels `y` against the derived labels `y_hat`.
confusion_mtx = get_confusion_mtx(df['y'], df['y_hat'], cf_mtx_elem_to_idx_mapping)
print(confusion_mtx)

[[10000   100]
 [    0     0]]


> 1.3 Compute F1 Score

In [14]:
def compute_confusion_mtx_elements(confusion_mtx, cf_mtx_elem):
    """Read the named cells of a confusion matrix, i.e.
        {'tp', 'fp', 'fn', 'tn'}

    Parameters
    ----------
    confusion_mtx: numpy 2-D array
    cf_mtx_elem: dict()
        Maps a cell notation (string) to the (row, col) indices of that cell.
        Exp: {'tp': (0, 0), 'fp': (0, 1), 'fn': (1, 0), 'tn': (1, 1)}

    Return
    ------
    cf_mtx_elem_values: dict
        Maps each notation in `cf_mtx_elem` to the value stored at its
        indices in `confusion_mtx`.
        Structure: {
        'tp': 4, 'fp': 8, 'fn':5, 'tn': 7
        }
    """
    return {
        notation: confusion_mtx[row, col]
        for notation, (row, col) in cf_mtx_elem.items()
    }

In [15]:
"""
precision = tp / (tp + fp)

recall = tp / (tp + fn)

F1 score = 2 * precision * recall / (precision + recall)
"""
def compute_tpr_fpr(cf_mtx_elem_values):
    """Compute the true positive rate and false positive rate.

        tpr = tp / (tp + fn)
        fpr = fp / (fp + tn)

    Parameters
    ----------
    cf_mtx_elem_values: dict
        Confusion matrix counts keyed by 'tp', 'fp', 'fn', 'tn'.

    Return
    ------
    (tpr, fpr): tuple of floats
        A rate whose denominator is zero (no actual positives for tpr, no
        actual negatives for fpr) is undefined and is returned as NaN.
    """
    tp = cf_mtx_elem_values.get('tp')
    fp = cf_mtx_elem_values.get('fp')
    fn = cf_mtx_elem_values.get('fn')
    tn = cf_mtx_elem_values.get('tn')

    actual_positives = tp + fn
    actual_negatives = fp + tn

    # Guard the divisions explicitly: 0 / 0 raises ZeroDivisionError for plain
    # Python ints but yields NaN (plus a RuntimeWarning) for NumPy ints.
    # Returning NaN in both cases keeps the result independent of the int type.
    undefined = float('nan')

    tpr = tp / actual_positives if actual_positives else undefined

    fpr = fp / actual_negatives if actual_negatives else undefined

    return tpr, fpr

In [None]:
def compute_precision_recall_f1(cf_mtx_elem_values):
    """Compute precision, recall and F1 score from confusion matrix counts.

        precision = tp / (tp + fp)
        recall    = tp / (tp + fn)
        f1_score  = 2 * precision * recall / (precision + recall)

    Parameters
    ----------
    cf_mtx_elem_values: dict
        Confusion matrix counts; the keys 'tp', 'fp' and 'fn' are read.

    Return
    ------
    (precision, recall, f1_score): tuple of floats
        - precision is NaN when nothing was predicted positive (tp + fp == 0)
        - recall is NaN when there are no actual positives (tp + fn == 0)
        - f1_score is NaN when precision or recall is NaN, and 0.0 when both
          are defined but zero (tp == 0)
    """
    tp = cf_mtx_elem_values.get('tp')
    fp = cf_mtx_elem_values.get('fp')
    fn = cf_mtx_elem_values.get('fn')

    predicted_positives = tp + fp
    actual_positives = tp + fn

    # Guard the divisions explicitly: 0 / 0 raises ZeroDivisionError for plain
    # Python ints but yields NaN (plus a RuntimeWarning) for NumPy ints.
    undefined = float('nan')

    precision = tp / predicted_positives if predicted_positives else undefined

    recall = tp / actual_positives if actual_positives else undefined

    precision_plus_recall = precision + recall
    if precision_plus_recall == 0:
        # tp == 0 with both rates defined: the harmonic mean is 0, not 0 / 0.
        f1_score = 0.0
    else:
        # A NaN precision or recall propagates into a NaN F1 here.
        f1_score = 2 * precision * recall / precision_plus_recall

    return precision, recall, f1_score


# Derive the tp/fp/fn/tn counts in this cell so it runs on a fresh kernel:
# `cf_mtx_elem_values` is otherwise first assigned only in a later cell, which
# makes a top-to-bottom run fail here with NameError.
cf_mtx_elem_values = compute_confusion_mtx_elements(confusion_mtx, cf_mtx_elem_ref)

precision, recall, f1_score = compute_precision_recall_f1(cf_mtx_elem_values)
print(precision)
print(recall)
print(f1_score)

In [16]:
def compute_auc_terms(cf_mtx_elem_values):
    """Collect precision, recall, F1 score, TPR and FPR in one dict.

    NOTE(review): `compute_auc_terms` was called in this cell without being
    defined anywhere in the notebook. This definition is reconstructed from the
    keys (and their order) in the output recorded for this cell - confirm it
    matches the intended helper.

    Parameters
    ----------
    cf_mtx_elem_values: dict
        Confusion matrix counts keyed by 'tp', 'fp', 'fn', 'tn'.

    Return
    ------
    dict
        {'precision': ..., 'recall': ..., 'f1_score': ..., 'tpr': ..., 'fpr': ...}
    """
    precision, recall, f1_score = compute_precision_recall_f1(cf_mtx_elem_values)
    tpr, fpr = compute_tpr_fpr(cf_mtx_elem_values)

    return {
        'precision': precision,
        'recall': recall,
        'f1_score': f1_score,
        'tpr': tpr,
        'fpr': fpr,
    }


cf_mtx_elem_values = compute_confusion_mtx_elements(confusion_mtx, cf_mtx_elem_ref)
auc_terms = compute_auc_terms(cf_mtx_elem_values)

print(auc_terms)

{'precision': 0.9900990099009901, 'recall': 1.0, 'f1_score': 0.9950248756218906, 'tpr': 1.0, 'fpr': 1.0}


> 1.4 Compute AUC Score

In [17]:
# Candidate thresholds: every distinct score, ordered from largest to smallest.
unique_proba = np.sort(df['proba'].unique())[::-1]
unique_proba_shape = unique_proba.shape

print(type(unique_proba), unique_proba_shape, unique_proba[:5], sep='\n')

<class 'numpy.ndarray'>
(10100,)
[0.50001859 0.50004734 0.50005801 0.50005815 0.50008126]


In [18]:
# Raise the column display limit to (number of thresholds + 2).
pd.options.display.max_columns = unique_proba_shape[0] + 2

In [None]:
# ROC points: one (fpr, tpr) pair per candidate threshold in `unique_proba`.
#
# * A point is predicted positive when its score is >= the threshold, which is
#   the assignment rule (y_pred = 0 if y_score < threshold else 1). With a
#   strict `>` the smallest threshold never labels every point positive, so the
#   curve stops short of (fpr, tpr) = (1, 1) and the area is under-estimated.
# * The counts come from vectorised boolean masks instead of a per-row Python
#   loop for every threshold, and no per-threshold prediction column is
#   appended to `df` (that added one column per unique score).
labels = df['y'].to_numpy()
scores = df['proba'].to_numpy()
is_positive = labels == 1
is_negative = labels == 0

auc_terms = dict()
tpr, fpr = list(), list()
for index, thld in enumerate(unique_proba):

    col_name = "thld_{}".format(index)
    predicted_positive = scores >= thld

    # `.sum()` returns NumPy integers, the same kind of value that is read out
    # of a confusion matrix array.
    cf_mtx_elem_values_ii = {
        'tp': (predicted_positive & is_positive).sum(),
        'fp': (predicted_positive & is_negative).sum(),
        'fn': (~predicted_positive & is_positive).sum(),
        'tn': (~predicted_positive & is_negative).sum(),
    }

    tpr_temp, fpr_temp = compute_tpr_fpr(cf_mtx_elem_values_ii)

    auc_terms.update({(col_name, thld): {'tpr': tpr_temp, 'fpr': fpr_temp }})

    tpr.append(tpr_temp)
    fpr.append(fpr_temp)

In [21]:
# Area under the ROC curve by the trapezoidal rule: tpr (y values) is integrated
# over fpr (x values). np.trapz returns a negative area when the x values
# decrease along the array, so fpr has to be non-decreasing, i.e. the thresholds
# must run from high to low.
auc_score = np.trapz(tpr, fpr)
print(auc_score)

-0.48829900000000004


<pre>
<font color='red'><b>B.</b></font> Compute performance metrics for the given data <strong>5_b.csv</strong>
   <b>Note 1:</b> in this data you can see number of positive points << number of negative points
   <b>Note 2:</b> use pandas or numpy to read the data from <b>5_b.csv</b>
   <b>Note 3:</b> you need to derive the class labels from given score</pre> $y^{pred}= \text{[0 if y_score < 0.5 else 1]}$

<pre>
<ol>
<li> Compute Confusion Matrix </li>
<li> Compute F1 Score </li>
<li> Compute AUC Score, you need to compute different thresholds and for each threshold compute tpr,fpr and then use               numpy.trapz(tpr_array, fpr_array) <a href='https://stackoverflow.com/q/53603376/4084039'>https://stackoverflow.com/q/53603376/4084039</a>, <a href='https://stackoverflow.com/a/39678975/4084039'>https://stackoverflow.com/a/39678975/4084039</a></li>
<li> Compute Accuracy Score </li>
</ol>
</pre>

In [23]:
# write your code
# NOTE(review): this cell sits under section B, which asks for 5_b.csv, but it
# loads 5_c.csv - confirm which dataset is intended here.
df_iii = pd.read_csv("5_c.csv")
print(df_iii.head())

   y      prob
0  0  0.458521
1  0  0.505037
2  0  0.418652
3  0  0.412057
4  0  0.375579


<font color='red'><b>C.</b></font> Compute the best probability threshold (similarly to the ROC curve computation), i.e. the threshold that gives the lowest value of metric <b>A</b> for the given data <strong>5_c.csv</strong>
<br>

you will be predicting label of a data points like this: $y^{pred}= \text{[0 if y_score < threshold  else 1]}$

$A = 500 \times \text{number of false negatives} + 100 \times \text{number of false positives}$

<pre>
   <b>Note 1:</b> in this data you can see number of negative points > number of positive points
   <b>Note 2:</b> use pandas or numpy to read the data from <b>5_c.csv</b>
</pre>

In [0]:
 # write your code

<pre>
<font color='red'><b>D.</b></font> Compute performance metrics(for regression) for the given data <strong>5_d.csv</strong>
    <b>Note 2:</b> use pandas or numpy to read the data from <b>5_d.csv</b>
    <b>Note 1:</b> <b>5_d.csv</b> has two columns, Y and predicted_Y; both are real-valued features
<ol>
<li> Compute Mean Square Error </li>
<li> Compute MAPE: https://www.youtube.com/watch?v=ly6ztgIkUxk</li>
<li> Compute R^2 error: https://en.wikipedia.org/wiki/Coefficient_of_determination#Definitions </li>
</ol>
</pre>