# Compute performance metrics for the given Y and Y_score without sklearn

In [52]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

In [53]:
import numpy as np
import pandas as pd

<pre>
<font color='red'><b>A.</b></font> Compute performance metrics for the given data <strong>5_a.csv</strong>
   <b>Note 1:</b> in this data you can see number of positive points >> number of negatives points
   <b>Note 2:</b> use pandas or numpy to read the data from <b>5_a.csv</b>
   <b>Note 3:</b> you need to derive the class labels from given score</pre> $y^{pred}= \text{[0 if y_score < 0.5 else 1]}$

<pre>
<ol>
<li> Compute Confusion Matrix </li>
<li> Compute F1 Score </li>
<li> Compute AUC Score, you need to compute different thresholds and for each threshold compute tpr,fpr and then use               numpy.trapz(tpr_array, fpr_array) <a href='https://stackoverflow.com/q/53603376/4084039'>https://stackoverflow.com/q/53603376/4084039</a>, <a href='https://stackoverflow.com/a/39678975/4084039'>https://stackoverflow.com/a/39678975/4084039</a> Note: it should be numpy.trapz(tpr_array, fpr_array) not numpy.trapz(fpr_array, tpr_array)</li>
<li> Compute Accuracy Score </li>
</ol>
</pre>

In [54]:
!ls

5_a.csv
5_b.csv
5_c.csv
5_d.csv
5_Performance_metrics_Instructions.ipynb


In [55]:
df = pd.read_csv('5_a.csv')
df.head()

Unnamed: 0,y,proba
0,1.0,0.637387
1,1.0,0.635165
2,1.0,0.766586
3,1.0,0.724564
4,1.0,0.889199


In [56]:
"""
Given DataFrame column `y' contains float value, here we convert it to int
"""

df['y'] = df['y'].map(lambda y: int(y))
df_top_k_rows = df.head(n = 3)

print(df_top_k_rows)

"\nGiven DataFrame column `y' contains float value, here we convert it to int\n"

   y     proba
0  1  0.637387
1  1  0.635165
2  1  0.766586


> 1.1 Derive the class labels from given score

In [57]:
df['y_hat'] = df['proba'].map(lambda prob: 1 if prob > 0.5 else 0)
df_top_k_rows = df.head(n = 3)

print(df_top_k_rows)

   y     proba  y_hat
0  1  0.637387      1
1  1  0.635165      1
2  1  0.766586      1


> 1.2 Compute Confusion Matrix

In [58]:
InteractiveShell.ast_node_interactivity = 'none'

In [59]:
"""
confusion_mtx_ref: 
    key: tuple(original output, predicted output)
    
    value: dict({})
        name: element notation ( 'tp', 'fp', 'fn', 'tn')
        mtx_idx: Contains matrix indices, to get confusion matx element as per notation
"""

confusion_mtx_ref = {
    (1, 1): {'name': 'tp', 'mtx_idx': (0, 0)},
    (0, 1): {'name': 'fp', 'mtx_idx': (0, 1)},
    (1, 0): {'name': 'fn', 'mtx_idx': (1, 0)},
    (0, 0): {'name': 'tn', 'mtx_idx': (1, 1)}
}

print(confusion_mtx_ref)

{(1, 1): {'name': 'tp', 'mtx_idx': (0, 0)}, (0, 1): {'name': 'fp', 'mtx_idx': (0, 1)}, (1, 0): {'name': 'fn', 'mtx_idx': (1, 0)}, (0, 0): {'name': 'tn', 'mtx_idx': (1, 1)}}


In [60]:
InteractiveShell.ast_node_interactivity = 'all'

In [61]:
cf_mtx_elem = dict()
for (row, col) in confusion_mtx_ref:
    
    temp = confusion_mtx_ref.get((row, col))
    
    """
    """
    conf_mtx_elem_not = temp.get('name')
    
    """
    """
    conf_mtx_elem_index = temp.get('mtx_idx')
    
    cf_mtx_elem.update({conf_mtx_elem_not: conf_mtx_elem_index})
    
print(cf_mtx_elem)

'\nF1 score = 2 * precision * recall / (precision + recall)\n\nprecision = tp / (tp + fp)\n\nrecall = tp / (tp + fn)\n'

{'tp': (0, 0), 'fp': (0, 1), 'fn': (1, 0), 'tn': (1, 1)}


In [62]:
confusion_mtx = np.zeros((2, 2), dtype = int)
print(confusion_mtx)

[[0 0]
 [0 0]]


In [63]:
for index, item_y in enumerate(df['y']):

    item_y_hat = df['y_hat'].get(index)
    
    x, y = confusion_mtx_ref.get((item_y, item_y_hat)).get('mtx_idx')
    
    cf_item_count = confusion_mtx[x, y]
    
    confusion_mtx[x, y] = cf_item_count + 1

print(confusion_mtx)

[[10000   100]
 [    0     0]]


> 1.3 Compute F1 Score

In [67]:
"""
F1 score = 2 * precision * recall / (precision + recall)

precision = tp / (tp + fp)

recall = tp / (tp + fn)
"""

"""
cf_mtx_elem_values: dict
    Contains mapping of confusion matrix element notation to their value (in confusion matrix itself)
    Structure: {
    'tp': 4, 'fp': 8, 'fn':5, 'tn': 7
    }
"""
cf_mtx_elem_values = dict()
for cf_mtx_elem_notation in cf_mtx_elem:
    """
    cf_mtx_elem_notation: string
        Confusion matrix element notation 
        Exp: {'tp', 'fp', 'fn', 'tn'}
    """
    
    x, y = cf_mtx_elem.get(cf_mtx_elem_notation)
    val = confusion_mtx[x, y]
    
    cf_mtx_elem_values.update({cf_mtx_elem_notation: val})
    
precision = cf_mtx_elem_values.get('tp') / (cf_mtx_elem_values.get('tp') + cf_mtx_elem_values.get('fp'))
recall = cf_mtx_elem_values.get('tp') / (cf_mtx_elem_values.get('tp') + cf_mtx_elem_values.get('fn'))

print(precision)
print(recall)


'\nF1 score = 2 * precision * recall / (precision + recall)\n\nprecision = tp / (tp + fp)\n\nrecall = tp / (tp + fn)\n'

0.9900990099009901
1.0


<pre>
<font color='red'><b>B.</b></font> Compute performance metrics for the given data <strong>5_b.csv</strong>
   <b>Note 1:</b> in this data you can see number of positive points << number of negatives points
   <b>Note 2:</b> use pandas or numpy to read the data from <b>5_b.csv</b>
   <b>Note 3:</b> you need to derive the class labels from given score</pre> $y^{pred}= \text{[0 if y_score < 0.5 else 1]}$

<pre>
<ol>
<li> Compute Confusion Matrix </li>
<li> Compute F1 Score </li>
<li> Compute AUC Score, you need to compute different thresholds and for each threshold compute tpr,fpr and then use               numpy.trapz(tpr_array, fpr_array) <a href='https://stackoverflow.com/q/53603376/4084039'>https://stackoverflow.com/q/53603376/4084039</a>, <a href='https://stackoverflow.com/a/39678975/4084039'>https://stackoverflow.com/a/39678975/4084039</a></li>
<li> Compute Accuracy Score </li>
</ol>
</pre>

In [0]:
# write your code

<font color='red'><b>C.</b></font> Compute the best threshold (similarly to ROC curve computation) of probability which gives lowest values of metric <b>A</b> for the given data <strong>5_c.csv</strong>
<br>

you will be predicting label of a data points like this: $y^{pred}= \text{[0 if y_score < threshold  else 1]}$

$ A = 500 \times \text{number of false negative} + 100 \times \text{numebr of false positive}$

<pre>
   <b>Note 1:</b> in this data you can see number of negative points > number of positive points
   <b>Note 2:</b> use pandas or numpy to read the data from <b>5_c.csv</b>
</pre>

In [0]:
 # write your code

<pre>
<font color='red'><b>D.</b></font> Compute performance metrics(for regression) for the given data <strong>5_d.csv</strong>
    <b>Note 2:</b> use pandas or numpy to read the data from <b>5_d.csv</b>
    <b>Note 1:</b> <b>5_d.csv</b> will having two columns Y and predicted_Y both are real valued features
<ol>
<li> Compute Mean Square Error </li>
<li> Compute MAPE: https://www.youtube.com/watch?v=ly6ztgIkUxk</li>
<li> Compute R^2 error: https://en.wikipedia.org/wiki/Coefficient_of_determination#Definitions </li>
</ol>
</pre>