# Compute performance metrics for the given Y and Y_score without sklearn

In [71]:
import numpy as np
import pandas as pd
# other than these two you should not import any other packages

<pre>
<font color='red'><b>A.</b></font> Compute performance metrics for the given data <strong>5_a.csv</strong>
   <b>Note 1:</b> in this data you can see number of positive points >> number of negatives points
   <b>Note 2:</b> use pandas or numpy to read the data from <b>5_a.csv</b>
   <b>Note 3:</b> you need to derive the class labels from given score</pre> $y^{pred}= \text{[0 if y_score < 0.5 else 1]}$

<pre>
<ol>
<li> Compute Confusion Matrix </li>
<li> Compute F1 Score </li>
<li> Compute AUC Score, you need to compute different thresholds and for each threshold compute tpr,fpr and then use               numpy.trapz(tpr_array, fpr_array) <a href='https://stackoverflow.com/q/53603376/4084039'>https://stackoverflow.com/q/53603376/4084039</a>, <a href='https://stackoverflow.com/a/39678975/4084039'>https://stackoverflow.com/a/39678975/4084039</a> Note: it should be numpy.trapz(tpr_array, fpr_array) not numpy.trapz(fpr_array, tpr_array)</li>
<li> Compute Accuracy Score </li>
</ol>
</pre>

In [72]:
# write your code here
data = pd.read_csv('5_a.csv')

data['y_pred'] = np.where(data['proba'] < 0.5, 0, 1)

tp = len(data[(data.y==1) & (data.y_pred==1)])
fp = len(data[(data.y==0) & (data.y_pred==1)])
fn = len(data[(data.y==1) & (data.y_pred==0)])
tn = len(data[(data.y==0) & (data.y_pred==0)])

confusion_matrix = pd.DataFrame([[tp, fp],[fn, tn]], columns=['actual +ve', 'actual -ve'], index=['predicted +ve', 'predicted -ve'])
confusion_matrix

Unnamed: 0,actual +ve,actual -ve
predicted +ve,10000,100
predicted -ve,0,0


In [73]:
precision = tp/(tp+fp)
recall = tp/(tp+fn)

f1_score = (2*precision*recall)/(precision+recall)

print('F1 Score = ', f1_score)

F1 Score =  0.9950248756218906


In [74]:
from tqdm import tqdm
def tpr_fpr(data, th):
    data['y_pred'] = np.where(data['proba'] < th, 0, 1)
    tp = len(data[(data.y==1) & (data.y_pred==1)])
    fp = len(data[(data.y==0) & (data.y_pred==1)])
    fn = len(data[(data.y==1) & (data.y_pred==0)])
    tn = len(data[(data.y==0) & (data.y_pred==0)])
    return tp/(tp+fn), fp/(tn+fp)
    
thresholds = data.proba.unique()
thresholds.sort()

tpr_list = []
fpr_list = []
for th in tqdm(thresholds):
    tpr, fpr = tpr_fpr(data, th)
    tpr_list.append(tpr)
    fpr_list.append(fpr)
    
# since fpr_list (which will be on x_axis) is in decreasing order, so we need to make it in increasing order
tpr_list.reverse()
fpr_list.reverse()

AUC_Score = np.trapz(np.array(tpr_list), np.array(fpr_list))
print('AUS Score is: ', AUC_Score)

100%|███████████████████████████████████████████████████████████████████████████| 10100/10100 [01:01<00:00, 163.43it/s]


AUS Score is:  0.48829900000000004


In [62]:
accuracy = (tp+tn) / (tp+tn+fp+fn)
print('Accuracy: ', accuracy)

Accuracy:  0.9900990099009901


<pre>
<font color='red'><b>B.</b></font> Compute performance metrics for the given data <strong>5_b.csv</strong>
   <b>Note 1:</b> in this data you can see number of positive points << number of negatives points
   <b>Note 2:</b> use pandas or numpy to read the data from <b>5_b.csv</b>
   <b>Note 3:</b> you need to derive the class labels from given score</pre> $y^{pred}= \text{[0 if y_score < 0.5 else 1]}$

<pre>
<ol>
<li> Compute Confusion Matrix </li>
<li> Compute F1 Score </li>
<li> Compute AUC Score, you need to compute different thresholds and for each threshold compute tpr,fpr and then use               numpy.trapz(tpr_array, fpr_array) <a href='https://stackoverflow.com/q/53603376/4084039'>https://stackoverflow.com/q/53603376/4084039</a>, <a href='https://stackoverflow.com/a/39678975/4084039'>https://stackoverflow.com/a/39678975/4084039</a></li>
<li> Compute Accuracy Score </li>
</ol>
</pre>

In [63]:
# write your code here
data = pd.read_csv('5_b.csv')

data['y_pred'] = np.where(data['proba'] < 0.5, 0, 1)

tp = len(data[(data.y==1) & (data.y_pred==1)])
fp = len(data[(data.y==0) & (data.y_pred==1)])
fn = len(data[(data.y==1) & (data.y_pred==0)])
tn = len(data[(data.y==0) & (data.y_pred==0)])

confusion_matrix = pd.DataFrame([[tp, fp],[fn, tn]], columns=['actual +ve', 'actual -ve'], index=['predicted +ve', 'predicted -ve'])
confusion_matrix

Unnamed: 0,actual +ve,actual -ve
predicted +ve,55,239
predicted -ve,45,9761


In [64]:
precision = tp/(tp+fp)
recall = tp/(tp+fn)

f1_score = (2*precision*recall)/(precision+recall)

print('F1 Score = ', f1_score)

F1 Score =  0.2791878172588833


In [65]:
from tqdm import tqdm
def tpr_fpr(data, th):
    data['y_pred'] = np.where(data['proba'] < th, 0, 1)
    tp = len(data[(data.y==1) & (data.y_pred==1)])
    fp = len(data[(data.y==0) & (data.y_pred==1)])
    fn = len(data[(data.y==1) & (data.y_pred==0)])
    tn = len(data[(data.y==0) & (data.y_pred==0)])
    return tp/(tp+fn), fp/(tn+fp)
    
thresholds = data.proba.unique()
thresholds.sort()

tpr_list = []
fpr_list = []
for th in tqdm(thresholds):
    tpr, fpr = tpr_fpr(data, th)
    tpr_list.append(tpr)
    fpr_list.append(fpr)

# since fpr_list (which will be on x_axis) is in decreasing order, so we need to make it in increasing order
tpr_list.reverse()
fpr_list.reverse()

AUC_Score = np.trapz(tpr_list, fpr_list)
print('AUS Score is: ', AUC_Score)

100%|███████████████████████████████████████████████████████████████████████████| 10100/10100 [01:05<00:00, 153.45it/s]


AUS Score is:  0.9377570000000001


In [66]:
accuracy = (tp+tn) / (tp+tn+fp+fn)
print('Accuracy: ', accuracy)

Accuracy:  0.9718811881188119


<font color='red'><b>C.</b></font> Compute the best threshold (similarly to ROC curve computation) of probability which gives lowest values of metric <b>A</b> for the given data <strong>5_c.csv</strong>
<br>

you will be predicting label of a data points like this: $y^{pred}= \text{[0 if y_score < threshold  else 1]}$

$ A = 500 \times \text{number of false negative} + 100 \times \text{numebr of false positive}$

<pre>
   <b>Note 1:</b> in this data you can see number of negative points > number of positive points
   <b>Note 2:</b> use pandas or numpy to read the data from <b>5_c.csv</b>
</pre>

In [67]:
data = pd.read_csv('5_c.csv')

thresholds = data.prob.unique()
thresholds.sort()

A = []
for th in tqdm(thresholds):
    data['y_pred'] = np.where(data['prob'] < th, 0, 1)
    fp = len(data[(data.y==0) & (data.y_pred==1)])
    fn = len(data[(data.y==1) & (data.y_pred==0)])
    A.append( 500*fn + 100*fp )
    
best_th = thresholds[A.index(min(A))]
print('Best threshold is: ', best_th)

100%|█████████████████████████████████████████████████████████████████████████████| 2791/2791 [00:09<00:00, 292.04it/s]


Best threshold is:  0.2300390278970873


<pre>
<font color='red'><b>D.</b></font> Compute performance metrics(for regression) for the given data <strong>5_d.csv</strong>
    <b>Note 2:</b> use pandas or numpy to read the data from <b>5_d.csv</b>
    <b>Note 1:</b> <b>5_d.csv</b> will having two columns Y and predicted_Y both are real valued features
<ol>
<li> Compute Mean Square Error </li>
<li> Compute MAPE: https://www.youtube.com/watch?v=ly6ztgIkUxk</li>
<li> Compute R^2 error: https://en.wikipedia.org/wiki/Coefficient_of_determination#Definitions </li>
</ol>
</pre>

In [68]:
data = pd.read_csv('5_d.csv')
print(data.head())
mse = np.mean((data.y - data.pred)**2)
print('MSE: ', mse)

       y   pred
0  101.0  100.0
1  120.0  100.0
2  131.0  113.0
3  164.0  125.0
4  154.0  152.0
MSE:  177.16569974554707


### $Modefied\ Mean\ Absolute\ Percentage\ Error\ is:\\ M\_MAPE = \frac{\sum_{k=1}^N |e_i|}{\sum_{k=1}^N |a_i|}$

In [69]:
mape = np.sum(np.abs(data.y - data.pred)) / np.sum(data.y)
print('MAPE: ', mape)

MAPE:  0.1291202994009687


### $R^2 = 1 - \frac{SS_{res}}{SS_{tot}} = 1 - \frac{SS_{res}}{SS_{res} + SS_{reg}}  = 1 - \frac{\sum_{i}^n (y_i - f_i)^2}{\sum_{i}^n (y_i - f_i)^2 + \sum_{i}^n (y_i - \bar y)^2}$

In [70]:
ss_reg = np.sum( (data.y - np.mean(data.y))**2 )
ss_res = np.sum( (data.y - data.pred)**2 )
ss_tot = ss_reg + ss_res
r_square = 1 - (ss_res / ss_tot)
print('R_square: ', r_square)

R_square:  0.9581832343320785
