# Compute performance metrics for the given Y and Y_score without sklearn

In [37]:
import numpy as np
import pandas as pd
# other than these two you should not import any other packages


## A. Compute performance metrics for the given data '5_a.csv'
 <pre>  <b>Note 1:</b> in this data you can see number of positive points >> number of negatives points
   <b>Note 2:</b> use pandas or numpy to read the data from <b>5_a.csv</b>
   <b>Note 3:</b> you need to derive the class labels from given score</pre> $y^{pred}= \text{[0 if y_score < 0.5 else 1]}$

<pre>
<ol>
<li> Compute Confusion Matrix </li>
<li> Compute F1 Score </li>
<li> Compute AUC Score, you need to compute different thresholds and for each threshold compute tpr,fpr and then use               numpy.trapz(tpr_array, fpr_array) <a href='https://stackoverflow.com/q/53603376/4084039'>https://stackoverflow.com/q/53603376/4084039</a>, <a href='https://stackoverflow.com/a/39678975/4084039'>https://stackoverflow.com/a/39678975/4084039</a> Note: it should be numpy.trapz(tpr_array, fpr_array) not numpy.trapz(fpr_array, tpr_array)
Note- Make sure that you arrange your probability scores in descending order while calculating AUC</li>
<li> Compute Accuracy Score </li>
</ol>
</pre>

In [87]:
df_a=pd.read_csv('5_a.csv')
df_a.head(20)

Unnamed: 0,y,proba
0,1.0,0.637387
1,1.0,0.635165
2,1.0,0.766586
3,1.0,0.724564
4,1.0,0.889199
5,1.0,0.6016
6,1.0,0.666323
7,1.0,0.567012
8,1.0,0.65023
9,1.0,0.829346


In [88]:
df_a[df_a['proba'] < 0.5]

Unnamed: 0,y,proba


In [89]:
# write your code here for task A
def give_class(data, threshold):
    if data['proba'] < threshold:
        return 0
    else:
        return 1
        
df_a['y_pred'] = df_a.apply(lambda x: give_class(x, 0.5), axis=1)

In [90]:
df_a[df_a['y_pred'] == 0]

Unnamed: 0,y,proba,y_pred


In [91]:
def conf_matrix(df,y,y_pred):
    tp=0
    tn=0
    fn=0
    fp=0
    for i in range(len(y)):
        if df.y_pred[i]==1 and df.y[i]==1:
            tp=tp+1
        if df.y_pred[i]==0 and df.y[i]==0:
            tn=tn+1
        if df.y_pred[i]==0 and df.y[i]==1:
            fn=fn+1
        if df.y_pred[i]==1 and df.y[i]==0:
            fp=fp+1
    return {'tn':tn,'tp':tp,'fn':fn,'fp':fp}

confusion_matrix = conf_matrix(df_a, df_a['y'], df_a['y_pred'])

In [92]:
confusion_matrix

{'tn': 0, 'tp': 10000, 'fn': 0, 'fp': 100}

In [93]:
#F1 score:
precision = confusion_matrix['tp'] /(confusion_matrix['tp']+confusion_matrix['fp'])
recall = confusion_matrix['tp']/(confusion_matrix['fn']+confusion_matrix['tp'])

f1Score= 2 * ((precision*recall) / (precision+recall))
print("F1-Score is: ", f1Score)

F1-Score is:  0.9950248756218906


In [110]:
thresholds = list(np.linspace(0,1,101,endpoint=False))
thresholds.sort(reverse=True)
print(thresholds)

[0.9900990099009901, 0.9801980198019802, 0.9702970297029703, 0.9603960396039604, 0.9504950495049505, 0.9405940594059407, 0.9306930693069307, 0.9207920792079208, 0.9108910891089109, 0.900990099009901, 0.8910891089108911, 0.8811881188118812, 0.8712871287128713, 0.8613861386138614, 0.8514851485148515, 0.8415841584158416, 0.8316831683168316, 0.8217821782178218, 0.8118811881188119, 0.801980198019802, 0.7920792079207921, 0.7821782178217822, 0.7722772277227723, 0.7623762376237624, 0.7524752475247525, 0.7425742574257426, 0.7326732673267327, 0.7227722772277227, 0.7128712871287128, 0.7029702970297029, 0.6930693069306931, 0.6831683168316832, 0.6732673267326733, 0.6633663366336634, 0.6534653465346535, 0.6435643564356436, 0.6336633663366337, 0.6237623762376238, 0.6138613861386139, 0.6039603960396039, 0.594059405940594, 0.5841584158415841, 0.5742574257425742, 0.5643564356435644, 0.5544554455445545, 0.5445544554455446, 0.5346534653465347, 0.5247524752475248, 0.5148514851485149, 0.504950495049505, 0.4

In [111]:
def calc_auc(df_a):
    tpr = []
    fpr = []
    for i in thresholds:
        df_a['y_pred'] = df_a.apply(lambda x: give_class(x, i), axis=1)
        confusion_matrix = conf_matrix(df_a, df_a['y'], df_a['y_pred'])
        tpx = confusion_matrix['tp']/(confusion_matrix['fn']+confusion_matrix['tp'])
        fpx = confusion_matrix['fp']/(confusion_matrix['tn']+confusion_matrix['fp'])
        tpr.append(tpx)
        fpr.append(fpx)
        df_a.drop(columns=['y_pred'])
    return np.trapz(tpr, fpr)

In [112]:
data_a=df_a.sort_values(by='proba',ascending=False)
data_a.drop(columns=['y_pred'])
auc = calc_auc(data_a)

In [113]:
auc

0.488382

In [114]:
print("Accuracy score: ", (confusion_matrix['fp']+confusion_matrix['tp'])/(confusion_matrix['fn']+confusion_matrix['tp']+confusion_matrix['fp']+confusion_matrix['tn']))

Accuracy score:  1.0




## B. Compute performance metrics for the given data '5_b.csv'
<pre>
   <b>Note 1:</b> in this data you can see number of positive points << number of negatives points
   <b>Note 2:</b> use pandas or numpy to read the data from <b>5_b.csv</b>
   <b>Note 3:</b> you need to derive the class labels from given score</pre> $y^{pred}= \text{[0 if y_score < 0.5 else 1]}$

<pre>
<ol>
<li> Compute Confusion Matrix </li>
<li> Compute F1 Score </li>
<li> Compute AUC Score, you need to compute different thresholds and for each threshold compute tpr,fpr and then use               numpy.trapz(tpr_array, fpr_array) <a href='https://stackoverflow.com/q/53603376/4084039'>https://stackoverflow.com/q/53603376/4084039</a>, <a href='https://stackoverflow.com/a/39678975/4084039'>https://stackoverflow.com/a/39678975/4084039</a>
Note- Make sure that you arrange your probability scores in descending order while calculating AUC</li>
<li> Compute Accuracy Score </li>
</ol>
</pre>

In [115]:
df_b=pd.read_csv('5_b.csv')
df_b.head()

Unnamed: 0,y,proba
0,0.0,0.281035
1,0.0,0.465152
2,0.0,0.352793
3,0.0,0.157818
4,0.0,0.276648


In [117]:
# write your code here for task B
df_b['y_pred'] = df_b.apply(lambda x: give_class(x, 0.5), axis=1)
confusion_matrix = conf_matrix(df_b, df_b['y'], df_b['y_pred'])
confusion_matrix

{'tn': 9761, 'tp': 55, 'fn': 45, 'fp': 239}

In [118]:
#F1 score:
precision = confusion_matrix['tp'] /(confusion_matrix['tp']+confusion_matrix['fp'])
recall = confusion_matrix['tp']/(confusion_matrix['fn']+confusion_matrix['tp'])

f1Score= 2 * ((precision*recall) / (precision+recall))
print("F1-Score is: ", f1Score)

F1-Score is:  0.2791878172588833


In [120]:
print("AUC score is ",calc_auc(df_b))

AUC score is  0.9382005


In [121]:
print("Accuracy score: ", (confusion_matrix['fp']+confusion_matrix['tp'])/(confusion_matrix['fn']+confusion_matrix['tp']+confusion_matrix['fp']+confusion_matrix['tn']))

Accuracy score:  0.02910891089108911


### C. Compute the best threshold (similarly to ROC curve computation) of probability which gives lowest values of metric <b>A</b> for the given data 
<br>

you will be predicting label of a data points like this: $y^{pred}= \text{[0 if y_score < threshold  else 1]}$

$ A = 500 \times \text{number of false negative} + 100 \times \text{numebr of false positive}$

<pre>
   <b>Note 1:</b> in this data you can see number of negative points > number of positive points
   <b>Note 2:</b> use pandas or numpy to read the data from <b>5_c.csv</b>
</pre>

In [122]:
df_c=pd.read_csv('5_c.csv')
df_c.head()

Unnamed: 0,y,prob
0,0,0.458521
1,0,0.505037
2,0,0.418652
3,0,0.412057
4,0,0.375579


In [131]:
def give_class(data, threshold):
    if data['prob'] < threshold:
        return 0
    else:
        return 1

In [132]:
 # write your code for task C
def calc_metric(df_a):
    metrics = {}
    for i in thresholds:
        df_a['y_pred'] = df_a.apply(lambda x: give_class(x, i), axis=1)
        confusion_matrix = conf_matrix(df_a, df_a['y'], df_a['y_pred'])
        metric_val = (500*confusion_matrix['fn'])+(100*confusion_matrix['fp'])
        metrics[i] = metric_val
        df_a.drop(columns=['y_pred'])
    return metrics

In [133]:
get_min = calc_metric(df_c)


0.9900990099009901

In [138]:
temp = min(get_min.values())

In [139]:
res = [key for key in get_min if get_min[key] == temp]
print(res, temp)

[0.22772277227722773] 141600



## D.</b></font> Compute performance metrics(for regression) for the given data 5_d.csv
<pre>    <b>Note 2:</b> use pandas or numpy to read the data from <b>5_d.csv</b>
    <b>Note 1:</b> <b>5_d.csv</b> will having two columns Y and predicted_Y both are real valued features
<ol>
<li> Compute Mean Square Error </li>
<li> Compute MAPE: https://www.youtube.com/watch?v=ly6ztgIkUxk</li>
<li> Compute R^2 error: https://en.wikipedia.org/wiki/Coefficient_of_determination#Definitions </li>
</ol>
</pre>

In [185]:
df_d=pd.read_csv('5_d.csv')
df_d.head()

Unnamed: 0,y,pred
0,101.0,100.0
1,120.0,100.0
2,131.0,113.0
3,164.0,125.0
4,154.0,152.0


In [186]:
df_d['error'] = df_d['y'] - df_d['pred']

In [187]:
def ss_res(df,col):
    val=0
    for value in df[col]:
        val=val+(value*value)
    return val

In [188]:
ans = ss_res(df_d, 'error')

In [189]:
ans

27850448.0

In [190]:
y_bar = df_d['y'].mean()

In [191]:
def ss_total(df,col):
    val=0
    for value in df[col]:
        val=val+(value - y_bar)**2
    return val

In [192]:
denm = ss_total(df_d, 'y')

In [193]:
denm

638161080.035662

In [194]:
df_d['mape_val'] = abs(df_d['error'] / df_d['y'].mean())
df_d.head(10)

Unnamed: 0,y,pred,error,mape_val
0,101.0,100.0,1.0,0.015024
1,120.0,100.0,20.0,0.300471
2,131.0,113.0,18.0,0.270424
3,164.0,125.0,39.0,0.585919
4,154.0,152.0,2.0,0.030047
5,133.0,153.0,-20.0,0.300471
6,148.0,139.0,9.0,0.135212
7,172.0,145.0,27.0,0.405636
8,153.0,162.0,-9.0,0.135212
9,162.0,154.0,8.0,0.120189


In [195]:
print("Mean squared error is ", ans/(len(df_d['y'])))

Mean squared error is  177.16569974554707


In [196]:
print("MAPE of the given data: ", sum(df_d['mape_val'])/len(df_d['y']))

MAPE of the given data:  0.12912029940096315


In [197]:
r2 = 1 - (ans / denm)
print("R2 Score: ", r2)

R2 Score:  0.9563582786990964
