# Compute performance metrics for the given Y and Y_score without sklearn

In [None]:
import numpy as np
import pandas as pd
from tqdm import tqdm
# Other than the packages imported above, you should not import any other packages

<pre>
<font color='red'><b>A.</b></font> Compute performance metrics for the given data <strong>5_a.csv</strong>
   <b>Note 1:</b> in this data you can see number of positive points >> number of negative points
   <b>Note 2:</b> use pandas or numpy to read the data from <b>5_a.csv</b>
   <b>Note 3:</b> you need to derive the class labels from given score</pre> $y^{pred}= \text{[0 if y_score < 0.5 else 1]}$

<pre>
<ol>
<li> Compute Confusion Matrix </li>
<li> Compute F1 Score </li>
<li> Compute AUC Score, you need to compute different thresholds and for each threshold compute tpr,fpr and then use numpy.trapz(tpr_array, fpr_array) <a href='https://stackoverflow.com/q/53603376/4084039'>https://stackoverflow.com/q/53603376/4084039</a>, <a href='https://stackoverflow.com/a/39678975/4084039'>https://stackoverflow.com/a/39678975/4084039</a> Note: it should be numpy.trapz(tpr_array, fpr_array) not numpy.trapz(fpr_array, tpr_array)</li>
<li> Compute Accuracy Score </li>
</ol>
</pre>

In [None]:
# Read dataset A from its CSV file and preview the first rows.
df_a = pd.read_csv('5_a.csv')
df_a.head()

Unnamed: 0,y,proba
0,1.0,0.637387
1,1.0,0.635165
2,1.0,0.766586
3,1.0,0.724564
4,1.0,0.889199


In [None]:
def cm_scores(df1,df2,a,ps):
    """Count the four confusion-matrix cells for binary labels.

    Parameters
    ----------
    df1 : pandas.DataFrame
        Frame holding the actual labels in column ``a``.
    df2 : pandas.DataFrame
        Frame holding the predicted labels in column ``ps``.
    a : str
        Name of the actual-label column in ``df1``.
    ps : str
        Name of the predicted-label column in ``df2``.

    Returns
    -------
    tuple
        ``(tp, tn, fp, fn)``. Only rows whose actual and predicted values
        are exactly 0 or 1 contribute to a cell.
    """
    actual = df1[a]
    predicted = df2[ps]

    actual_pos, actual_neg = actual == 1, actual == 0
    pred_pos, pred_neg = predicted == 1, predicted == 0

    # Summing the combined boolean masks counts matching rows directly and
    # avoids integer-position lookups on a label-indexed Series.
    tp = (actual_pos & pred_pos).sum()
    tn = (actual_neg & pred_neg).sum()
    fp = (actual_neg & pred_pos).sum()
    fn = (actual_pos & pred_neg).sum()
    return tp,tn,fp,fn

In [None]:
def confusion_matrix(df,a,ps,threshold=0.5):
    """Build a 2x2 confusion matrix from actual labels and predicted scores.

    A row is predicted as class 1 when its score is greater than or equal to
    ``threshold`` and as class 0 otherwise, so a score exactly on the
    threshold is counted as positive instead of being dropped.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the label column ``a`` and the score column ``ps``.
        It is only read, never modified.
    a : str
        Name of the column with the actual 0/1 labels.
    ps : str
        Name of the column with the predicted scores.
    threshold : float, optional
        Decision threshold, 0.5 by default.

    Returns
    -------
    numpy.ndarray
        Integer array of shape (2, 2) laid out as ``[[tn, fn], [fp, tp]]``
        (rows are the predicted class, columns the actual class).
    """
    actual = df[a].to_numpy()
    score = df[ps].to_numpy(dtype=float)

    # Both comparisons are False for a NaN score, so such rows land in no cell.
    pred_pos = score >= threshold
    pred_neg = score < threshold
    actual_pos = actual == 1
    actual_neg = actual == 0

    tp = np.count_nonzero(actual_pos & pred_pos)
    tn = np.count_nonzero(actual_neg & pred_neg)
    fp = np.count_nonzero(actual_neg & pred_pos)
    fn = np.count_nonzero(actual_pos & pred_neg)
    return np.array([[tn, fn], [fp, tp]], dtype=np.int64)

In [None]:
def scores(cm):
    """Compute the F1 score and the accuracy from a 2x2 confusion matrix.

    Parameters
    ----------
    cm : 2x2 array-like
        Confusion matrix laid out as ``[[tn, fn], [fp, tp]]``.

    Returns
    -------
    tuple
        ``(f1_score, accuracy_score)``. The F1 score is 0.0 when there are no
        true positives (precision and recall are then zero or undefined).
        The accuracy is NaN when the matrix contains no samples at all.
    """
    tn, fn = cm[0][0], cm[0][1]
    fp, tp = cm[1][0], cm[1][1]

    total = tn + fn + fp + tp
    accuracy_score = (tn + tp) / total if total else float('nan')

    if tp == 0:
        # Without true positives the harmonic mean would be 0/0.
        f1_score = 0.0
    else:
        precision = tp / (tp + fp)
        recall = tp / (tp + fn)
        f1_score = 2 * ((precision * recall) / (precision + recall))
    return f1_score,accuracy_score

In [None]:
def auc_score(df,a,ps):
    """Compute the area under the ROC curve without sklearn.

    Every distinct score is used as a threshold (a row is predicted positive
    when its score is >= the threshold). The (fpr, tpr) point of each
    threshold is obtained from cumulative counts over the rows sorted by
    decreasing score, so the whole curve costs one sort instead of one full
    pass over the data per row.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the label column ``a`` and the score column ``ps``.
        It is only read, never modified or re-ordered.
    a : str
        Name of the column with the actual 0/1 labels.
    ps : str
        Name of the column with the predicted scores.

    Returns
    -------
    float
        Trapezoidal area under the ROC curve, or NaN when the data holds no
        positive or no negative rows. Rows with a NaN score and rows whose
        label is neither 0 nor 1 do not contribute.
    """
    labels = df[a].to_numpy()
    probs = df[ps].to_numpy(dtype=float)

    has_score = ~np.isnan(probs)
    labels, probs = labels[has_score], probs[has_score]

    is_pos = labels == 1
    is_neg = labels == 0
    n_pos = np.count_nonzero(is_pos)
    n_neg = np.count_nonzero(is_neg)
    if n_pos == 0 or n_neg == 0:
        # tpr or fpr would be 0/0 for every threshold.
        return float('nan')

    # Stable sort by decreasing score.
    order = np.argsort(-probs, kind='mergesort')
    probs = probs[order]
    tp_cum = np.cumsum(is_pos[order])
    fp_cum = np.cumsum(is_neg[order])

    # Tied scores share one threshold: keep only the last row of each run so
    # that every retained point includes all rows with score >= threshold.
    last_of_tie = np.r_[np.flatnonzero(probs[1:] != probs[:-1]), probs.size - 1]

    # Prepend the origin (threshold above every score) so the area before the
    # first point is not lost when the top score is tied across classes.
    tpr = np.r_[0.0, tp_cum[last_of_tie] / n_pos]
    fpr = np.r_[0.0, fp_cum[last_of_tie] / n_neg]

    # np.trapz was renamed np.trapezoid in NumPy 2.0; use whichever exists.
    integrate = getattr(np, 'trapezoid', None) or np.trapz
    return integrate(tpr, fpr)

In [None]:
# Confusion matrix for dataset A, printed between two 80-character rules.
cmr = confusion_matrix(df_a, 'y', 'proba')
print('-'*80, 'confusion matrix\n', cmr, '-'*80, sep='\n')

--------------------------------------------------------------------------------
confusion matrix

[[    0     0]
 [  100 10000]]
--------------------------------------------------------------------------------


In [None]:
# F1 score and accuracy derived from dataset A's confusion matrix.
f1, asc = scores(cmr)
print('-'*80,
      'f1-score : {}'.format(f1),
      'Accuracy Score : {}'.format(asc),
      '-'*80,
      sep='\n')

--------------------------------------------------------------------------------
f1-score : 0.9950248756218906
Accuracy Score : 0.9900990099009901
--------------------------------------------------------------------------------


In [None]:
# Area under the ROC curve for dataset A.
AUC = auc_score(df_a, 'y', 'proba')
print('-'*80, 'AUC Score : {}'.format(AUC), '-'*80, sep='\n')

100%|████████████████████████████████████████████████████████████████████████████| 10100/10100 [02:47<00:00, 60.19it/s]


--------------------------------------------------------------------------------
AUC Score : 0.48829900000000004
--------------------------------------------------------------------------------


In [None]:
from sklearn import metrics

In [None]:
# Display the first rows of dataset A again.
df_a.head()

Unnamed: 0,y,proba
0,1.0,0.637387
1,1.0,0.635165
2,1.0,0.766586
3,1.0,0.724564
4,1.0,0.889199


<pre>
<font color='red'><b>B.</b></font> Compute performance metrics for the given data <strong>5_b.csv</strong>
   <b>Note 1:</b> in this data you can see number of positive points << number of negative points
   <b>Note 2:</b> use pandas or numpy to read the data from <b>5_b.csv</b>
   <b>Note 3:</b> you need to derive the class labels from given score</pre> $y^{pred}= \text{[0 if y_score < 0.5 else 1]}$

<pre>
<ol>
<li> Compute Confusion Matrix </li>
<li> Compute F1 Score </li>
<li> Compute AUC Score, you need to compute different thresholds and for each threshold compute tpr,fpr and then use numpy.trapz(tpr_array, fpr_array) <a href='https://stackoverflow.com/q/53603376/4084039'>https://stackoverflow.com/q/53603376/4084039</a>, <a href='https://stackoverflow.com/a/39678975/4084039'>https://stackoverflow.com/a/39678975/4084039</a></li>
<li> Compute Accuracy Score </li>
</ol>
</pre>

In [None]:
# Read dataset B from its CSV file and preview the first rows.
df_b = pd.read_csv('5_b.csv')
df_b.head()

Unnamed: 0,y,proba
0,0.0,0.281035
1,0.0,0.465152
2,0.0,0.352793
3,0.0,0.157818
4,0.0,0.276648


In [None]:
# Confusion matrix for dataset B, printed between two 80-character rules.
cmr_b = confusion_matrix(df_b, 'y', 'proba')
print('-'*80, 'confusion matrix\n', cmr_b, '-'*80, sep='\n')

--------------------------------------------------------------------------------
confusion matrix

[[9761   45]
 [ 239   55]]
--------------------------------------------------------------------------------


In [None]:
# F1 score and accuracy derived from dataset B's confusion matrix.
f1_b, asc_b = scores(cmr_b)
print('-'*80,
      'f1-score : {}'.format(f1_b),
      'Accuracy Score : {}'.format(asc_b),
      '-'*80,
      sep='\n')

--------------------------------------------------------------------------------
f1-score : 0.2791878172588833
Accuracy Score : 0.9718811881188119
--------------------------------------------------------------------------------


In [None]:
# Area under the ROC curve for dataset B.
AUC_b = auc_score(df_b, 'y', 'proba')
print('-'*80, 'AUC Score : {}'.format(AUC_b), '-'*80, sep='\n')

100%|███████████████████████████████████████████████████████████████████████████| 10100/10100 [01:32<00:00, 109.11it/s]


--------------------------------------------------------------------------------
AUC Score : 0.9377570000000001
--------------------------------------------------------------------------------


<font color='red'><b>C.</b></font> Compute the best probability threshold (similarly to the ROC curve computation), i.e. the one that gives the lowest value of metric <b>A</b> for the given data <strong>5_c.csv</strong>
<br>

you will be predicting the label of a data point like this: $y^{pred}= \text{[0 if y_score < threshold  else 1]}$

$ A = 500 \times \text{number of false negatives} + 100 \times \text{number of false positives}$

<pre>
   <b>Note 1:</b> in this data you can see number of negative points > number of positive points
   <b>Note 2:</b> use pandas or numpy to read the data from <b>5_c.csv</b>
</pre>

In [None]:
# Read dataset C from its CSV file and preview the first rows.
df_c = pd.read_csv('5_c.csv')
df_c.head()

Unnamed: 0,y,prob
0,0,0.458521
1,0,0.505037
2,0,0.418652
3,0,0.412057
4,0,0.375579


In [None]:
def best_threshold(df,a,ps,fn_cost=500,fp_cost=100):
    """Find the score threshold that minimises the misclassification cost.

    The cost of a threshold is
    ``fn_cost * (false negatives) + fp_cost * (false positives)``, where a
    row is predicted positive when its score is >= the threshold. Every
    distinct score in the data is tried as a threshold; the counts for all
    of them are derived from cumulative sums over the rows sorted by
    decreasing score.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the label column ``a`` and the score column ``ps``.
        It is only read, never modified or re-ordered.
    a : str
        Name of the column with the actual 0/1 labels.
    ps : str
        Name of the column with the predicted scores.
    fn_cost : number, optional
        Cost of one false negative, 500 by default.
    fp_cost : number, optional
        Cost of one false positive, 100 by default.

    Returns
    -------
    float
        The score with the lowest cost. When several thresholds share the
        minimal cost, the lowest of them is returned.

    Raises
    ------
    ValueError
        If the frame holds no row with a usable (non-NaN) score.
    """
    labels = df[a].to_numpy()
    probs = df[ps].to_numpy(dtype=float)

    has_score = ~np.isnan(probs)
    labels, probs = labels[has_score], probs[has_score]
    if probs.size == 0:
        raise ValueError('no scores available to choose a threshold from')

    # Stable sort by decreasing score.
    order = np.argsort(-probs, kind='mergesort')
    probs = probs[order]
    is_pos = (labels == 1)[order]
    is_neg = (labels == 0)[order]

    # Tied scores share one threshold: keep the last row of each run so the
    # cumulative counts cover every row with score >= threshold.
    last_of_tie = np.r_[np.flatnonzero(probs[1:] != probs[:-1]), probs.size - 1]
    tp = np.cumsum(is_pos)[last_of_tie]
    fp = np.cumsum(is_neg)[last_of_tie]
    fn = tp[-1] - tp  # tp[-1] is the total number of positive rows

    cost = fn_cost * fn + fp_cost * fp
    # Thresholds are in decreasing order; the last minimiser is the lowest one.
    best = np.flatnonzero(cost == cost.min())[-1]
    return probs[last_of_tie[best]]

In [None]:
# Cost-minimising threshold for dataset C.
bt = best_threshold(df_c, 'y', 'prob')
print('-'*80, 'Best Threshold Value : {}'.format(bt), '-'*80, sep='\n')

100%|█████████████████████████████████████████████████████████████████████████████| 2852/2852 [00:25<00:00, 112.02it/s]


--------------------------------------------------------------------------------
Best Threshold Value : 0.2300390278970873
--------------------------------------------------------------------------------


<pre>
<font color='red'><b>D.</b></font> Compute performance metrics(for regression) for the given data <strong>5_d.csv</strong>
    <b>Note 2:</b> use pandas or numpy to read the data from <b>5_d.csv</b>
    <b>Note 1:</b> <b>5_d.csv</b> has two columns, Y and predicted_Y; both are real-valued features
<ol>
<li> Compute Mean Square Error </li>
<li> Compute MAPE: https://www.youtube.com/watch?v=ly6ztgIkUxk</li>
<li> Compute R^2 error: https://en.wikipedia.org/wiki/Coefficient_of_determination#Definitions </li>
</ol>
</pre>

In [None]:
# Read dataset D from its CSV file and preview the first rows.
df_d = pd.read_csv('5_d.csv')
df_d.head()

Unnamed: 0,y,pred
0,101.0,100.0
1,120.0,100.0
2,131.0,113.0
3,164.0,125.0
4,154.0,152.0


In [None]:
def MSE(df,n):
    """Mean squared error between the first two columns of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose first column (by position) holds the actual values and
        whose second column holds the predictions.
    n : int
        Number of leading rows to use; callers in this notebook pass
        ``len(df)``.

    Returns
    -------
    float
        Sum of squared differences over the first ``n`` rows divided by ``n``.

    Raises
    ------
    ValueError
        If ``n`` is not between 1 and ``len(df)``.
    """
    if n <= 0 or n > len(df):
        raise ValueError('n must be between 1 and len(df), got {}'.format(n))

    # Purely positional selection: works for any row index and avoids the
    # per-row label lookup followed by an integer position lookup.
    actual = df.iloc[:n, 0].to_numpy(dtype=float)
    predicted = df.iloc[:n, 1].to_numpy(dtype=float)
    return np.sum((actual - predicted) ** 2) / n

In [None]:
def r2_score(df,n,a):
    """Coefficient of determination (R^2) of the predictions in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the actual values in column ``a`` and the predictions
        in its second column (by position).
    n : int
        Number of leading rows to use; callers in this notebook pass
        ``len(df)``.
    a : str
        Name of the column with the actual values.

    Returns
    -------
    float
        ``1 - SS_res / SS_tot`` over the first ``n`` rows, where ``SS_tot``
        is taken around the mean of those same rows. When all actual values
        are identical ``SS_tot`` is zero and the result is not finite.

    Raises
    ------
    ValueError
        If ``n`` is not between 1 and ``len(df)``.
    """
    if n <= 0 or n > len(df):
        raise ValueError('n must be between 1 and len(df), got {}'.format(n))

    # Actual values and their mean both come from column ``a`` restricted to
    # the same n rows, so the two sums of squares describe the same sample.
    actual = df[a].iloc[:n].to_numpy(dtype=float)
    predicted = df.iloc[:n, 1].to_numpy(dtype=float)

    ss_tot = np.sum((actual - actual.mean()) ** 2)
    ss_res = np.sum((actual - predicted) ** 2)
    return (1 - (ss_res / ss_tot))

In [None]:
def MAPE(df,n):
    """Absolute percentage error computed as total error over total actuals.

    The value returned is ``sum(|pred - actual|) / sum(actual)`` rather than
    a mean of per-row ratios.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose first column (by position) holds the actual values and
        whose second column holds the predictions.
    n : int
        Number of leading rows to use; callers in this notebook pass
        ``len(df)``.

    Returns
    -------
    float
        Ratio of the summed absolute errors to the summed actual values over
        the first ``n`` rows.

    Raises
    ------
    ValueError
        If ``n`` is not between 1 and ``len(df)``.
    """
    if n <= 0 or n > len(df):
        raise ValueError('n must be between 1 and len(df), got {}'.format(n))

    # Purely positional selection: works for any row index and avoids the
    # per-row label lookup followed by an integer position lookup.
    actual = df.iloc[:n, 0].to_numpy(dtype=float)
    predicted = df.iloc[:n, 1].to_numpy(dtype=float)
    return np.sum(np.abs(predicted - actual)) / np.sum(actual)

In [None]:
# Mean squared error over every row of dataset D.
mse = MSE(df_d, len(df_d))
print('-'*80, 'MSE : {}'.format(mse), '-'*80, sep='\n')

100%|████████████████████████████████████████████████████████████████████████| 157200/157200 [00:38<00:00, 4108.62it/s]


--------------------------------------------------------------------------------
MSE : 177.16569974554707
--------------------------------------------------------------------------------


In [None]:
# MAPE over every row of dataset D.
mape = MAPE(df_d, len(df_d))
print('-'*80, 'MAPE : {}'.format(mape), '-'*80, sep='\n')

100%|████████████████████████████████████████████████████████████████████████| 157200/157200 [00:58<00:00, 2709.84it/s]


--------------------------------------------------------------------------------
MAPE : 0.1291202994009687
--------------------------------------------------------------------------------


In [None]:
# R^2 over every row of dataset D, using column 'y' as the actual values.
r2 = r2_score(df_d, len(df_d), 'y')
print('-'*80, 'r2-Score : {}'.format(r2), '-'*80, sep='\n')

100%|████████████████████████████████████████████████████████████████████████| 157200/157200 [00:58<00:00, 2691.75it/s]


--------------------------------------------------------------------------------
r2-Score : 0.9563582786990964
--------------------------------------------------------------------------------
