# Compute performance metrics for the given Y and Y_score without sklearn

In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm #imported to check the progress of the for loops


## A. Compute performance metrics for the given data '5_a.csv'
 <pre>  <b>Note 1:</b> in this data you can see number of positive points >> number of negatives points
   <b>Note 2:</b> use pandas or numpy to read the data from <b>5_a.csv</b>
   <b>Note 3:</b> you need to derive the class labels from given score</pre> $y^{pred}= \text{[0 if y_score < 0.5 else 1]}$

<pre>
<ol>
<li> Compute Confusion Matrix </li>
<li> Compute F1 Score </li>
<li> Compute AUC Score, you need to compute different thresholds and for each threshold compute tpr,fpr and then use               numpy.trapz(tpr_array, fpr_array) <a href='https://stackoverflow.com/q/53603376/4084039'>https://stackoverflow.com/q/53603376/4084039</a>, <a href='https://stackoverflow.com/a/39678975/4084039'>https://stackoverflow.com/a/39678975/4084039</a> Note: it should be numpy.trapz(tpr_array, fpr_array) not numpy.trapz(fpr_array, tpr_array)
Note- Make sure that you arrange your probability scores in descending order while calculating AUC</li>
<li> Compute Accuracy Score </li>
</ol>
</pre>

**1. Compute Confusion Matrix**


In [4]:
#reading the csv files to pandas
df_a=pd.read_csv('5_a.csv')
df_a.head()

Unnamed: 0,y,proba
0,1.0,0.637387
1,1.0,0.635165
2,1.0,0.766586
3,1.0,0.724564
4,1.0,0.889199


In [5]:
#creating the new column predicted values of y
confusion_df_a = df_a
confusion_df_a["y^"] = np.where(confusion_df_a["proba"] >= 0.5, 1.0, 0.0)
confusion_df_a.head()

Unnamed: 0,y,proba,y^
0,1.0,0.637387,1.0
1,1.0,0.635165,1.0
2,1.0,0.766586,1.0
3,1.0,0.724564,1.0
4,1.0,0.889199,1.0


In [6]:
#calculating tp,tn,fp and fn using lenght mesurements of dataframes
TP = len(confusion_df_a[(confusion_df_a["y"] == 1.0) & (confusion_df_a["y^"] == 1.0)])
TN = len(confusion_df_a[(confusion_df_a["y"] == 0.0) & (confusion_df_a["y^"] == 0.0)])
FP = len(confusion_df_a[(confusion_df_a["y"] == 0.0) & (confusion_df_a["y^"] == 1.0)])
FN = len(confusion_df_a[(confusion_df_a["y"] == 1.0) & (confusion_df_a["y^"] == 0.0)])

In [9]:
#printing the confusion matrix
print("{:<6} Confusion Matrix".format(""))
print("=" * 30)
print("{:<10} Actual values".format(""))
print("-" * 25)
print("Predicted | {:<8} | {}".format("0","1"))
print("-" * 25)
print(" {:<8} | {:<8} | {} ".format("0",TN,FP))
print(" {:<8} | {:<8} | {} ".format("1",FN,TP))

       Confusion Matrix
           Actual values
-------------------------
Predicted | 0        | 1
-------------------------
 0        | 0        | 100 
 1        | 0        | 10000 


**2 . Compute F1 Score** 

In [7]:
#calculating precision, recall and F1 scores
precision = TP / (TP + FP)

recall = TP / (TP + FN)

F1_score = ((2 * precision * recall) / (precision + recall))

print("Precision = {}\n\nRecall = {}\n\nF1 score = {}".format(precision,recall,F1_score))

Precision = 0.9900990099009901

Recall = 1.0

F1 score = 0.9950248756218906


**3.Compute AUC Score**

In [8]:
#finding all the unique probability scores in the dataframes
unique_proba_values_df_a = -np.sort(-df_a.proba.unique())
print(unique_proba_values_df_a.round(decimals = 6))

[0.899965 0.899828 0.899825 ... 0.500058 0.500047 0.500019]


In [14]:
#keeping each unique probability scores as thresholds and class labels are found
num = 1
for i in tqdm(unique_proba_values):
    df_a[[("p",num)]] = np.where((df_a[["proba"]] >= i),1,0)
    num += 1
df_a.loc[[1664]]

100%|██████████████████████████████████████████████████████████████████████████| 10100/10100 [1:19:01<00:00,  2.13it/s]


Unnamed: 0,y,proba,y^,"(p, 1)","(p, 2)","(p, 3)","(p, 4)","(p, 5)","(p, 6)","(p, 7)",...,"(p, 10091)","(p, 10092)","(p, 10093)","(p, 10094)","(p, 10095)","(p, 10096)","(p, 10097)","(p, 10098)","(p, 10099)","(p, 10100)"
1664,1.0,0.899965,1.0,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1


In [128]:
#deleting the proba and Y^ columns and converting the dataframe to numpy
df_a = df_a.drop(columns = ["proba","y^"])
numpy_df_a = df_a.to_numpy()

In [132]:
#initializing the empty list to store TPR and FPR values for AUC calculation
TPR = FPR = list()

#for loop to calculates tp,tn,fn and fp values of each thresholds
for j in tqdm(range(1,len(numpy_df_a))):
    #concatenating the actual values and predicted column j
    C = np.concatenate((numpy_df_a[:,0][:,None],numpy_df_a[:,j][:,None]), axis = 1)
    
    #initialising tp,tn,fn and fp values and calculating the same using the for loop
    T_P = T_N = F_N = F_P = 0
    for i in C:
        if i[0] == 1 and i[1] == 1:
            T_P += 1
        elif i[0] == 0 and i[1] == 0:
            T_N += 1
        elif i[0] == 0 and i[1] == 1:
            F_P += 1
        else:
            F_N += 1
            
    #appending the TPR and FPR of each column
    TPR.append((T_P) / (T_P + F_N))
    FPR.append((F_P) / (F_P + T_N))

100%|████████████████████████████████████████████████████████████████████████████| 10099/10099 [04:32<00:00, 37.03it/s]


In [133]:
#converting the TPR and FPR list into the array
tpr_array = np.asarray(TPR)
fpr_array = np.asarray(FPR)

In [134]:
#calculating the AUC
np.trapz(tpr_array, fpr_array)

0.4999999949999998

**4.Compute Accuracy Score** 


In [94]:
#Accuracy calculation
Accuracy = (TP+TN)/(TP+TN+FP+FN)
print("Accuracy = ", Accuracy)

Accuracy =  0.9900990099009901




## B. Compute performance metrics for the given data '5_b.csv'
<pre>
   <b>Note 1:</b> in this data you can see number of positive points << number of negatives points
   <b>Note 2:</b> use pandas or numpy to read the data from <b>5_b.csv</b>
   <b>Note 3:</b> you need to derive the class labels from given score</pre> $y^{pred}= \text{[0 if y_score < 0.5 else 1]}$

<pre>
<ol>
<li> Compute Confusion Matrix </li>
<li> Compute F1 Score </li>
<li> Compute AUC Score, you need to compute different thresholds and for each threshold compute tpr,fpr and then use               numpy.trapz(tpr_array, fpr_array) <a href='https://stackoverflow.com/q/53603376/4084039'>https://stackoverflow.com/q/53603376/4084039</a>, <a href='https://stackoverflow.com/a/39678975/4084039'>https://stackoverflow.com/a/39678975/4084039</a>
Note- Make sure that you arrange your probability scores in descending order while calculating AUC</li>
<li> Compute Accuracy Score </li>
</ol>
</pre>

**1.Compute Confusion Matrix**

In [10]:
#reading the csv files to pandas
df_b=pd.read_csv('5_b.csv')
df_b.head()

Unnamed: 0,y,proba
0,0.0,0.281035
1,0.0,0.465152
2,0.0,0.352793
3,0.0,0.157818
4,0.0,0.276648


In [11]:
#creating the new column predicted values of y
df_b["y^"] = np.where(df_b["proba"] >= 0.5, 1.0, 0.0)
df_b.head()

Unnamed: 0,y,proba,y^
0,0.0,0.281035,0.0
1,0.0,0.465152,0.0
2,0.0,0.352793,0.0
3,0.0,0.157818,0.0
4,0.0,0.276648,0.0


In [12]:
#calculating values needed for computing performace metrics
TP1 = len(df_b[(df_b["y"] == 1.0) & (df_b["y^"] == 1.0)])
TN1 = len(df_b[(df_b["y"] == 0.0) & (df_b["y^"] == 0.0)])
FP1 = len(df_b[(df_b["y"] == 0.0) & (df_b["y^"] == 1.0)])
FN1 = len(df_b[(df_b["y"] == 1.0) & (df_b["y^"] == 0.0)])

In [14]:
#printing the confusion matrix
print("{:<6} Confusion Matrix".format(""))
print("=" * 30)
print("{:<10} Actual values".format(""))
print("-" * 25)
print("Predicted | {:<8} | {}".format("0","1"))
print("-" * 25)
print(" {:<8} | {:<8} | {} ".format("0",TN1,FP1))
print(" {:<8} | {:<8} | {} ".format("1",FN1,TP1))

       Confusion Matrix
           Actual values
-------------------------
Predicted | 0        | 1
-------------------------
 0        | 9761     | 239 
 1        | 45       | 55 


**2.Compute F1 Score**

In [63]:
#calculating precision, recall and F1 scores
precision_2 = TP1 / (TP1 + FP1)

recall_2 = TP1 / (TP1 + FN1)

F1_score_2 = 2 / ((1/recall_2) + (1/precision_2))
print("Precision = {}\n\nRecall = {}\n\nF1_score = {}".format(precision_2,recall_2,F1_score_2))

Precision = 0.1870748299319728

Recall = 0.55

F1_score = 0.27918781725888325


**3.Compute AUC Score**

In [20]:
#finding all the unique probability scores in the dataframes
unique_proba_values_df_b = -np.sort(-df_b.proba.unique())
unique_proba_values_df_b.round(decimals = 6)

array([0.595294, 0.594808, 0.592198, ..., 0.100165, 0.100161, 0.100001])

In [76]:
#keeping each unique probability scores as thresholds and class labels are found
num = 1
for i in tqdm(unique_proba_values_df_b):
    df_b[[("p",num)]] = np.where((df_b[["proba"]] >= i),1,0)
    num += 1

100%|████████████████████████████████████████████████████████████████████████████| 10100/10100 [56:26<00:00,  2.98it/s]


In [95]:
#deleting the proba and Y^ columns and converting the dataframe to numpy
df_b = df_b.drop(columns = ["proba","y^"])
numpy_df_b = df_b.to_numpy()

In [300]:
#initializing the empty list to store TPR and FPR values for AUC calculation
TPR1 = FPR1 = list()

#for loop to calculates tp,tn,fn and fp values of each thresholds
for j in tqdm(range(1,len(numpy_df_b))):
    #concatenating the actual values and predicted column j
    C1 = np.concatenate((numpy_df_b[:,0][:,None],numpy_df_b[:,j][:,None]), axis = 1)
    
    #initialising tp,tn,fn and fp values and calculating the same using the for loop
    T_P1 = T_N1 = F_N1 = F_P1 = 0
    for i in C1:
        if i[0] == 1 and i[1] == 1:
            T_P1 += 1
        elif i[0] == 0 and i[1] == 0:
            T_N1 += 1
        elif i[0] == 1 and i[1] == 0:
            F_N1 += 1
        else:
            F_P1 += 1
            
    #appending the TPR and FPR of each column
    TPR1.append((T_P1) / (T_P1 + F_N1))
    FPR1.append((F_P1) / (F_P1 + T_N1))

100%|████████████████████████████████████████████████████████████████████████████| 10099/10099 [05:08<00:00, 32.76it/s]


In [301]:
#converting the TPR and FPR list into the array
tpr_array1 = np.asarray(TPR1)
fpr_array1 = np.asarray(FPR1)

In [303]:
#calculating the AUC
np.trapz(tpr_array1, fpr_array1)

0.49985000500000315

**4.Compute Accuracy Score**

In [104]:
#Accuracy calculation
Accuracy = (TP1+TN1)/(TP1+TN1+FP1+FN1)
print("Accuracy = ", Accuracy)

Accuracy =  0.9718811881188119


### C. Compute the best threshold (similarly to ROC curve computation) of probability which gives lowest values of metric <b>A</b> for the given data 
<br>

you will be predicting label of a data points like this: $y^{pred}= \text{[0 if y_score < threshold  else 1]}$

$ A = 500 \times \text{number of false negative} + 100 \times \text{numebr of false positive}$

<pre>
   <b>Note 1:</b> in this data you can see number of negative points > number of positive points
   <b>Note 2:</b> use pandas or numpy to read the data from <b>5_c.csv</b>
</pre>

In [2]:
#reading the csv files to pandas
df_c = pd.read_csv('5_c.csv')
df_c.head()

Unnamed: 0,y,prob
0,0,0.458521
1,0,0.505037
2,0,0.418652
3,0,0.412057
4,0,0.375579


In [3]:
#finding the unique probability values and sorting in desceniding values 
unique_proba_values_df_c = -np.sort(-df_c.prob.unique())
print(unique_proba_values_df_c)

[0.9577468  0.95143692 0.94863779 ... 0.02896366 0.02839574 0.02803799]


In [4]:
#keeping each unique probability scores as thresholds and class labels are found
num = 1
for i in tqdm(unique_proba_values_df_c):
    df_c[[("p",num)]] = np.where((df_c[["prob"]] >= i),1,0)
    num += 1

100%|██████████████████████████████████████████████████████████████████████████████| 2791/2791 [01:22<00:00, 33.76it/s]


In [5]:
#deleting the proba columns and converting the dataframe to numpy
df_c = df_c.drop(columns = ["prob"])
numpy_df_c = df_c.to_numpy()

In [6]:
#initializing the empty list to store scores
A = list()
for j in tqdm(range(1,len(unique_proba_values_df_c))):
    #concatenating the actual values and predicted column j
    C1 = np.concatenate((numpy_df_c[:,0][:,None],numpy_df_c[:,j][:,None]), axis = 1)
    
    #initialising fn and fp values and calculating the same using the for loop
    FP_1 = FN_1 = 0
    for i in C1:
        if i[0] == 0 and i[1] == 1:
            FP_1 += 1
        elif i[0] == 1 and i[1] == 0:
            FN_1 += 1
        else:
            continue
    
    #appending the scores of each column
    A.append((500 * FN_1) + (100 * FP_1))

100%|█████████████████████████████████████████████████████████████████████████████| 2790/2790 [00:18<00:00, 148.53it/s]


In [7]:
#zipping the dictionary with scores and unique values
dictionary = dict(zip(A,unique_proba_values_df_c))

In [8]:
#sorting the list and finding the list scores
sorted_ROC = sorted(A)
print("The lowest values of metric A for the given data",sorted_ROC[0])

The lowest values of metric A for the given data 141000


In [10]:
#printing the threshold values
print(" the best threshold of probability which gives lowest values of metric A for the given data",dictionary.get(sorted_ROC[0]))

 the best threshold of probability which gives lowest values of metric A for the given data 0.2300390278970873



## D.</b></font> Compute performance metrics(for regression) for the given data 5_d.csv
<pre>    <b>Note 2:</b> use pandas or numpy to read the data from <b>5_d.csv</b>
    <b>Note 1:</b> <b>5_d.csv</b> will having two columns Y and predicted_Y both are real valued features
<ol>
<li> Compute Mean Square Error </li>
<li> Compute MAPE: https://www.youtube.com/watch?v=ly6ztgIkUxk</li>
<li> Compute R^2 error: https://en.wikipedia.org/wiki/Coefficient_of_determination#Definitions </li>
</ol>
</pre>

**1.Compute Mean Square Error**

In [22]:
#reading the csv files to pandas
df_d=pd.read_csv('5_d.csv')
df_d.head()

Unnamed: 0,y,pred
0,101.0,100.0
1,120.0,100.0
2,131.0,113.0
3,164.0,125.0
4,154.0,152.0


In [23]:
#creating the columns with squared error
df_d["error"] = (df_d["y"] - df_d["pred"])**2

#calculating the mean squared error and printing the same
Mean_squared_error = (df_d["error"].sum() / len(df_d))
print("The mean squared error is :", Mean_squared_error)

The mean squared error is : 177.16569974554707


**2.Compute MAPE**

In [24]:
#creating the columns with absolute error
df_d["Absolute_error"] = abs(df_d["y"] - df_d["pred"])

#calculating the modified mean absolute percentage error
MAPE = ((df_d["Absolute_error"].sum()) / (df_d["y"].sum()))
print("The modified mean absolute percentage error is :", MAPE)

The modified mean absolute percentage error is : 0.1291202994009687


**3.Compute R^2 error**

In [25]:
#calculating the mean of actual values
df_mean = df_d["y"].mean()

In [26]:
#calculating the total and residuals sum of squared 
df_d["Absolute_error_squared"] = df_d["Absolute_error"]**2
df_d["std"] = (df_d[["pred"]] - df_mean)**2

In [31]:
#calculating the R^2 
print("The co-efficient of determination is ",(1 - (df_d["Absolute_error_squared"].sum() / df_d["std"].sum())))

The co-efficient of determination is  0.9544134826849549
