# Compute performance metrics for the given Y and Y_score without sklearn

In [64]:
from tqdm import tqdm_notebook #purpose of import is just to check progress
import numpy as np
import pandas as pd
# other than these two you should not import any other packages

<pre>
<font color='red'><b>A.</b></font> Compute performance metrics for the given data <strong>5_a.csv</strong>
   <b>Note 1:</b> in this data you can see number of positive points >> number of negatives points
   <b>Note 2:</b> use pandas or numpy to read the data from <b>5_a.csv</b>
   <b>Note 3:</b> you need to derive the class labels from given score</pre> $y^{pred}= \text{[0 if y_score < 0.5 else 1]}$

<pre>
<ol>
<li> Compute Confusion Matrix </li>
<li> Compute F1 Score </li>
<li> Compute AUC Score, you need to compute different thresholds and for each threshold compute tpr,fpr and then use               numpy.trapz(tpr_array, fpr_array) <a href='https://stackoverflow.com/q/53603376/4084039'>https://stackoverflow.com/q/53603376/4084039</a>, <a href='https://stackoverflow.com/a/39678975/4084039'>https://stackoverflow.com/a/39678975/4084039</a> Note: it should be numpy.trapz(tpr_array, fpr_array) not numpy.trapz(fpr_array, tpr_array)</li>
<li> Compute Accuracy Score </li>
</ol>
</pre>

In [11]:
from tqdm import tqdm_notebook #purpose of import is just to check progress
import numpy as np
import pandas as pd

In [80]:
dataset_a = pd.read_csv("5_a.csv")

In [3]:
def predict(dataset, y, threshold):
    y_pred = []
    for i in dataset[y]:
        if i<threshold:
            y_pred.append(0)
        else:
            y_pred.append(1)
    return y_pred

def compare_values(dataset,y,y_pred):
    count_tp = 0
    count_tn = 0
    count_fp = 0
    count_fn = 0
    for i in range(len(dataset)):
        if (dataset.y[i] == 0) and (dataset.y_pred[i] == 0):
            count_tn+=1
        if (dataset.y[i] == 0) and (dataset.y_pred[i] == 1):
            count_fp+=1
        if (dataset.y[i] == 1) and (dataset.y_pred[i] == 0):
            count_fn+=1
        if (dataset.y[i] == 1) and (dataset.y_pred[i] == 1):
            count_tp+=1
            
    confusion_matrix = {"tn":count_tn, "fn":count_fn, "fp":count_fp, "tp":count_tp}
    return confusion_matrix


In [58]:
def performance_metrics(dataset):
    y_list = list(dataset["y"])
    dataset["y_pred"] = predict(dataset, "proba", 0.5)
    confusion_matrix = compare_values(dataset, "y", "y_pred")
    dataset.drop(columns = ["y_pred"])
    dataset = dataset.sort_values(by = "proba", ascending = False)
    positive_count = dataset.y.value_counts()[1]
    precision = confusion_matrix["tp"]/(confusion_matrix["tp"]+confusion_matrix["fp"])
    recall = confusion_matrix["tp"]/positive_count
    F1 = 2*precision*recall/(precision+recall)
    accuracy = (confusion_matrix["tp"]+confusion_matrix["tn"])/len(dataset)
    #calculating AUC
    negative_count = dataset.y.value_counts()[0]
    tp_rate = []
    fp_rate = []
    for i in tqdm_notebook(dataset["proba"]):
        dataset["y_pred"] = predict(dataset, "proba", i)
        confusion_matrix_1 = compare_values(dataset, "y", "y_pred")
        tp_rate.append(confusion_matrix_1["tp"]/positive_count)
        fp_rate.append(confusion_matrix_1["fp"]/negative_count)
    print(positive_count, negative_count)
    auc = np.trapz(tp_rate, fp_rate)
    
    print("Confusion Matrix : ", confusion_matrix, "\nPrecision : ", precision, "\nRecall : ", recall, "\nF1 Score : ", F1, "\nAUC score : ", auc, "\nAccuracy : ", accuracy)
    return confusion_matrix, precision, recall, F1, auc, accuracy

In [83]:
dataset_a1 = dataset_a[:1000]

In [84]:
confusion_matrix, precision, recall, F1, auc, accuracy = performance_metrics(dataset_a)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for i in tqdm_notebook(dataset["proba"]):


HBox(children=(FloatProgress(value=0.0, max=10100.0), HTML(value='')))


10000 100
Confusion Matrix :  {'tn': 0, 'fn': 0, 'fp': 100, 'tp': 10000} 
Precision :  0.9900990099009901 
Recall :  1.0 
F1 Score :  0.9950248756218906 
AUC score :  0.48829900000000004 
Accuracy :  0.9900990099009901


<pre>
<font color='red'><b>B.</b></font> Compute performance metrics for the given data <strong>5_b.csv</strong>
   <b>Note 1:</b> in this data you can see number of positive points << number of negatives points
   <b>Note 2:</b> use pandas or numpy to read the data from <b>5_b.csv</b>
   <b>Note 3:</b> you need to derive the class labels from given score</pre> $y^{pred}= \text{[0 if y_score < 0.5 else 1]}$

<pre>
<ol>
<li> Compute Confusion Matrix </li>
<li> Compute F1 Score </li>
<li> Compute AUC Score, you need to compute different thresholds and for each threshold compute tpr,fpr and then use               numpy.trapz(tpr_array, fpr_array) <a href='https://stackoverflow.com/q/53603376/4084039'>https://stackoverflow.com/q/53603376/4084039</a>, <a href='https://stackoverflow.com/a/39678975/4084039'>https://stackoverflow.com/a/39678975/4084039</a></li>
<li> Compute Accuracy Score </li>
</ol>
</pre>

In [60]:
dataset_b = pd.read_csv("5_b.csv")

In [61]:
confusion_matrix_b, precision_b, recall_b, F1_b, auc_b, accuracy_b = performance_metrics(dataset_b)
print("Confusion Matrix : ", confusion_matrix_b, "\nPrecision : ", precision_b, "\nRecall : ", recall_b, "\nF1 Score : ", F1_b, "\nAUC score : ", auc_b, "\nAccuracy : ", accuracy_b)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for i in tqdm_notebook(dataset["proba"]):


HBox(children=(FloatProgress(value=0.0, max=10100.0), HTML(value='')))


100 10000
Confusion Matrix :  {'tn': 9761, 'fn': 45, 'fp': 239, 'tp': 55} 
Precision :  0.1870748299319728 
Recall :  0.55 
F1 Score :  0.2791878172588833 
AUC score :  0.9377570000000001 
Accuracy :  0.9718811881188119
Confusion Matrix :  {'tn': 9761, 'fn': 45, 'fp': 239, 'tp': 55} 
Precision :  0.1870748299319728 
Recall :  0.55 
F1 Score :  0.2791878172588833 
AUC score :  0.9377570000000001 
Accuracy :  0.9718811881188119


<font color='red'><b>C.</b></font> Compute the best threshold (similarly to ROC curve computation) of probability which gives lowest values of metric <b>A</b> for the given data <strong>5_c.csv</strong>
<br>

you will be predicting label of a data points like this: $y^{pred}= \text{[0 if y_score < threshold  else 1]}$

$ A = 500 \times \text{number of false negative} + 100 \times \text{numebr of false positive}$

<pre>
   <b>Note 1:</b> in this data you can see number of negative points > number of positive points
   <b>Note 2:</b> use pandas or numpy to read the data from <b>5_c.csv</b>
</pre>

In [1]:
import numpy as np
import pandas as pd

In [2]:
dataset_c = pd.read_csv("5_c.csv")

In [55]:
def minimum_metrics(dataset):
    y_list = list(dataset["y"])
    metric = {}
    for i in tqdm_notebook(dataset["prob"]):
        dataset["y_pred"] = predict(dataset, "prob", i)
        confusion_matrix = compare_values(dataset, "y", "y_pred")
        metric_value = 500*confusion_matrix["fn"]+100*confusion_matrix["fp"]
        metric[i] = metric_value
        dataset.drop(columns = ["y_pred"])
    return metric

In [56]:
dataset_c = dataset_c.sort_values(by = "prob", ascending = False)
result = minimum_metrics(dataset_c)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for i in tqdm_notebook(dataset["prob"]):


HBox(children=(FloatProgress(value=0.0, max=2852.0), HTML(value='')))




In [57]:
min_metric = min(result.values())
min_key = [key for key in result if result[key]==min_metric]
print("The minimum value of specified metric is : ", min_key, min_metric)

The minimum value of specified metric is :  [0.2300390278970873] 141000


<pre>
<font color='red'><b>D.</b></font> Compute performance metrics(for regression) for the given data <strong>5_d.csv</strong>
    <b>Note 2:</b> use pandas or numpy to read the data from <b>5_d.csv</b>
    <b>Note 1:</b> <b>5_d.csv</b> will having two columns Y and predicted_Y both are real valued features
<ol>
<li> Compute Mean Square Error </li>
<li> Compute MAPE: https://www.youtube.com/watch?v=ly6ztgIkUxk</li>
<li> Compute R^2 error: https://en.wikipedia.org/wiki/Coefficient_of_determination#Definitions </li>
</ol>
</pre>

In [25]:
dataset_d = pd.read_csv("5_d.csv")

In [51]:
def error(dataset, col1, col2):
    error_list = []
    for index, (value_1, value_2) in enumerate(zip(dataset[col1], dataset[col2])):
        error_list.append(value_1 - value_2)
    return error_list

def ss_res(dataset, column):
    ss_res = 0
    for index, value in enumerate(dataset[column]):
        ss_res = ss_res+value*value
    return ss_res

def mse(dataset, column):
    return ss_res(dataset, column)/len(dataset[column])

def abs_error(dataset, column):
    abs_error_list = []
    for index, value in enumerate(dataset[column]):
        abs_error_list.append(abs(value))
    return abs_error_list

def mape(dataset, col1, col2):
    mape = sum(dataset[col1])/sum(dataset[col2])
    return mape

def ss_tot(dataset, column):
    ss_tot = 0
    mean = dataset["y"].mean()
    for index, value in enumerate(dataset[column]):
        ss_tot = ss_tot+(value-mean)**2
    return ss_tot

In [52]:
dataset_d["error"] = error(dataset_d, "y", "pred")
dataset_d["abs_error"] = abs_error(dataset_d, "error")
MSE = mse(dataset_d, "error")
MAPE = mape(dataset_d, "abs_error", "y")
SS_RES = ss_res(dataset_d, "error")
SS_TOT = ss_tot(dataset_d, "y")
RSQE = 1-SS_RES/SS_TOT
print("Mean Squared Error: ", MSE, "\nMean Absolute Percentage Error: ", MAPE, "\nR Squared Error: ", RSQE)

Mean Squared Error:  177.16569974554707 
Mean Absolute Percentage Error:  0.1291202994009687 
R Squared Error:  0.9563582786990964
