# Compute performance metrics for the given Y and Y_score without sklearn

In [1]:
import numpy as np
import pandas as pd
# other than these two you should not import any other packages


## A. Compute performance metrics for the given data '5_a.csv'
 <pre>  <b>Note 1:</b> in this data you can see number of positive points >> number of negatives points
   <b>Note 2:</b> use pandas or numpy to read the data from <b>5_a.csv</b>
   <b>Note 3:</b> you need to derive the class labels from given score</pre> $y^{pred}= \text{[0 if y_score < 0.5 else 1]}$

<pre>
<ol>
<li> Compute Confusion Matrix </li>
<li> Compute F1 Score </li>
<li> Compute AUC Score, you need to compute different thresholds and for each threshold compute tpr,fpr and then use               numpy.trapz(tpr_array, fpr_array) <a href='https://stackoverflow.com/q/53603376/4084039'>https://stackoverflow.com/q/53603376/4084039</a>, <a href='https://stackoverflow.com/a/39678975/4084039'>https://stackoverflow.com/a/39678975/4084039</a> Note: it should be numpy.trapz(tpr_array, fpr_array) not numpy.trapz(fpr_array, tpr_array)
Note- Make sure that you arrange your probability scores in descending order while calculating AUC</li>
<li> Compute Accuracy Score </li>
</ol>
</pre>

In [2]:
df_a=pd.read_csv('5_a.csv',names=["y_actual","y_probable"],header = 0)
df_a.columns


Index(['y_actual', 'y_probable'], dtype='object')

In [3]:
df_a.head

<bound method NDFrame.head of        y_actual  y_probable
0           1.0    0.637387
1           1.0    0.635165
2           1.0    0.766586
3           1.0    0.724564
4           1.0    0.889199
...         ...         ...
10095       1.0    0.665371
10096       1.0    0.607961
10097       1.0    0.777724
10098       1.0    0.846036
10099       1.0    0.679507

[10100 rows x 2 columns]>

In [4]:
df_a.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10100 entries, 0 to 10099
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   y_actual    10100 non-null  float64
 1   y_probable  10100 non-null  float64
dtypes: float64(2)
memory usage: 157.9 KB


In [5]:
# write your code here for task A
df_a['y_predicted'] = np.where(df_a['y_probable'] >= 0.5, float(1), float(0))     
df_a.head

<bound method NDFrame.head of        y_actual  y_probable  y_predicted
0           1.0    0.637387          1.0
1           1.0    0.635165          1.0
2           1.0    0.766586          1.0
3           1.0    0.724564          1.0
4           1.0    0.889199          1.0
...         ...         ...          ...
10095       1.0    0.665371          1.0
10096       1.0    0.607961          1.0
10097       1.0    0.777724          1.0
10098       1.0    0.846036          1.0
10099       1.0    0.679507          1.0

[10100 rows x 3 columns]>

In [6]:
unique_classes_a = np.unique(df_a['y_actual'])
print(unique_classes_a)

[0. 1.]


In [7]:
def compute_confusion_matrix(df_a):
    
    actual_y_values_a = df_a.iloc[:, 0].values
    predicted_y_values_a = df_a.iloc[:, 2].values

    unique_classes = np.unique(actual_y_values_a)

    confusion_matrix = np.zeros((len(unique_classes_a), len(unique_classes_a)))
    
    for i in range(len(unique_classes_a)):
        for j in range(len(unique_classes_a)):
            confusion_matrix[i, j] = np.sum((unique_classes_a[i] == actual_y_values_a) & (unique_classes_a[j]  == predicted_y_values_a))
    
    return confusion_matrix

confusion_matrix_a = compute_confusion_matrix(df_a)

print(confusion_matrix_a)   

[[    0.   100.]
 [    0. 10000.]]


In [8]:
def compute_accuracy(confusion_matrix_a):
    accuracy = (confusion_matrix_a[1, 1] + confusion_matrix_a[0, 0]) / (confusion_matrix_a[1, 1] + confusion_matrix_a[1, 0] + confusion_matrix_a[0, 1] + confusion_matrix_a[0, 0])
    return accuracy
    
accuracy_score_a = compute_accuracy(confusion_matrix_a)

print(accuracy_score_a)

0.9900990099009901


In [9]:
def compute_f1_score(confusion_matrix_a):
    precision = confusion_matrix_a[1, 1] / (confusion_matrix_a[1, 1] + confusion_matrix_a[0, 1])
    recall = confusion_matrix_a[1, 1] / (confusion_matrix_a[1, 1] + confusion_matrix_a[1, 0])
    f1 = (2 * precision * recall) / (precision + recall)
    return f1
    
f1_score_a = compute_f1_score(confusion_matrix_a)

print(f1_score_a)

0.9950248756218906


In [10]:
def compute_all_thresholds_a(df_a):
    tpr_a_for_all_thresholds = []
    fpr_a_for_all_thresholds = []

    sorted_df_a = df_a.sort_values(by=['y_probable'], ascending=False)

    unique_probability_thresholds_a = sorted_df_a['y_probable'].unique()

    for threshold in tqdm(unique_probability_thresholds_a):
        sorted_df_a['y_predicted'] = np.where(sorted_df_a['y_probable'] >= threshold, 1, 0)
        cm_a_sorted = compute_confusion_matrix(sorted_df_a)
        tp = cm_a_sorted[1, 1]
        fp = cm_a_sorted[0, 1]
        fn = cm_a_sorted[1, 0]
        tn = cm_a_sorted[0, 0]
        tpr = tp / (tp + fn )
        fpr = fp / (fp + tn)
        
        tpr_a_for_all_thresholds.append(tpr)
        fpr_a_for_all_thresholds.append(fpr)

    return tpr_a_for_all_thresholds, fpr_a_for_all_thresholds

from tqdm import tqdm
all_tpr_5_a, all_fpr_5_a = compute_all_thresholds_a(df_a)
auc_score_5_a = np.trapz(all_tpr_5_a, all_fpr_5_a)
print('My Custom function ROC-AUC Score for 5_a.csv: ', auc_score_5_a)


100%|██████████| 10100/10100 [00:07<00:00, 1277.52it/s]

My Custom function ROC-AUC Score for 5_a.csv:  0.48829900000000004







## B. Compute performance metrics for the given data '5_b.csv'
<pre>
   <b>Note 1:</b> in this data you can see number of positive points << number of negatives points
   <b>Note 2:</b> use pandas or numpy to read the data from <b>5_b.csv</b>
   <b>Note 3:</b> you need to derive the class labels from given score</pre> $y^{pred}= \text{[0 if y_score < 0.5 else 1]}$

<pre>
<ol>
<li> Compute Confusion Matrix </li>
<li> Compute F1 Score </li>
<li> Compute AUC Score, you need to compute different thresholds and for each threshold compute tpr,fpr and then use               numpy.trapz(tpr_array, fpr_array) <a href='https://stackoverflow.com/q/53603376/4084039'>https://stackoverflow.com/q/53603376/4084039</a>, <a href='https://stackoverflow.com/a/39678975/4084039'>https://stackoverflow.com/a/39678975/4084039</a>
Note- Make sure that you arrange your probability scores in descending order while calculating AUC</li>
<li> Compute Accuracy Score </li>
</ol>
</pre>

In [11]:
df_b=pd.read_csv('5_b.csv',names=["y_actual","y_probable"],header = 0)
df_b.head()

Unnamed: 0,y_actual,y_probable
0,0.0,0.281035
1,0.0,0.465152
2,0.0,0.352793
3,0.0,0.157818
4,0.0,0.276648


In [12]:
df_b.columns

Index(['y_actual', 'y_probable'], dtype='object')

In [13]:
df_b.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10100 entries, 0 to 10099
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   y_actual    10100 non-null  float64
 1   y_probable  10100 non-null  float64
dtypes: float64(2)
memory usage: 157.9 KB


In [14]:
df_b['y_predicted'] = np.where(df_b['y_probable'] >= 0.5, float(1), float(0))

In [15]:
unique_classes = np.unique(df_b['y_actual'])
print(unique_classes)

[0. 1.]


In [16]:
# write your code here for task B
cm_b = compute_confusion_matrix(df_b)
print(cm_b)

[[9761.  239.]
 [  45.   55.]]


In [17]:
def compute_accuracy(cm_b):
    accuracy = (cm_b[1, 1] + cm_b[0, 0]) / (cm_b[1, 1] + cm_b[1, 0] + cm_b[0, 1] + cm_b[0, 0])
    return accuracy
    
accuracy_score_b = compute_accuracy(cm_b)

print(accuracy_score_b)

0.9718811881188119


In [18]:
def compute_f1_score(cm_b):
    precision = cm_b[1, 1] / (cm_b[1, 1] + cm_b[0, 1])
    recall = cm_b[1, 1] / (cm_b[1, 1] + cm_b[1, 0])
    f1 = (2 * precision * recall) / (precision + recall)
    return f1
    
f1_score_b = compute_f1_score(cm_b)

print(f1_score_b)

0.2791878172588833


In [19]:

all_tpr_5_b, all_fpr_5_b = compute_all_thresholds_a(df_b)
auc_score_5_b = np.trapz(all_tpr_5_b, all_fpr_5_b)
print('My Custom function ROC-AUC Score for 5_b.csv: ', auc_score_5_b)

100%|██████████| 10100/10100 [00:08<00:00, 1217.98it/s]

My Custom function ROC-AUC Score for 5_b.csv:  0.9377570000000001





### C. Compute the best threshold (similarly to ROC curve computation) of probability which gives lowest values of metric <b>A</b> for the given data 
<br>

you will be predicting label of a data points like this: $y^{pred}= \text{[0 if y_score < threshold  else 1]}$

$ A = 500 \times \text{number of false negative} + 100 \times \text{numebr of false positive}$

<pre>
   <b>Note 1:</b> in this data you can see number of negative points > number of positive points
   <b>Note 2:</b> use pandas or numpy to read the data from <b>5_c.csv</b>
</pre>

In [20]:
df_c=pd.read_csv('5_c.csv',names=["y_actual","y_probable"],header = 0)
df_c.head()

Unnamed: 0,y_actual,y_probable
0,0,0.458521
1,0,0.505037
2,0,0.418652
3,0,0.412057
4,0,0.375579


In [21]:
df_c.columns

Index(['y_actual', 'y_probable'], dtype='object')

In [22]:
df_c.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2852 entries, 0 to 2851
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   y_actual    2852 non-null   int64  
 1   y_probable  2852 non-null   float64
dtypes: float64(1), int64(1)
memory usage: 44.7 KB


In [23]:
from tqdm import tqdm_notebook
def min_metric(data):
    s = data['y'].value_counts()
    P = s[1]
    N = s[0]
    tpr = []
    fpr = []
    metric={}
    for elem in tqdm_notebook(data['y_probable']):
        data['y_predicted']=predict(data,'y_probable',elem)
        tpr.append(confusion_matrix['tp']/P)
        fpr.append(confusion_matrix['fp']/N)
        confusion_matrix=cal_vals(data,'y','y_pred')
        metric_val=(500*confusion_matrix['fn'])+(100*confusion_matrix['fp'])
        metric[elem]=metric_val
        data.drop(columns=['y_pred'])
    return(metric)

data=pd.read_csv('5_c.csv')
print(data.head())
print(data.shape)


   y      prob
0  0  0.458521
1  0  0.505037
2  0  0.418652
3  0  0.412057
4  0  0.375579
(2852, 2)


In [24]:
def predicted_values(sorted_df_c,thresh_hold):
    y_pred=[]
    for label in sorted_df_c['y_probable']:
        if label<thresh_hold:
            y_pred.append(0)
        else:
            y_pred.append(1)
    return y_pred

In [25]:
# write your code for task C
sorted_df_c = df_c.sort_values(by=['y_probable'], ascending=False)

def min_metric(sorted_df_c):
    metric={}
    unique_probability_thresholds_c = sorted_df_c['y_probable'].unique()
    
    min_a = float('inf')
    min_threshold = 0
    
    for elem in tqdm(sorted_df_c['y_probable']):
        sorted_df_c['y_pred']= predicted_values(sorted_df_c,elem)
        cm_c = compute_confusion_matrix(sorted_df_c)
        metric_val=(500*cm_c[1,0])+(100*cm_c[0,1])
        metric[elem]=metric_val
        sorted_df_c.drop(columns=['y_pred'])
    return (metric)

result = min_metric(sorted_df_c)


100%|██████████| 2852/2852 [00:04<00:00, 614.37it/s]


In [26]:
temp = min(result.values()) 
res = [key for key in result if result[key] == temp]
print('the key:value pair for min value of the specified metric is-',res,temp)

the key:value pair for min value of the specified metric is- [0.2300390278970873] 141000.0



## D.</b></font> Compute performance metrics(for regression) for the given data 5_d.csv
<pre>    <b>Note 2:</b> use pandas or numpy to read the data from <b>5_d.csv</b>
    <b>Note 1:</b> <b>5_d.csv</b> will having two columns Y and predicted_Y both are real valued features
<ol>
<li> Compute Mean Square Error </li>
<li> Compute MAPE: https://www.youtube.com/watch?v=ly6ztgIkUxk</li>
<li> Compute R^2 error: https://en.wikipedia.org/wiki/Coefficient_of_determination#Definitions </li>
</ol>
</pre>

In [27]:
df_d=pd.read_csv('5_d.csv',names=["y_actual","y_predict"],header = 0)
df_d.head()

Unnamed: 0,y_actual,y_predict
0,101.0,100.0
1,120.0,100.0
2,131.0,113.0
3,164.0,125.0
4,154.0,152.0


In [28]:
actual_y_values_d = df_d.iloc[:, 0].values
predicted_y_values_d = df_d.iloc[:, 1].values


In [29]:
def calculate_mse(y_actual, y_predicted):
    mse = np.mean((y_actual - y_predicted)**2)
    return mse

print(calculate_mse(actual_y_values_d, predicted_y_values_d))

177.16569974554707


In [30]:
def calculate_mean_absolute_percentage_error(y_actual, y_predicted):
    mape = np.mean((np.abs(y_actual - y_predicted)) / np.mean(y_actual)) * 100
    return mape

print(calculate_mean_absolute_percentage_error(actual_y_values_d, predicted_y_values_d))

12.912029940096867


In [31]:
# write your code for task 5d
    
def calculate_r2_score(y_train, y_predicted):
    y_train_bar = y_train.mean()
    # y_train_bar = np.mean(y_train)

    sum_squared_residual = ((y_train - y_predicted)**2).sum()
    sum_squared_total = ((y_train - y_train_bar)**2).sum()

    return 1 - (sum_squared_residual/sum_squared_total)

print(calculate_r2_score(actual_y_values_d, predicted_y_values_d))



0.9563582786990937
