# Compute performance metrics for the given Y and Y_score without sklearn

<pre>
<font color='red'><b>A.</b></font> Compute performance metrics for the given data <strong>5_a.csv</strong>
   <b>Note 1:</b> in this data you can see number of positive points >> number of negatives points
   <b>Note 2:</b> use pandas or numpy to read the data from <b>5_a.csv</b>
   <b>Note 3:</b> you need to derive the class labels from given score</pre> $y^{pred}= \text{[0 if y_score < 0.5 else 1]}$

<pre>
<ol>
<li> Compute Confusion Matrix </li>
<li> Compute F1 Score </li>
<li> Compute AUC Score, you need to compute different thresholds and for each threshold compute tpr,fpr and then use               numpy.trapz(tpr_array, fpr_array) <a href='https://stackoverflow.com/q/53603376/4084039'>https://stackoverflow.com/q/53603376/4084039</a>, <a href='https://stackoverflow.com/a/39678975/4084039'>https://stackoverflow.com/a/39678975/4084039</a> Note: it should be numpy.trapz(tpr_array, fpr_array) not numpy.trapz(fpr_array, tpr_array)</li>
<li> Compute Accuracy Score </li>
</ol>
</pre>

In [1]:
import numpy as np
import pandas as pd
# other than these two you should not import any other packages

In [2]:
### function to create the confusion matrix
def confusion_matrix(y,y_pred):
    conf_matrix=np.zeros((2,2))
    classes=np.unique(y)
    for i in range(2):
        for j in range(2):
            conf_matrix[i,j]=np.sum((y==classes[i]) & (y_pred ==classes[j]))
    return conf_matrix

### function to create the accuracy score based i=on confusion matrix
def accuracy_score(conf_matrix):
    total=np.sum(conf_matrix)
    total_tn_tp=np.trace(conf_matrix)
    accuracy=total_tn_tp/total
    print("Accuracy score is",accuracy)
    return

### function to create the f1 score
def f1_score(conf_matrix):    
    tot_positive=conf_matrix.item((1,1))
    tp_fp=conf_matrix.item((1,0))+conf_matrix.item((1,1))
    precision=tot_positive/tp_fp
    print("Precision is ",precision)
    
    tp_fn=conf_matrix.item((0,1))+conf_matrix.item((1,1))
    recall=tot_positive/tp_fn
    print("Recall is ",recall)
    
    f1=2*((precision*recall)/(precision+recall))
    print("F1 Score is " ,f1)
    return 

### function to create the auc score
def auc_score(y,y_prob,thresholds):
    fpr=[]
    tpr=[]
    for threshold in thresholds:
        
        y_pred = np.where(y_prob >= threshold, 1, 0)
        #y_pred=np.sort(np.array(y_pred))[::-1]
        
        fp = np.sum((y_pred == 1) & (y == 0))
        tp = np.sum((y_pred == 1) & (y == 1))

        fn = np.sum((y_pred == 0) & (y == 1))
        tn = np.sum((y_pred == 0) & (y == 0))
       
        fpr.append(fp / (fp + tn))
        tpr.append(tp / (tp + fn))

        #auc=np.trapz(tpr,fpr)
   # print(auc)
    return (tpr,fpr)

In [3]:
### load the 5_a csv file into data frame df_a
df_a=pd.read_csv("5_a.csv")
df_a[:5]

Unnamed: 0,y,proba
0,1.0,0.637387
1,1.0,0.635165
2,1.0,0.766586
3,1.0,0.724564
4,1.0,0.889199


In [4]:
### calculated the class labels based on the condition and stored in 'y_pred' column
df_a['y_pred']= df_a.apply(lambda x: 1 if x.proba >0.5 else 0,axis=1)
df_a[:5]

Unnamed: 0,y,proba,y_pred
0,1.0,0.637387,1
1,1.0,0.635165,1
2,1.0,0.766586,1
3,1.0,0.724564,1
4,1.0,0.889199,1


In [5]:
### stores the columns of dataframe in individual colums  
y=list(df_a['y'])
y_pred=list(df_a['y_pred'])
y_prob=list(df_a['proba'])
#print(x)

In [6]:
### calculated the confusion matrix. accuracy score, recall, precision and f1 score by calling respective functions.
conf_matrix=confusion_matrix(y,y_pred)
print("Confusion matrix is")
print(conf_matrix)
accuracy_score(conf_matrix)
f1_score(conf_matrix)

Confusion matrix is
[[    0.   100.]
 [    0. 10000.]]
Accuracy score is 0.9900990099009901
Precision is  1.0
Recall is  0.9900990099009901
F1 Score is  0.9950248756218906


In [7]:
### sort the vales of proba colums and stored in threshold variable in descending order
thresholds=np.sort(np.array(df_a['proba']))[::-1]
#print(thresholds)

In [8]:
### selected the respetcive columns and stored in the variable y1,y_prob1
y1=df_a['y']
y_prob1=df_a['proba']

In [9]:
### calculated the auc score based on fpr,tpr
tpr,fpr=auc_score(y1,y_prob1,thresholds)
#auc_score(y1,y_prob1,thresholds)
auc=np.trapz(tpr,fpr)

In [10]:
print("AUC Score is : ",auc)

AUC Score is :  0.48829900000000004


<pre>
<font color='red'><b>B.</b></font> Compute performance metrics for the given data <strong>5_b.csv</strong>
   <b>Note 1:</b> in this data you can see number of positive points << number of negatives points
   <b>Note 2:</b> use pandas or numpy to read the data from <b>5_b.csv</b>
   <b>Note 3:</b> you need to derive the class labels from given score</pre> $y^{pred}= \text{[0 if y_score < 0.5 else 1]}$

<pre>
<ol>
<li> Compute Confusion Matrix </li>
<li> Compute F1 Score </li>
<li> Compute AUC Score, you need to compute different thresholds and for each threshold compute tpr,fpr and then use               numpy.trapz(tpr_array, fpr_array) <a href='https://stackoverflow.com/q/53603376/4084039'>https://stackoverflow.com/q/53603376/4084039</a>, <a href='https://stackoverflow.com/a/39678975/4084039'>https://stackoverflow.com/a/39678975/4084039</a></li>
<li> Compute Accuracy Score </li>
</ol>
</pre>

In [11]:
### load the 5_b csv file into data frame df_b
df_b=pd.read_csv("C:\\Users\\poonam\\Desktop\\FIU\\Applied AI Course\\Assignment 5\\5_b.csv")
df_b[:5]

Unnamed: 0,y,proba
0,0.0,0.281035
1,0.0,0.465152
2,0.0,0.352793
3,0.0,0.157818
4,0.0,0.276648


In [12]:
### calculated the class labels based on the condition and stored in 'y_pred' column
df_b['y_pred']= df_b.apply(lambda x: 1 if x.proba >0.5 else 0,axis=1)
#df_b[:5]

In [13]:
### stores the columns of dataframe in individual variables 
yb=df_b['y']
yb_pred=df_b['y_pred']
yb_prob=df_b['proba']
#print(x)

In [14]:
### calculated the confusion matrix. accuracy score, recall, precision and f1 score by calling respective functions.
conf_matrix_b=confusion_matrix(yb,yb_pred)
print("Confusion matrix is")
print(conf_matrix_b)

accuracy_score(conf_matrix_b)
f1_score(conf_matrix_b)

Confusion matrix is
[[9761.  239.]
 [  45.   55.]]
Accuracy score is 0.9718811881188119
Precision is  0.55
Recall is  0.1870748299319728
F1 Score is  0.2791878172588833


In [15]:
### sort the vales of proba colums and stored in threshold variable in descending order
thresholds_b=np.sort(np.array(df_b['proba']))[::-1]

In [16]:
### selected the respetcive columns and stored in the variable y1,y_prob1
yb1=df_b['y']
yb_prob1=df_b['proba']

In [17]:
### calculated the auc score based on fpr,tpr
#auc_score(yb1,yb_prob1,thresholds_b)

In [18]:
### calculated the auc score based on fpr,tpr
tpr,fpr=auc_score(yb1,yb_prob1,thresholds_b)
#auc_score(y1,y_prob1,thresholds)
auc=np.trapz(tpr,fpr)
print("AUC Score is : ",auc)

AUC Score is :  0.9377570000000001


<font color='red'><b>C.</b></font> Compute the best threshold (similarly to ROC curve computation) of probability which gives lowest values of metric <b>A</b> for the given data <strong>5_c.csv</strong>
<br>

you will be predicting label of a data points like this: $y^{pred}= \text{[0 if y_score < threshold  else 1]}$

$ A = 500 \times \text{number of false negative} + 100 \times \text{numebr of false positive}$

<pre>
   <b>Note 1:</b> in this data you can see number of negative points > number of positive points
   <b>Note 2:</b> use pandas or numpy to read the data from <b>5_c.csv</b>
</pre>

In [19]:
data=pd.read_csv('5_c.csv')


In [20]:
dict_matrix_A={}

unique_prob_score=data['prob'].unique()
for i in unique_prob_score:
    data['y_pred']=np.where(data['prob']>=i,1,0)
    FP=((data['y']==0.0)&(data['y_pred']==1.0)).sum()
    FN=((data['y']==1.0)&(data['y_pred']==0.0)).sum()
    A=((500*FN)+(100*FP))
    dict_matrix_A[i]=A
least_A=min(dict_matrix_A.values())
print(least_A)
best_thresholds=[threshold for threshold,m_A in dict_matrix_A.items() if m_A==least_A ]
print("Best Threshold is: ",best_thresholds)


141000
Best Threshold is:  [0.2300390278970873]


<pre>
<font color='red'><b>D.</b></font> Compute performance metrics(for regression) for the given data <strong>5_d.csv</strong>
    <b>Note 2:</b> use pandas or numpy to read the data from <b>5_d.csv</b>
    <b>Note 1:</b> <b>5_d.csv</b> will having two columns Y and predicted_Y both are real valued features
<ol>
<li> Compute Mean Square Error </li>
<li> Compute MAPE: https://www.youtube.com/watch?v=ly6ztgIkUxk</li>
<li> Compute R^2 error: https://en.wikipedia.org/wiki/Coefficient_of_determination#Definitions </li>
</ol>
</pre>

In [21]:
### function to create mean square error
def mean_sqaure_error(y,y_pred):
    mse=np.square(np.subtract(y,y_pred)).mean()
    print("Mean Sqaure Error is ",mse )
    return

### function to create mean absolute error
def mean_absolute_per_error(y,y_pred):
    mape= np.mean(np.abs((y_pred-y) /np.mean(y))) 
    print("Modified Mean absolute percentage Error is ",mape )
    return

### function to create r2 score
def r2_score(y,y_pred):

    ymean=np.mean(y)
    #print(ymean)
    sstotal=np.square(np.subtract(y,ymean)).sum()
    #print(sstotal)

    ssresidual=np.square(np.subtract(y,y_pred)).sum()
    #print(ssresidual)

    r2=1-(ssresidual/sstotal)
    print("r2 score is ",r2)
    return

In [22]:
### load the csv file into data frame
df_d=pd.read_csv("5_d.csv")
#df_d[:5]

In [23]:
### store the indicidual columns into varaiable
y=df_d['y']
y_pred=df_d['pred']

In [24]:
### call the functions 
mean_sqaure_error(y,y_pred)
mean_absolute_per_error(y,y_pred)
r2_score(y,y_pred)

Mean Sqaure Error is  177.16569974554707
Modified Mean absolute percentage Error is  0.12912029940096315
r2 score is  0.9563582786990937
