### Sample program for calculating accuracy measures and plotting ROC / PR curves

#### Import libraries  

In [None]:
import random
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc, roc_auc_score
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import precision_score, recall_score, accuracy_score
from sklearn.metrics import f1_score, matthews_corrcoef

#### Parameters  

In [None]:
# Path of the input CSV that is loaded into `df` by pd.read_csv in the next code cell.
# The later cells read the columns 'label', 'A', 'B' and 'C' from it.
# NOTE(review): 'label' is presumably a 0/1 ground truth and A/B/C per-method scores — confirm against the file.
csv_in = 'class_pre1.csv'

#### CSV file  

In [None]:
# Load the prediction table: comma-separated, no skipped rows, header on the first line.
df = pd.read_csv(csv_in, delimiter=',', skiprows=0, header=0)
print(df.shape)
# DataFrame.info() writes its report to stdout itself and returns None,
# so wrapping it in print() would append a stray "None" line to the output.
df.info()
display(df.head())

#### Confusion matrix (混同行列) and measurements for prediction accuracy for each method  

In [None]:
print('A')
# Threshold the scores once (score >= 0.5 -> 1, otherwise 0) and reuse the result
# for every metric and for the confusion matrix.
predA = (df['A'] >= 0.5).astype(int)
print('precision:', precision_score(df['label'], predA))
print('recall:', recall_score(df['label'], predA))
print('accuracy(Q2):', accuracy_score(df['label'], predA))
print('f1:', f1_score(df['label'], predA))
print('matthews:', matthews_corrcoef(df['label'], predA))
# Confusion matrix: rows are the true label, columns the predicted label.
# crosstab only emits categories that actually occur, so reindex to the full
# 2x2 layout; a class missing from the labels or the predictions then shows up
# as a zero count instead of raising KeyError in the lookups below.
ctA = pd.crosstab(df['label'], predA).reindex(index=[0, 1], columns=[0, 1], fill_value=0)
fpA = ctA.loc[0, 1]  # true label 0, predicted 1
tnA = ctA.loc[0, 0]  # true label 0, predicted 0
print('false positive rate:', fpA/(fpA+tnA))
display(ctA)

In [None]:
print('B')
# Threshold the scores once (score >= 0.5 -> 1, otherwise 0) and reuse the result
# for every metric and for the confusion matrix.
predB = (df['B'] >= 0.5).astype(int)
print('precision:', precision_score(df['label'], predB))
print('recall:', recall_score(df['label'], predB))
print('accuracy(Q2):', accuracy_score(df['label'], predB))
print('f1:', f1_score(df['label'], predB))
print('matthews:', matthews_corrcoef(df['label'], predB))
# Confusion matrix: rows are the true label, columns the predicted label.
# crosstab only emits categories that actually occur, so reindex to the full
# 2x2 layout; a class missing from the labels or the predictions then shows up
# as a zero count instead of raising KeyError in the lookups below.
ctB = pd.crosstab(df['label'], predB).reindex(index=[0, 1], columns=[0, 1], fill_value=0)
fpB = ctB.loc[0, 1]  # true label 0, predicted 1
tnB = ctB.loc[0, 0]  # true label 0, predicted 0
print('false positive rate:', fpB/(fpB+tnB))
# Show the matrix itself, as the cell for method A does.
display(ctB)

In [None]:
print('C')
# Threshold the scores once (score >= 0.5 -> 1, otherwise 0) and reuse the result
# for every metric and for the confusion matrix.
predC = (df['C'] >= 0.5).astype(int)
print('precision:', precision_score(df['label'], predC))
print('recall:', recall_score(df['label'], predC))
print('accuracy(Q2):', accuracy_score(df['label'], predC))
print('f1:', f1_score(df['label'], predC))
print('matthews:', matthews_corrcoef(df['label'], predC))
# Confusion matrix: rows are the true label, columns the predicted label.
# crosstab only emits categories that actually occur, so reindex to the full
# 2x2 layout; a class missing from the labels or the predictions then shows up
# as a zero count instead of raising KeyError in the lookups below.
ctC = pd.crosstab(df['label'], predC).reindex(index=[0, 1], columns=[0, 1], fill_value=0)
fpC = ctC.loc[0, 1]  # true label 0, predicted 1
tnC = ctC.loc[0, 0]  # true label 0, predicted 0
print('false positive rate:', fpC/(fpC+tnC))
# Show the matrix itself, as the cell for method A does.
display(ctC)

#### ROC curve and its AUC for each method  

In [None]:
# Ground-truth labels, shared by all three methods.
y_true = df['label']

# ROC curve points (false positive rate, true positive rate, thresholds) per method.
# The fpr*/tpr* arrays are what the following cell plots.
fprA, tprA, thresholdsA = roc_curve(y_true, df['A'])
fprB, tprB, thresholdsB = roc_curve(y_true, df['B'])
fprC, tprC, thresholdsC = roc_curve(y_true, df['C'])

# Area under each ROC curve, computed directly from labels and scores.
# (The area can alternatively be obtained from the curve points, e.g. auc(fprA, tprA).)
print('AUC(A):', roc_auc_score(y_true, df['A']))
print('AUC(B):', roc_auc_score(y_true, df['B']))
y_score = df['C']  # y_score ends the cell bound to method C's scores
print('AUC(C):', roc_auc_score(y_true, y_score))

In [None]:
# Draw the three ROC curves on one explicit Axes object.
ax = plt.gca()
ax.plot(fprA, tprA, label='A')
ax.plot(fprB, tprB, label='B')
ax.plot(fprC, tprC, label='C')
ax.plot([0, 1], [0, 1])  # unlabelled y = x reference diagonal
# Equal scaling on both axes so the unit square is drawn as a square.
ax.set_aspect('equal', adjustable='box')
ax.set_title('ROC curve')
ax.set_xlabel('False positive rate')
ax.set_ylabel('True positive rate')
# Small margin around [0, 1] so curves running along the border stay visible.
ax.set_xlim(-0.1, 1.1)
ax.set_ylim(-0.1, 1.1)
ax.legend()
plt.show()

#### PR curve and its AUC for each method  

In [None]:
# Ground-truth labels and the per-method score columns.
y_true = df['label']
y_scoreA, y_scoreB, y_scoreC = df['A'], df['B'], df['C']

# Precision-recall curve points per method.
# Note: thresholdsA/B/C are rebound here and replace the values assigned in the ROC cell.
precisionA, recallA, thresholdsA = precision_recall_curve(y_true, y_scoreA)
precisionB, recallB, thresholdsB = precision_recall_curve(y_true, y_scoreB)
precisionC, recallC, thresholdsC = precision_recall_curve(y_true, y_scoreC)

# Area under each PR curve via auc() on the (recall, precision) points.
# NOTE(review): scikit-learn also provides average_precision_score as a
# non-interpolated summary of the PR curve — consider whether it is preferable here.
print('PR-AUC(A):', auc(recallA, precisionA))
print('PR-AUC(B):', auc(recallB, precisionB))
print('PR-AUC(C):', auc(recallC, precisionC))

In [None]:
# Draw the three precision-recall curves on one explicit Axes object.
ax = plt.gca()
ax.plot(recallA, precisionA, label='A')
ax.plot(recallB, precisionB, label='B')
ax.plot(recallC, precisionC, label='C')
# Equal scaling on both axes so the unit square is drawn as a square.
ax.set_aspect('equal', adjustable='box')
ax.set_title('PR curve')
ax.set_xlabel('Recall')
ax.set_ylabel('Precision')
# Small margin around [0, 1] so curves running along the border stay visible.
ax.set_xlim(-0.1, 1.1)
ax.set_ylim(-0.1, 1.1)
ax.legend()
plt.show()

#### (Accuracy among the highest-scoring predictions, to see what the PR curves mean)  
Accuracy among the top-scoring predictions: B > A > C (this matches the PR-AUC ranking).  

In [None]:
# For each method, show the five rows with the highest score next to their true label.
display(*(
    df[['label', col]].sort_values(by=col, ascending=False).head()
    for col in ('A', 'B', 'C')
))