In [227]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedKFold, cross_validate, train_test_split, cross_val_score, cross_val_predict
from sklearn.metrics import accuracy_score, roc_curve
from sklearn.linear_model import LogisticRegression as LR
from sklearn import metrics 
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import LeaveOneOut

In [205]:
data = pd.read_csv("../Data/10-14_features_filenames.csv")
data = data.dropna()
data = data.sample(frac=1)

In [206]:
data.head()

Unnamed: 0,canny_mean,canny_var,lapl_mean,lapl_var,image_mean,image_var,class,file_name
1314,51.208606,10435.87317,-0.001496,492.567363,57.651936,834.785225,no_dust,ESP_036503_1830_RED.NOMAP.browse-Block-12.jpg
774,49.782801,10216.28693,-0.000351,1400.632332,112.735055,822.330951,no_dust,ESP_011752_1115_RED.NOMAP.browse-Block-8.jpg
1470,0.094113,23.990047,-0.001533,81.191468,184.899508,235.312365,no_dust,ESP_037101_2655_RED.NOMAP.browse-Block-1.jpg
1383,78.17198,13822.99642,-0.001948,1354.542674,142.472385,1669.65144,no_dust,ESP_050933_1355_RED.NOMAP.browse-Block-12.jpg
1208,4.045172,1015.155379,0.000541,91.296399,235.442702,66.181007,no_dust,ESP_025105_2505_RED.NOMAP.browse-Block-10.jpg


In [207]:
data["class"].value_counts()

no_dust    1149
dust        396
Name: class, dtype: int64

In [294]:
# Use a simple regex to get unique HiRise ID location ID numbers.
# So that we can group images of the same region together.

# Separate out the digits which identify the photographed region:
# oooooo_tttt, i.e.
# <orbit_number>_<target_code>
from sklearn.model_selection import GroupKFold
import re
def target_regex(filename):
    region_string = re.search("[0-9]{6}_[0-9]{4}", filename)
    return int(region_string.group())

data["target_location"] = data["file_name"].apply(target_regex)

In [296]:
# Define image groups, input variables, output variables
groups = np.array(data["target_location"])
x = np.array(data.iloc[:,0:5])
y = np.array(data["class"])
y = [1 if name == "dust" else 0 for name in y]
print("Number of images: " + str(len(np.unique(groups))))
print("Unique image groups: " + str(len(np.unique(groups))))

Number of images: 402
Unique image groups: 402


In [299]:
# Show the 34% class imbalance
np.sum(data["class"] == "dust") / np.sum(data["class"] == "no_dust")

0.34464751958224543

In [300]:
# Use 10 folds,
# splitting the dataset into  CV sections
# with roughly 10 samples.
# Get accuracy, recall, precision.
# Sometimes no positive predictions are made,
# resulting in undefined precision, recall, and auc
def cv_scores(model, x, y, label, cv, groups=None):
    model_cv_results = cross_validate(model, x, y, cv=cv, groups=groups, scoring=["accuracy", "precision", "recall", "roc_auc"])
    model_cv_acc_mean = np.mean(model_cv_results["test_accuracy"])
    model_cv_precision_mean = np.mean(model_cv_results["test_precision"])
    model_cv_recall_mean = np.mean(model_cv_results["test_recall"])
    model_cv_auc_mean = np.mean(model_cv_results["test_roc_auc"])
    print(label)
    print("Accuracy: " + str(model_cv_acc_mean))
    print("Precision: " + str(model_cv_precision_mean))
    print("Recall: " + str(model_cv_recall_mean))
    print("AUC: " + str(model_cv_auc_mean))
    

In [301]:
# Define cv splitting objects for use in cv_scores function
group_kfold = GroupKFold(n_splits=10)
sfold = StratifiedKFold(10, shuffle=True)

In [302]:
# Logistic Regression - Without GroupKFold
lr_model = LR()
cv_scores(lr_model, x, y, "Logistic Regression With No Groups", sfold)

Logistic Regression With No Groups
Accuracy: 0.8168412232928363
Precision: 0.6239745416030112
Recall: 0.7371153846153846
AUC: 0.8790465440356744


In [303]:
# Logistic Regression - With GroupKFold
cv_scores(lr_model, x, y, "Logistic Regression With Groups", group_kfold, groups=groups)

Logistic Regression With Groups
Accuracy: 0.81865521575199
Precision: 0.6230617550991059
Recall: 0.7488302170995578
AUC: 0.8802468816590373


In [304]:
# Random Forest
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators = 500)
cv_scores(rf, x, y, "Random Forest With No Groups", sfold)

Random Forest With No Groups
Accuracy: 0.8712023460410556
Precision: 0.7629365504567118
Recall: 0.7274358974358975
AUC: 0.9248295854603062


In [305]:
cv_scores(rf, x, y, "Random Forest With Groups", group_kfold, groups)

Random Forest With Groups
Accuracy: 0.8328780896522833
Precision: 0.6858115001839877
Recall: 0.6294112415947672
AUC: 0.8970086924580014


In [306]:
#Support Vector Classifier
from sklearn.svm import SVC
svc = SVC(random_state = 45, probability = True)
cv_scores(svc, x, y, "Support Vector Classifier With No Groups", sfold)

Support Vector Classifier With No Groups
Accuracy: 0.7728026811897779
Precision: 0.5337362851423346
Recall: 0.9344230769230769
AUC: 0.8903773983453618


In [307]:
cv_scores(svc, x, y, "Support Vector Classifier With Groups", group_kfold, groups)

  _warn_prf(average, modifier, msg_start, len(result))


Support Vector Classifier With Groups
Accuracy: 0.7506577293674068
Precision: 0.4511363447717926
Recall: 0.8538712367765926
AUC: 0.8941751828528753


In [287]:
def tts_predict(model, x, y):
    x_train, x_test, y_train, y_test = train_test_split(x, y)
    model.fit(x_train, y_train)
    y_prob = model.predict_proba(x_test)[:,1]
    y_predict = model.predict(x_test)
    return y_test, y_prob, y_predict

In [None]:
# Take 1000 train/test splits, 
# calculate means of precisions and accuracies
# from predictions.
precision_scores = []
recall_scores = []
auc_scores = []
acc_scores = []

for i in range(1000):
    y_test, y_prob, y_predict = tts_predict(rf, x, y)
    precision_scores.append(precision_score(y_test, y_predict))
    recall_scores.append(recall_score(y_test, y_predict))
    auc_scores.append(roc_auc_score(y_test, y_prob))
    acc_scores.append(accuracy_score(y_test, y_predict))
    
precision_mean = np.mean(precision_scores)
recall_mean = np.mean(recall_scores)
auc_mean = np.mean(auc_scores)
acc_mean = np.mean(acc_scores)

print("Random Forest Metrics - 1000 Train/Test Splits")
print("Accuracy Mean: " + str(acc_mean))
print("Precision Mean: " + str(precision_mean))
print("Recall Mean: " + str(recall_mean))
print("AUC Mean: " + str(auc_mean))

In [None]:
precision_mean = np.mean(precision_scores)
recall_mean = np.mean(recall_scores)
auc_mean = np.mean(auc_scores)
acc_mean = np.mean(acc_scores)

print("Linear Regression Metrics - 1000 Train/Test Splits")
print("Accuracy Mean: " + str(acc_mean))
print("Precision Mean: " + str(precision_mean))
print("Recall Mean: " + str(recall_mean))
print("AUC Mean: " + str(auc_mean))

In [None]:
from sklearn.metrics import plot_roc_curve
fig = plt.gcf()
fig.set_size_inches(8,6)
fig.suptitle("Logistic Regression ROC Curves - 10 Splits")
plt.xlabel("FPR")
plt.ylabel("TPR")
plt.plot([0, 1], [0, 1], linestyle="dotted")
for i in range(10):
    y_test, y_prob, y_predict = tts_predict(lr, x, y)
    fpr, tpr, thresholds = metrics.roc_curve(y_test, y_prob)
    plt.plot(fpr, tpr)

In [None]:
fig = plt.gcf()
fig.set_size_inches(8,6)
fig.suptitle("Random Forest ROC Curves - 10 Splits")
plt.xlabel("FPR")
plt.ylabel("TPR")
plt.plot([0, 1], [0, 1], linestyle="dotted")
for i in range(10):
    y_test, y_prob, y_predict = tts_predict(rf, x, y)
    fpr, tpr, thresholds = metrics.roc_curve(y_test, y_prob)
    plt.plot(fpr, tpr)

In [None]:
fig = plt.gcf()
fig.set_size_inches(8,6)
fig.suptitle("Support Vector Classifier ROC Curves - 10 Splits")
plt.xlabel("FPR")
plt.ylabel("TPR")
plt.plot([0, 1], [0, 1], linestyle="dotted")
for i in range(10):
    y_test, y_prob, y_predict = tts_predict(svc, x, y)
    fpr, tpr, thresholds = metrics.roc_curve(y_test, y_prob)
    plt.plot(fpr, tpr)