In [51]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns

In [52]:
from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, accuracy_score, precision_recall_curve, roc_curve
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score, roc_auc_score

In [53]:
# Read the modelling table from a CSV file; the path is relative to the working directory.
modern = pd.read_csv("modern_met_fe.csv")

In [54]:
# Print column names, non-null counts and dtypes as a sanity check on the loaded frame.
modern.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 215649 entries, 0 to 215648
Data columns (total 32 columns):
 #   Column                                Non-Null Count   Dtype
---  ------                                --------------   -----
 0   highlighted                           215649 non-null  bool 
 1   obj_year                              215649 non-null  int64
 2   department_drawings_and_prints        215649 non-null  int64
 3   department_european_paintings         215649 non-null  int64
 4   department_modern_and_cont_art        215649 non-null  int64
 5   department_photographs                215649 non-null  int64
 6   department_robert_lehman              215649 non-null  int64
 7   department_libraries                  215649 non-null  int64
 8   other_obj_name                        215649 non-null  int64
 9   obj_name_painting                     215649 non-null  int64
 10  obj_name_photo                        215649 non-null  int64
 11  obj_name_print            

In [55]:
# Class balance of the target column: number of highlighted vs. non-highlighted rows.
modern["highlighted"].value_counts()

False    214899
True        750
Name: highlighted, dtype: int64

In [56]:
# Fraction of rows whose boolean `highlighted` flag is True (the positive-class base rate).
n_highlighted = int(modern["highlighted"].sum())
highlight_ratio = n_highlighted / len(modern)
highlight_ratio

0.0034778737670937494

### Model Evaluation Metrics

This model is being built for the moment when the Met is accepting an object into its collection. As we have agreed on 'highlighted' as our measure of visibility, a donor would want to be confident that the piece will actually be 'highlighted' if they donate it to the museum. 

Given the above scenario, let's think about our confusion matrix and which metrics we care about: 
- If we have a FN: our model predicts that a donated object will not be highlighted, but it actually is. To be honest, this is not horrible: if someone decides to donate even though the object is not expected to be highlighted (GOOD FOR THEM, WE LOVE THEM! TRUE LOVER OF ARTS & CULTURE), then they'd be happily surprised to find out that it is highlighted. 
- On the other hand, if we have a FP: this is a worse failure for our model. After all, we don't want to mislead a donor by telling them the object will most likely be highlighted and then let them down when it is not. 

Given the above, our key metric in building this model will be *PRECISION*. 

### Baselining

In [29]:
# Hold out 20% of the rows for evaluation.
# The target is the boolean `highlighted` column; every other column is a feature.
features = modern.drop(columns="highlighted")
target = modern["highlighted"]

# Positives are only ~0.35% of the rows, so stratify on the target: this keeps the
# share of highlighted objects the same in the train and test partitions instead of
# leaving it to chance.
x_train, x_test, y_train, y_test = train_test_split(
    features, target, test_size=0.2, random_state=0, stratify=target
)

In [30]:
# Baseline classifier: logistic regression fitted on the training split.
# In scikit-learn, C is the inverse of the regularisation strength, so C=100 means
# only weak regularisation. max_iter is set very high so the solver is not cut off
# by the iteration cap.
logreg = LogisticRegression(C=100, solver='liblinear', max_iter=500000)
logreg.fit(x_train, y_train)

LogisticRegression(C=100, max_iter=500000)

**Hard Predictions & Confusion Matrix**

In [None]:
# Hard class predictions for the held-out rows (the model's default decision rule).
y_predict_hard = logreg.predict(x_test)
y_predict_hard

In [None]:
# Confusion matrix of the hard predictions: rows are actual classes, columns are predicted classes.
logreg_confusion = confusion_matrix(y_test, y_predict_hard)
logreg_confusion

In [None]:
# Heatmap of the baseline confusion matrix (rows = actual, columns = predicted).
plt.figure(dpi=80)
sns.heatmap(
    logreg_confusion,
    annot=True,
    # The cells are integer counts. seaborn's default format ('.2g') would print
    # large counts in scientific notation (e.g. 4.3e+04), so format them as integers.
    fmt='d',
    xticklabels=['not', 'highlighted'],
    yticklabels=['not', 'highlighted'],
)
plt.xlabel('Highlighting Predicted')
plt.ylabel('Highlighting Actual')
plt.title('Logreg Confusion Matrix: Modern Met')
plt.savefig("Logreg Confusion Matrix for Modern Met.png", bbox_inches='tight');

**Probability Predictions & Precision Recall Curves**

In [None]:
# Predicted probability of the positive class (second column of predict_proba)
# for every held-out row.
y_predict_prob = logreg.predict_proba(x_test)[:, 1]
y_predict_prob

In [None]:
# Precision and recall at every candidate probability threshold.
# Note: scikit-learn returns one more precision/recall value than thresholds; the
# final (precision=1, recall=0) entry has no corresponding threshold.
precision_curve, recall_curve, threshold_curve = precision_recall_curve(y_test, y_predict_prob)

In [None]:
# Precision and recall as functions of the decision threshold.
plt.figure(dpi=80)
# precision_recall_curve returns len(thresholds) + 1 precision/recall values, and the
# extra one is the LAST element (precision=1, recall=0), which has no threshold.
# Dropping the last element keeps entry i of each curve aligned with
# threshold_curve[i]; dropping the first one would shift both curves by one threshold.
plt.plot(threshold_curve, precision_curve[:-1], label='precision')
plt.plot(threshold_curve, recall_curve[:-1], label='recall')
plt.legend(loc='lower left')
plt.xlabel('Threshold (above this probability, label as highlight-possible)');
plt.title('Precision and Recall Curves for Modern Met')
plt.savefig("Precision and Recall Curves for Modern Met.png", bbox_inches='tight');

In [None]:
# Precision-recall trade-off curve.
plt.figure(dpi=80)
# Plot every operating point. precision_curve and recall_curve have the same length,
# so no slicing is needed; slicing off the first element would silently drop the
# lowest-threshold point from the curve.
plt.plot(recall_curve, precision_curve)
plt.xlabel("Recall")
plt.ylabel("Precision")
plt.title("Precision-Recall Curve for Modern Met")
plt.savefig("Precision-Recall Curve for Modern Met.png", bbox_inches='tight');

In [None]:
# False-positive and true-positive rates across thresholds, computed from the
# positive-class probabilities on the held-out rows.
fpr, tpr, thresholds = roc_curve(y_test, y_predict_prob)

In [None]:
# ROC curve of the baseline model, with the chance diagonal for reference.
plt.figure(dpi=80)

plt.plot(fpr, tpr, lw=2)
chance_line = [0, 1]
plt.plot(chance_line, chance_line, c='violet', ls='--')

# Pad both axes slightly so the curve does not sit on the frame.
axis_limits = [-0.05, 1.05]
plt.xlim(axis_limits)
plt.ylim(axis_limits)

plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.title('ROC curve for predicting higlighting at the Met Modern Section')
plt.savefig("ROC curve for predicting higlighting at the Met Modern Section.png", bbox_inches='tight');

# Area under the ROC curve, from the same probabilities that were plotted.
print("ROC AUC score = ", roc_auc_score(y_test, y_predict_prob))

**Scoring**

In [None]:
# Model score on each split via the estimator's own .score() (mean accuracy for
# scikit-learn classifiers). With so few positives, accuracy alone says little.
train_score, test_score = (
    logreg.score(split_x, split_y)
    for split_x, split_y in ((x_train, y_train), (x_test, y_test))
)
print(f"Train score: {train_score}, Test score: {test_score}")

In [None]:
# Precision, recall and F1 of the hard predictions on the held-out rows.
logreg_precision, logreg_recall, logreg_f1 = (
    metric(y_test, y_predict_hard)
    for metric in (precision_score, recall_score, f1_score)
)
print(f"Precision score: {logreg_precision}, Recall score: {logreg_recall}, F1 score: {logreg_f1}")

In [None]:
# ROC AUC of the baseline model from the positive-class probabilities.
roc_auc_logreg = roc_auc_score(y_test, y_predict_prob)
# The ROC points are recomputed here so this cell also (re)defines fpr/tpr/thresholds.
fpr, tpr, thresholds = roc_curve(y_test, y_predict_prob)
print(f"ROC - AUC score: {roc_auc_logreg}")

**Find Threshold for dealing with Class Imbalance (Manually)**

In [None]:
# Manual threshold sweep: for each candidate cut-off, label a row positive when its
# predicted probability exceeds the cut-off, then report precision and the confusion matrix.
# Column 0 of the frame is P(not highlighted), column 1 is P(highlighted).
y_pred_prob_df = pd.DataFrame(logreg.predict_proba(x_test))
print(y_pred_prob_df.iloc[:10, :])
threshold_list = [0.05,0.1,0.15,0.2,0.25,0.3,0.35,0.4,0.45,0.5,0.55,0.6,0.65,.7,.75,.8,.85,.9,.95,.99]
for i in threshold_list:
    print ('\n******** For i = {} ******'.format(i))
    # Vectorised comparison yields the same 0/1 frame as an element-wise lambda via
    # DataFrame.applymap, without a Python call per cell (applymap is also
    # deprecated in recent pandas releases).
    Y_test_pred = (y_pred_prob_df > i).astype(int)
    positive_pred = Y_test_pred.iloc[:, 1]
    test_precision = precision_score(y_test, positive_pred)
    print('Our testing precision is {}'.format(test_precision))

    print(confusion_matrix(y_test, positive_pred))

In [None]:
# Precision on the held-out rows at one specific decision threshold.
# NOTE(review): the value below presumably comes from a threshold search elsewhere in
# the notebook -- confirm it still matches after any re-run.
CHOSEN_THRESHOLD = 0.334679

y_pred_prob_df = pd.DataFrame(logreg.predict_proba(x_test))
# Vectorised comparison instead of an element-wise lambda via DataFrame.applymap
# (same 0/1 result; applymap is deprecated in recent pandas releases).
Y_test_pred = (y_pred_prob_df > CHOSEN_THRESHOLD).astype(int)
test_precision = precision_score(y_test, Y_test_pred.iloc[:, 1])
test_precision

**Class Imbalance (Plotting)**

In [None]:
# Sweep the decision threshold and plot F1 / precision / recall / accuracy against it.
# The held-out test split is reused here as the "validation" data.
X_val = x_test
y_val = y_test

# 1000 evenly spaced candidate thresholds between 0.10 and 0.50.
thresh_ps = np.linspace(.10, .50, 1000)
model_val_probs = logreg.predict_proba(X_val)[:, 1]

f1_scores = []
prec_scores = []
rec_scores = []
acc_scores = []
# Each metric function is paired with the list that collects its values.
metric_sinks = (
    (f1_score, f1_scores),
    (precision_score, prec_scores),
    (recall_score, rec_scores),
    (accuracy_score, acc_scores),
)
for p in thresh_ps:
    # A row is labelled positive when its probability reaches the threshold.
    model_val_labels = model_val_probs >= p
    for metric_fn, sink in metric_sinks:
        sink.append(metric_fn(y_val, model_val_labels))

# One line per metric, drawn in the same order as the legend entries below.
for metric_values in (f1_scores, prec_scores, rec_scores, acc_scores):
    plt.plot(thresh_ps, metric_values)

plt.title('Metric Scores vs. Positive Class Decision Probability Threshold')
plt.legend(['F1','Precision','Recall','Accuracy'], bbox_to_anchor=(1.05, 0), loc='lower left')
plt.xlabel('P threshold')
plt.ylabel('Metric score')

best_f1_score = np.max(f1_scores)
best_prec_score = np.max(prec_scores)
# np.argmax returns the first index of the maximum, i.e. the lowest threshold that
# achieves the best precision.
best_prec_index = np.argmax(prec_scores)
best_thresh_p = thresh_ps[best_prec_index]

print('Logistic Regression Model best Precision score %.3f at prob decision threshold >= %.6f' 
      % (best_prec_score, best_thresh_p))

In [None]:
# Heatmap of the confusion matrix held in `logreg_confusion`.
# NOTE(review): this draws the same matrix as the earlier baseline heatmap and writes
# to the same PNG path, overwriting that file. If the intent was to show the matrix at
# the tuned threshold, it needs to be recomputed first -- confirm.
plt.figure(dpi=80)
sns.heatmap(
    logreg_confusion,
    annot=True,
    # The cells are integer counts. seaborn's default format ('.2g') would print
    # large counts in scientific notation, so format them as integers.
    fmt='d',
    xticklabels=['not', 'highlighted'],
    yticklabels=['not', 'highlighted'],
)
plt.xlabel('Highlighting Predicted')
plt.ylabel('Highlighting Actual')
plt.title('Logreg Confusion Matrix: Modern Met')
plt.savefig("Logreg Confusion Matrix for Modern Met.png", bbox_inches='tight');