# About

A compilation of code for testing the classifiers.

In [None]:
def testClassifier(X, y, model_name, parameters = [], test_prop = 0.2):
    """
    @param X > X_train, X_calib, X_test
    @param y > y_train, y_calib, y_test
    @param model: String of the name of models we want to test
    Usage: This function is used to test the performance of a given classifier on confusion matrix,
    precision, recall, f-score, etc. We allow fine-tuning to happen within this function using gridSearchCV.
    Return: None
    NOTE: We do not allow make_pipeline(StandardScaler(), classifier) for now.
    """
    # Set random_state = 1 to compare between models.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = test_prop, random_state = 1)

    # Create and cross-validate the models over parameter space.
    # The cross-validated dataset is X_train, y_train.
    print("Below are the results of %s" %(model_name))
    weights = {0:3, 1:1}
    if model_name == "Logistic":
        model = LogisticRegression(solver = "lbfgs", penalty = "l2",class_weight=weights)
    elif model_name == "SVM":
        model = GridSearchCV(SVC(probability = True, class_weight = weights),parameters)
    elif model_name == "RandomForest":
        model = GridSearchCV(RandomForestClassifier(class_weight = weights), parameters)

    # Start predicting on test set using best model on X_test, y_test.
    model.fit(X_train, y_train)
    try:
        best = model.best_estimator_
    except:
        best = model
    yhat = best.predict(X_test)

    print("Displaying prediction")
    # Display prediction result as follows: confusion matrix, accuracy, precision, recall, fscore
    prediction = list(map(round, yhat))

    ## Confusion matrix
    cm = confusion_matrix(y_test, prediction)
    tn, fp, fn, tp = cm.ravel() # Read from top left > bottom right
    print ("Confusion Matrix : \n", cm)

    ## Accuracy, precision, etc.
    print('Accuracy =  %.2f' %(accuracy_score(y_test, prediction)))
    precision, recall, fscore, support = precision_recall_fscore_support(y_test, prediction, pos_label = 1,
                                                                         average = "binary")
    print("Precision = %.2f\n Recall = %.2f\n F-score = %.2f" %(precision, recall, fscore))

    return X_test, y_test, best

In [None]:
# Define all variables Xiao used
# Binary indicators: 1 when the field is present, 0 when it is missing (NaN).
# notna() is used here because isna() marks the *missing* rows with 1, which is
# the opposite of what the "has_*" column names say.
df600["has_title"] = df600["title"].notna().astype(int)
df600["has_description"] = df600["description"].notna().astype(int)

In [None]:
# Model 1: use every numerical variable.
# TODO: the transcript medical term and description medical term are still lacking.
# keyword_description_cosine; keyword_title_cosine; video_duration; ARI
pl = ["has_description", "ARI", "video_duration", "keyword_title_cosine"]

# First feature group.
xiaof1 = [
    "has_description",
    "has_title",
    "ARI",
    "active_verb",
    "summary_words",
    "transition_words",
    "video_duration",
    "word_count",
    "sentence_count",
    "word_unique",
]

# Second feature group.
xiaof2 = [
    "text_confidence",
    "ARIf2",
    "active_verbf2",
    "summary_wordsf2",
    "transition_wordsf2",
    "speech_confidence",
    "scene_count",
    "object_count",
    "word_uniquef2",
    "sentence_countf2",
]

# Design matrix is both groups side by side; the target is the "understand" column.
X = df600[[*xiaof1, *xiaof2]]
y = df600["understand"]

# Fixed random_state = 1 so that the splits are comparable between models.
# Step 1: hold out 20% of the rows as the test set.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1
)
# Step 2: hold out 12% of the remaining training rows as a calibration set.
X_train, X_calib, y_train, y_calib = train_test_split(
    X_train, y_train, test_size=0.12, random_state=1
)

In [None]:
# Logistic regression: fit, report the test metrics, show the coefficients
# and draw the ROC curve.
model_name = "Logistic"
X_test, y_test, model = testClassifier(X, y, model_name=model_name)
print(model.coef_)
createROC(X_test, y_test, model, model_name)

In [None]:
# SVM: grid-search C over 1, 3, 5, 7, 9 with an RBF kernel, then draw the ROC curve.
model_name = "SVM"
grid = dict(kernel=["rbf"], C=np.arange(1, 10, 2))
X_test, y_test, model = testClassifier(X, y, model_name, grid)
createROC(X_test, y_test, model, model_name)

In [None]:
# Random forest: grid-search the number of trees (10, 30, ..., 190) and the
# maximum depth (1 to 4), then draw the ROC curve.
model_name = "RandomForest"
n_estimators = np.arange(10, 200, 20)
max_depth = np.arange(1, 5)
grid = dict(n_estimators=n_estimators, max_depth=max_depth)
X_test, y_test, model = testClassifier(X, y, model_name, grid)
# Coefficients are not printed for this model.
createROC(X_test, y_test, model, model_name)