In [3]:
import pandas as pd
from sklearn import *
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from utils import *

In [4]:
# Training split: used to fit AND validate the solution.
# Test split: used only to report the expected generalization results.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ("./data/quora_train_data.csv", "./data/quora_test_data.csv")
)

In [5]:
# Pull both question columns out of each split as plain lists and pass them
# through the project helper cast_list_as_strings
# (presumably coerces every entry to str — confirm in utils).
q1_train, q2_train = (
    cast_list_as_strings(list(train_df[column]))
    for column in ("question1", "question2")
)
q1_test, q2_test = (
    cast_list_as_strings(list(test_df[column]))
    for column in ("question1", "question2")
)

# Only the training questions are pooled here; the test questions stay separate.
all_questions = q1_train + q2_train

In [6]:
# Fit a unigram-only count vectorizer on the pooled training questions.
# fit() returns the estimator itself, so construction and fitting can be chained.
count_vectorizer = sklearn.feature_extraction.text.CountVectorizer(
    ngram_range=(1, 1),
).fit(all_questions)

# Echo the fitted vectorizer as the cell's displayed value.
count_vectorizer

In [7]:
# Build the feature matrix for each split with the project helper
# get_features_from_df, reusing the same fitted vectorizer for both
# (train first, then test).
X_tr_q1q2, X_te_q1q2 = (
    get_features_from_df(split_df, count_vectorizer)
    for split_df in (train_df, test_df)
)

In [8]:
# Target labels for each split, taken from the "is_duplicate" column as raw arrays.
y_train, y_test = (
    split_df["is_duplicate"].values
    for split_df in (train_df, test_df)
)

In [10]:
# Restore the previously saved perceptron and score it on the training split.
# NOTE(review): joblib is not imported by name in the notebook's import cell;
# presumably it is provided by one of the star imports — confirm.
# joblib.load unpickles the file, so only point it at trusted artifacts.
perceptron = joblib.load("model_artifacts/perceptron_model.joblib", mmap_mode=None)
y_train_pred = perceptron.predict(X_tr_q1q2)

# Label-based metrics are computed from the hard predictions ...
accuracy, precision, recall, f1 = (
    metric(y_train, y_train_pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)
# ... while ROC AUC is computed from the continuous decision_function scores.
roc_auc = roc_auc_score(y_train, perceptron.decision_function(X_tr_q1q2))

print("Perceptron Metrics Train")
for template, value in (
    ("Train Accuracy: {:.4f}", accuracy),
    ("Train Precision: {:.4f}", precision),
    ("Train Recall: {:.4f}", recall),
    ("Train F1-score: {:.4f}", f1),
    ("Train ROC AUC: {:.4f}", roc_auc),
):
    print(template.format(value))

Perceptron Metrics Train
Train Accuracy: 0.7744
Train Precision: 0.7106
Train Recall: 0.6549
Train F1-score: 0.6816
Train ROC AUC: 0.8518


In [1]:
# Score the perceptron on the held-out test split.
# This cell relies on `perceptron`, `X_te_q1q2` and `y_test` created by earlier
# cells, so it must be executed after them (run the notebook top to bottom).
y_test_pred = perceptron.predict(X_te_q1q2)

# Label-based metrics from the hard predictions.
accuracy = accuracy_score(y_test, y_test_pred)
precision = precision_score(y_test, y_test_pred)
recall = recall_score(y_test, y_test_pred)
f1 = f1_score(y_test, y_test_pred)
# ROC AUC from the continuous decision_function scores.
roc_auc = roc_auc_score(y_test, perceptron.decision_function(X_te_q1q2))

# One print call; sep="\n" puts every entry on its own line.
print(
    "Perceptron Metrics Test",
    "Test Accuracy: {:.4f}".format(accuracy),
    "Test Precision: {:.4f}".format(precision),
    "Test Recall: {:.4f}".format(recall),
    "Test F1-score: {:.4f}".format(f1),
    "Test ROC AUC: {:.4f}".format(roc_auc),
    sep="\n",
)

NameError: name 'perceptron' is not defined

In [12]:
# Restore the previously saved logistic-regression model and score it on the
# training split.
# joblib.load unpickles the file, so only point it at trusted artifacts.
logistic = joblib.load("model_artifacts/logistic_model.joblib", mmap_mode=None)
y_train_pred = logistic.predict(X_tr_q1q2)

# Label-based metrics from the hard predictions.
accuracy, precision, recall, f1 = (
    score_fn(y_train, y_train_pred)
    for score_fn in (accuracy_score, precision_score, recall_score, f1_score)
)
# ROC AUC from column 1 of predict_proba
# (presumably the probability of the is_duplicate == 1 class — confirm via logistic.classes_).
roc_auc = roc_auc_score(y_train, logistic.predict_proba(X_tr_q1q2)[:, 1])

report_lines = [
    "Logistic Regression Metrics Train",
    "Train Accuracy: {:.4f}".format(accuracy),
    "Train Precision: {:.4f}".format(precision),
    "Train Recall: {:.4f}".format(recall),
    "Train F1-score: {:.4f}".format(f1),
    "Train ROC AUC: {:.4f}".format(roc_auc),
]
print("\n".join(report_lines))

Logistic Regression Metrics Train
Train Accuracy: 0.8120
Train Precision: 0.7794
Train Recall: 0.6835
Train F1-score: 0.7283
Train ROC AUC: 0.8877


In [14]:
# Score the logistic-regression model on the held-out test split.
# This cell relies on `logistic`, `X_te_q1q2` and `y_test` created by earlier
# cells, so it must be executed after them.
y_test_pred = logistic.predict(X_te_q1q2)

# Label-based metrics from the hard predictions.
accuracy = accuracy_score(y_test, y_test_pred)
precision = precision_score(y_test, y_test_pred)
recall = recall_score(y_test, y_test_pred)
f1 = f1_score(y_test, y_test_pred)
# ROC AUC from column 1 of predict_proba
# (presumably the probability of the is_duplicate == 1 class — confirm via logistic.classes_).
roc_auc = roc_auc_score(y_test, logistic.predict_proba(X_te_q1q2)[:, 1])

# These values come from the TEST split, so they are labelled "Test"
# (the labels previously read "Train", mislabelling the results).
print("Logistic Regression Metrics Test")
print("Test Accuracy: {:.4f}".format(accuracy))
print("Test Precision: {:.4f}".format(precision))
print("Test Recall: {:.4f}".format(recall))
print("Test F1-score: {:.4f}".format(f1))
print("Test ROC AUC: {:.4f}".format(roc_auc))

Logistic Regression Metrics Test
Train Accuracy: 0.7536
Train Precision: 0.6877
Train Recall: 0.6150
Train F1-score: 0.6493
Train ROC AUC: 0.8119
