In [None]:
import numpy as np
import pickle
import pandas as pd
import math

import sklearn
from sklearn import metrics 
from sklearn import tree
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.decomposition import LatentDirichletAllocation, NMF
from sklearn.model_selection import GridSearchCV

from matplotlib import pyplot as plt
import seaborn as sns


In [None]:
# Load the two pickled objects into wa_df and stan_df (backslash-separated relative paths).
# NOTE(review): pickle.load can execute arbitrary code -- only load files you trust.
with open('dataframes\\final_wa_df.pickle', 'rb') as handle:
    wa_df = pickle.load(handle)
with open('dataframes\\final_stan_df.pickle', 'rb') as handle:
    stan_df = pickle.load(handle)

# Keep the `content` attribute of each object at hand.
wa_content, stan_content = wa_df.content, stan_df.content

In [None]:
# Load the pickled, manually rated frame that the cells below analyse.
# The backslash is escaped, as in the other data-loading cell: a bare `\c` is an
# invalid escape sequence (a warning today, an error in future Python versions).
# NOTE(review): pickle.load can execute arbitrary code -- only load files you trust.
with open('dataframes\\classifierDf.pickle', 'rb') as handle:
    manualRatedDf = pickle.load(handle)


In [None]:
# Summary statistics of DateDiff for the rows where Match == 0,
# including the upper-tail percentiles.
manualRatedDf.loc[manualRatedDf["Match"] == 0, "DateDiff"].describe(
    percentiles=[.75, .90, .95, .99]
)

In [None]:
# Pairwise correlation matrix of three of the candidate features.
manualRatedDf.loc[:, ["Score", "tfidfTitle", "DateDiff"]].corr()

#### classifier test

##### LDA cosine sim

In [None]:
# Variant 1: all four features as predictors, Match as the label.
X = manualRatedDf[['Score', 'tfidfTitle', 'LDAMatch', 'DateDiff']]
y = manualRatedDf["Match"]

# Hold out 25% of the rows; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
    X, y, test_size=0.25, random_state=0
)

In [None]:
# Variant 2: only Score and tfidfTitle as predictors, Match as the label.
# Running this cell replaces X / y and the split produced by the four-feature variant.
X = manualRatedDf[['Score', 'tfidfTitle']]
y = manualRatedDf["Match"]

# Hold out 25% of the rows; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
    X, y, test_size=0.25, random_state=0
)

In [None]:
# Load a previously pickled model into `clf`.
# The backslash is escaped: a bare `\l` is an invalid escape sequence
# (a warning today, an error in future Python versions).
# NOTE(review): pickle.load can execute arbitrary code -- only load files you trust.
with open('models\\logistic_regression', 'rb') as handle:
    clf = pickle.load(handle)

In [None]:
# Hyper-parameter search for LogisticRegressionCV.
# The grid is split by penalty because the L1 penalty is only supported by the
# liblinear and saga solvers; pairing it with newton-cg, lbfgs or sag makes
# those candidate fits fail instead of producing a score.
params = [
    {
        # "C": [0.001,.01, .1, .5, 1, 10],
        "penalty": ["l2"],
        "solver": ["newton-cg", "lbfgs", "liblinear", "sag", "saga"],
        "cv": [5, 10, 15],
    },
    {
        "penalty": ["l1"],
        "solver": ["liblinear", "saga"],
        "cv": [5, 10, 15],
    },
]

# Note: this rebinds `clf`, replacing any model loaded into that name earlier.
clf = GridSearchCV(
    estimator=LogisticRegressionCV(),
    param_grid=params,
    refit=True,
    n_jobs=5,
    verbose=1
)

clf.fit(X_train, y_train)
print(clf.best_params_)
    

In [None]:
# Fit an L2-penalised LogisticRegressionCV (liblinear solver, cv=10) on the
# training split; fit() returns the estimator itself.
logreg = LogisticRegressionCV(cv=10, penalty='l2', solver="liblinear").fit(X_train, y_train)

# Hard class predictions for the held-out split.
y_pred = logreg.predict(X_test)


In [None]:
# Confusion matrix of the held-out predictions (rows: actual, columns: predicted).
cnf_matrix = metrics.confusion_matrix(y_true=y_test, y_pred=y_pred)
cnf_matrix

In [None]:
# Report the headline classification metrics for y_pred against y_test.
for label, scorer in (
    ("Accuracy:", metrics.accuracy_score),
    ("Precision:", metrics.precision_score),
    ("Recall:", metrics.recall_score),
    ("F1 score:", metrics.f1_score),
):
    print(label, scorer(y_test, y_pred))

# Specificity = TN / (TN + FP), read from the first row of the confusion matrix.
print("Specificity:", cnf_matrix[0, 0] / (cnf_matrix[0, 0] + cnf_matrix[0, 1]))

In [None]:
# Display the held-out feature frame.
X_test

In [None]:
X_test.loc[X_test["Match"] == 0]["Score"].describe(percentiles=[0.05,.10, 0.20, 0.5,.75,.90,.99])

In [None]:
X_test["Match"] = y_test.tolist()
X_test.loc[X_test["Match"] == 1]["Score"].describe(percentiles=[0.05,.10, 0.20, 0.5,.75,.90,.99])

In [None]:
# Confusion-matrix heatmap with readable class labels.
# sns.heatmap sets its own ticks from the DataFrame's index/columns, so tick
# labels set beforehand via plt.xticks/plt.yticks are overwritten; the class
# names are therefore attached to the frame itself.
class_names = ['No Match', "Match"]  # name of classes, in label order 0, 1
fig, ax = plt.subplots()

# create heatmap on the explicit axes
sns.heatmap(
    pd.DataFrame(cnf_matrix, index=class_names, columns=class_names),
    annot=True,
    cmap="YlGnBu",
    fmt='g',
    ax=ax,
)
ax.xaxis.set_label_position("top")
ax.set_title('Confusion matrix', y=1.1)
ax.set_ylabel('Actual label')
ax.set_xlabel('Predicted label')
fig.tight_layout()
plt.show()

In [None]:
# ROC curve and AUC of the logistic regression on the held-out split,
# using the predicted probability of the positive class (column 1).
y_pred_proba = logreg.predict_proba(X_test)[:, 1]
fpr, tpr, _ = metrics.roc_curve(y_test, y_pred_proba)
auc = metrics.roc_auc_score(y_test, y_pred_proba)

plt.plot(fpr, tpr, label="Logreg, auc=" + str(auc))
plt.plot([0, 1], [0, 1], linestyle='--')  # chance diagonal
plt.legend(loc='lower right')  # same location as the numeric code 4
plt.show()

In [None]:
# Print the fitted linear decision function as "y = w0 + (w1 * x1) + ...".
# The terms are generated from however many coefficients the model has, so the
# cell works for both the two-feature and the four-feature fit (a fixed 4-way
# unpack raises ValueError on the two-feature model).
w0 = logreg.intercept_[0]
w = logreg.coef_[0]

terms = " + ".join("(%f * x%d)" % (coef, idx) for idx, coef in enumerate(w, start=1))
equation = "y = %f + %s" % (w0, terms)
print(equation)

In [None]:
# Table of exp(coefficient) per feature, sorted from largest to smallest.
# Names and coefficients are both taken from the fitted model so they always
# have the same length (a hard-coded four-name list breaks for a two-feature
# fit); X_train's columns are the fallback when the model does not expose
# feature_names_in_.
feature_names = list(getattr(logreg, "feature_names_in_", X_train.columns))
feature_importance = pd.DataFrame(
    {"Feature": feature_names, "Importance": np.exp(logreg.coef_[0])}
)
feature_importance = feature_importance.sort_values(by=["Importance"], ascending=False)

feature_importance

In [None]:
# Horizontal bar chart of the Importance column, one bar per Feature.
ax = feature_importance.plot(kind='barh', x='Feature', y='Importance')
plt.show()

In [None]:
# Persist the fitted logistic regression.
# The context manager closes (and flushes) the file deterministically, and the
# backslash is escaped because a bare `\l` is an invalid escape sequence.
with open('models\\logistic_regression', 'wb') as handle:
    pickle.dump(logreg, handle)

### Simscore thresholding

In [None]:
# For thresholding, keep the label next to the score in a single frame.
X = manualRatedDf[['Score', 'Match']]
y = manualRatedDf["Match"]

# Same 75/25 split and seed as the classifier sections; only X is split here.
X_train, X_test = sklearn.model_selection.train_test_split(
    X, test_size=0.25, random_state=0
)

In [None]:
# Display the training frame (Score and Match columns).
X_train

In [None]:
# Summary statistics of the training rows where Match == 0.
X_train[X_train['Match'] == 0].describe(
    percentiles=[0.05, 0.10, .25, .50, .75, .90, .95, .99]
)

In [None]:
# Summary statistics of the training rows where Match == 1.
X_train[X_train['Match'] == 1].describe(
    percentiles=[0.05, 0.10, .25, .50, .75, .90, .95, .99]
)

In [None]:
# Number of held-out rows per Match value.
X_test.value_counts(subset=['Match'])

In [None]:
# Classify by a fixed cut-off on Score: 1 when Score >= threshold, else 0.
threshold = 0.643
y_test = X_test['Match']
y_pred = [int(float(score) >= threshold) for score in X_test['Score'].tolist()]

In [None]:
# Confusion matrix (rows: actual, columns: predicted) for the thresholded predictions.
cnf_matrix = metrics.confusion_matrix(y_true=y_test, y_pred=y_pred)

# Report the headline classification metrics.
for label, scorer in (
    ("Accuracy:", metrics.accuracy_score),
    ("Precision:", metrics.precision_score),
    ("Recall:", metrics.recall_score),
    ("F1 score:", metrics.f1_score),
):
    print(label, scorer(y_test, y_pred))

# Specificity = TN / (TN + FP), read from the first row of the confusion matrix.
print("Specificity:", cnf_matrix[0, 0] / (cnf_matrix[0, 0] + cnf_matrix[0, 1]))

Cut-off value 0.611:
Accuracy: 0.8842105263157894
Precision: 0.8823529411764706
Recall: 0.9523809523809523
F1 score: 0.916030534351145
Specificity: 0.75

Cut-off value 0.643:
Accuracy: 0.8947368421052632
Precision: 0.9344262295081968
Recall: 0.9047619047619048
F1 score: 0.9193548387096775
Specificity: 0.875


In [None]:
# Confusion-matrix heatmap with readable class labels.
# sns.heatmap sets its own ticks from the DataFrame's index/columns, so tick
# labels set beforehand via plt.xticks/plt.yticks are overwritten; the class
# names are therefore attached to the frame itself.
class_names = ['No Match', "Match"]  # name of classes, in label order 0, 1
fig, ax = plt.subplots()

# create heatmap on the explicit axes
sns.heatmap(
    pd.DataFrame(cnf_matrix, index=class_names, columns=class_names),
    annot=True,
    cmap="YlGnBu",
    fmt='g',
    ax=ax,
)
ax.xaxis.set_label_position("top")
ax.set_title('Confusion matrix', y=1.1)
ax.set_ylabel('Actual label')
ax.set_xlabel('Predicted label')
fig.tight_layout()
plt.show()

In [None]:
def plot_ROC(fpr, tpr):
    """Draw a ROC curve with a dashed diagonal reference line and show it.

    Parameters
    ----------
    fpr : sequence of numbers
        Values plotted on the x axis (false positive rates).
    tpr : sequence of numbers
        Values plotted on the y axis (true positive rates).
    """
    # Curve first, then the diagonal from (0, 0) to (1, 1).
    plt.plot(fpr, tpr)
    plt.plot([0, 1], [0, 1], linestyle='--')

    plt.title('ROC curve')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')

    plt.show()

In [None]:
# ROC of the raw Score against the Match label, computed over the whole
# manualRatedDf (not only the held-out split).
fpr, tpr, _ = metrics.roc_curve(
    y_true=manualRatedDf["Match"].tolist(),
    y_score=manualRatedDf["Score"].tolist(),
)
auc = metrics.auc(fpr, tpr)  # area under the curve

plot_ROC(fpr, tpr)

In [None]:
# Print the most recently computed area under the ROC curve.
print(auc)

## decision tree

In [None]:
# Variant 1: all four features as predictors, Match as the label.
X = manualRatedDf[['Score', 'tfidfTitle', 'LDAMatch', 'DateDiff']]
y = manualRatedDf["Match"]

# Hold out 25% of the rows; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
    X, y, test_size=0.25, random_state=0
)

In [None]:
# Variant 2: only Score and tfidfTitle as predictors, Match as the label.
# Running this cell replaces X / y and the split produced by the four-feature variant.
X = manualRatedDf[['Score', 'tfidfTitle']]
y = manualRatedDf["Match"]

# Hold out 25% of the rows; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
    X, y, test_size=0.25, random_state=0
)

In [None]:
# Fit a cost-complexity-pruned decision tree on the training split.
# random_state is fixed (matching the estimator used in the grid-search cell)
# so that re-running the notebook yields the same tree.
# Alternative configuration kept for reference:
# clf = tree.DecisionTreeClassifier(ccp_alpha=0.005,criterion="gini", max_depth=2, max_features='sqrt', splitter='best')
clf = tree.DecisionTreeClassifier(
    ccp_alpha=0.005,
    criterion="gini",
    max_depth=4,
    splitter='best',
    random_state=0,
)
clf.fit(X_train, y_train)

In [None]:
# Load a previously pickled model into `clf`, replacing whatever was bound to it.
# The backslash is escaped: a bare `\d` is an invalid escape sequence
# (a warning today, an error in future Python versions).
# NOTE(review): the file name suggests a model trained on all four features --
# confirm that X_test has matching columns before predicting with it.
# NOTE(review): pickle.load can execute arbitrary code -- only load files you trust.
with open('models\\decision_tree_allfeatures', 'rb') as handle:
    clf = pickle.load(handle)

In [None]:
# Class predictions of the current `clf` for the held-out split.
y_pred = clf.predict(X_test)

In [None]:
# Confusion matrix (rows: actual, columns: predicted) for the current predictions.
cnf_matrix = metrics.confusion_matrix(y_true=y_test, y_pred=y_pred)

# Report the headline classification metrics.
for label, scorer in (
    ("Accuracy:", metrics.accuracy_score),
    ("Precision:", metrics.precision_score),
    ("Recall:", metrics.recall_score),
    ("F1 score:", metrics.f1_score),
):
    print(label, scorer(y_test, y_pred))

# Specificity = TN / (TN + FP), read from the first row of the confusion matrix.
print("Specificity:", cnf_matrix[0, 0] / (cnf_matrix[0, 0] + cnf_matrix[0, 1]))

In [None]:
# Confusion-matrix heatmap with readable class labels.
# sns.heatmap sets its own ticks from the DataFrame's index/columns, so tick
# labels set beforehand via plt.xticks/plt.yticks are overwritten; the class
# names are therefore attached to the frame itself.
class_names = ['No Match', "Match"]  # name of classes, in label order 0, 1
fig, ax = plt.subplots()

# create heatmap on the explicit axes
sns.heatmap(
    pd.DataFrame(cnf_matrix, index=class_names, columns=class_names),
    annot=True,
    cmap="YlGnBu",
    fmt='g',
    ax=ax,
)
ax.xaxis.set_label_position("top")
ax.set_title('Confusion matrix', y=1.1)
ax.set_ylabel('Actual label')
ax.set_xlabel('Predicted label')
fig.tight_layout()
plt.show()

In [None]:
# ROC curve and AUC of the current `clf` on the held-out split,
# using the predicted probability of the positive class (column 1).
y_pred_proba = clf.predict_proba(X_test)[:, 1]
fpr, tpr, _ = metrics.roc_curve(y_test, y_pred_proba)
auc = metrics.roc_auc_score(y_test, y_pred_proba)

plt.plot(fpr, tpr, label="Decision tree, auc=" + str(auc))
plt.plot([0, 1], [0, 1], linestyle='--')  # chance diagonal
plt.legend(loc='lower right')  # same location as the numeric code 4
plt.show()

In [None]:
# Recall-oriented hyper-parameter search for the decision tree.
# GridSearchCV is already imported in the notebook's import cell, so it is not
# re-imported here.
n_features = X_train.shape[1]

params = {
    'criterion':  ['gini', 'entropy'],
    'ccp_alpha': [.005, .01, 0.15, 0.2, 0.3, 0.4, 0.5, 0.6],
    'max_depth':  [None, 2,3, 4, 6, 8, 10],
    # Integer max_features values larger than the number of columns make the
    # fit fail, so they are dropped; with four features the grid is unchanged.
    'max_features': [
        m for m in [None, 'sqrt', 'log2', 0.2, 0.4, 0.6, 0.8, 1, 2, 3, 4]
        if not isinstance(m, int) or m <= n_features
    ],
    'splitter': ['best', 'random']
}

# Note: this rebinds `clf` to the search object; the refitted tree is
# available afterwards as clf.best_estimator_.
clf = GridSearchCV(
    estimator=tree.DecisionTreeClassifier(random_state=0),
    scoring='recall',
    param_grid=params,
    cv=5,
    n_jobs=5,
    verbose=1
)

clf.fit(X_train, y_train)
print(clf.best_params_)

{'ccp_alpha': 0.005, 'criterion': 'gini', 'max_depth': 2, 'max_features': 'sqrt', 'splitter': 'best'}

In [None]:
# Number of training rows per label value.
y_train.value_counts()

In [None]:

# Plot the fitted decision tree.
# `clf` may be either a tree or a GridSearchCV wrapper; plot_tree needs the
# tree itself, so unwrap best_estimator_ when it exists.
fitted_tree = getattr(clf, "best_estimator_", clf)

# Prefer the feature names recorded by the model at fit time; fall back to the
# full four-feature list when the model does not expose them.
tree_feature_names = list(
    getattr(fitted_tree, "feature_names_in_", ['Score', 'tfidfTitle', 'LDAMatch', "DateDiff"])
)

# The figure must be created BEFORE plotting; creating it afterwards only
# opens a second, empty figure and leaves the tree at the default size.
plt.figure(figsize=(12,12))
tree.plot_tree(fitted_tree, filled=True, fontsize=10, feature_names=tree_feature_names)
# plt.savefig('decision_tree_simscore_allfeatures2', dpi=100)
plt.show()


In [None]:
# Feature importances of the fitted tree.
# A GridSearchCV object has no feature_importances_ attribute, so unwrap
# best_estimator_ when `clf` is the search object rather than a tree.
importance = getattr(clf, "best_estimator_", clf).feature_importances_
# summarize feature importance
for i, v in enumerate(importance):
    print('Feature: %0d, Score: %.5f' % (i, v))

In [None]:
# Persist the current `clf`.
# The context manager closes (and flushes) the file deterministically, and the
# backslash is escaped because a bare `\d` is an invalid escape sequence.
with open('models\\decision_tree_allfeatures', 'wb') as handle:
    pickle.dump(clf, handle)