# imports

In [2]:
import pickle
import matplotlib.pyplot as plt
import graphviz 
from sklearn import model_selection, naive_bayes, svm, tree
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, recall_score, classification_report
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.cluster import KMeans
from sklearn.tree import export_graphviz
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.manifold import TSNE
import warnings
warnings.filterwarnings('ignore')

categories = pickle.load(open("darkweb/data/pickle_bu/categorieen.pkl", "rb"))
tfvector = pickle.load(open("darkweb/data/pickle_bu/tfidf_vectors.pkl", "rb"))

# TFIDF
**##note to self
in order to predict later on, the text in the predict function needs to be vectorized using tfidf ofcourse**

In [3]:
len(categories), tfvector

(11589, <11589x359989 sparse matrix of type '<class 'numpy.float64'>'
 	with 3081091 stored elements in Compressed Sparse Row format>)

In [4]:
labels = categories
features = tfvector
random = 42
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.33, random_state=random)

## SVM (linear SVC)

In [5]:
SVM = svm.LinearSVC()
SVM.fit(X_train, y_train)
predictions_SVM = SVM.predict(X_test)
print("SVM Accuracy Score: ", accuracy_score(predictions_SVM, y_test))
print("SVM Recall Score: ", recall_score(predictions_SVM, y_test, average='weighted'))
print("SVM F1 Score: ", f1_score(predictions_SVM, y_test, average='weighted'))

SVM Accuracy Score:  0.8426143790849673
SVM Recall Score:  0.8426143790849673
SVM F1 Score:  0.8634566895491402


## multinomial naive bayes

In [6]:
naive = naive_bayes.MultinomialNB()
naive.fit(X_train, y_train)
predictions_NB = naive.predict(X_test)
print("Multinomial NB Accuracy Score: ", accuracy_score(predictions_NB, y_test))
print("Multinomial NB Recall Score: ", recall_score(predictions_NB, y_test, average='weighted'))
print("Multinomial NB F1 Score: ", f1_score(predictions_NB, y_test, average='weighted'))

Multinomial NB Accuracy Score:  0.709281045751634
Multinomial NB Recall Score:  0.709281045751634
Multinomial NB F1 Score:  0.7835957838983414


## Gaussian naive bayes

In [7]:
naive = naive_bayes.GaussianNB()
naive.fit(X_train.todense(), y_train)
predictions_NB = naive.predict(X_test.todense())
print("Gaussian NB Accuracy Score: ", accuracy_score(predictions_NB, y_test))
print("Gaussian NB Recall Score: ", recall_score(predictions_NB, y_test, average='weighted'))
print("Gaussian NB F1 Score: ", f1_score(predictions_NB, y_test, average='weighted'))

Gaussian NB Accuracy Score:  0.8143790849673203
Gaussian NB Recall Score:  0.8143790849673203
Gaussian NB F1 Score:  0.8125811229192027


## Bernoulli NB (same results as multinomial)

In [11]:
bernoulli = naive_bayes.BernoulliNB()
bernoulli.fit(X_train.toarray(), y_train)
predictions_B = naive.predict(X_test.toarray())
print("Bernoulli NB Accuracy Score: ", accuracy_score(predictions_B, y_test))
print("Bernoulli NB Recall Score: ", recall_score(predictions_B, y_test, average='weighted'))
print("Bernoulli NB F1 Score: ", f1_score(predictions_B, y_test, average='weighted'))

Bernoulli NB Accuracy Score:  0.8143790849673203
Bernoulli NB Recall Score:  0.8143790849673203
Bernoulli NB F1 Score:  0.8125811229192027


## Random forest

In [12]:
# Limit max depth
rforest = RandomForestClassifier(max_depth = 3, n_estimators=10, random_state=random)
rforest.fit(X_train, y_train)
estimator_limited = rforest.estimators_[5]

# No max depth
rforest = RandomForestClassifier(max_depth = None, n_estimators=10, random_state=random)
rforest.fit(X_train, y_train)
estimator_nonlimited = rforest.estimators_[5]

In [13]:
# export_graphviz(estimator_limited, out_file='tree_limited.dot', feature_names = features,
#                 class_names = labels,
#                 rounded = True, proportion = False, precision = 2, filled = True)

# export_graphviz(estimator_nonlimited, out_file='tree_nonlimited.dot', feature_names = features,
#                 class_names = labels,
#                 rounded = True, proportion = False, precision = 2, filled = True)

In [14]:
rf = RandomForestClassifier(max_depth=10, random_state=random)
rf.fit(X_train,y_train)
predictions_RF = rf.predict(X_test)
print("RF Accuracy Score: ", accuracy_score(predictions_RF, y_test))
print("RF Recall Score: ", recall_score(predictions_RF, y_test, average='weighted'))
print("RF F1 Score: ", f1_score(predictions_RF, y_test, average='weighted'))

RF Accuracy Score:  0.6515032679738562
RF Recall Score:  0.6515032679738562
RF F1 Score:  0.737896270507299


## decision tree

In [15]:
# def unique(categories): 
#     unique_list = [] 
#     for x in categories:  
#         if x not in unique_list: 
#             unique_list.append(x) 

In [16]:
class_tree = tree.DecisionTreeClassifier(max_depth=3)
class_tree.fit(X_train, y_train)
predictions_DT = class_tree.predict(X_test)
print("Decision Tree Accuracy Score: ", accuracy_score(predictions_DT, y_test))
print("Decision Tree Recall Score: ", recall_score(predictions_DT, y_test, average='weighted'))
print("Decision Tree F1 Score: ", f1_score(predictions_DT, y_test, average='weighted'))

Decision Tree Accuracy Score:  0.6368627450980392
Decision Tree Recall Score:  0.6368627450980392
Decision Tree F1 Score:  0.7530245119326625


In [18]:
# from sklearn.decomposition.truncated_svd import TruncatedSVD 

# pca = TruncatedSVD(n_components=2)
# X_reduced_train = pca.fit_transform(features)

# dot_data = tree.export_graphviz(class_tree, out_file=None, feature_names=X_train, class_names = y_train,  
#                      filled=True, rounded=True, special_characters=True)  
# graph = graphviz.Source(dot_data)  
# graph.render("Decision_tree_viz")

## SGD

In [19]:
sgd = SGDClassifier(loss='hinge', penalty='l2', alpha=1e-5, max_iter=1000, tol=1e-3, class_weight="balanced")
sgd = sgd.fit(X_train, y_train)
predictions_SGD = sgd.predict(X_test)
print("SGD Accuracy Score: ", accuracy_score(predictions_SGD, y_test))
print("SGD Recall Score: ", recall_score(predictions_SGD, y_test, average='weighted'))
print("SGD F1 Score: ", f1_score(predictions_SGD, y_test, average='weighted'))

SGD Accuracy Score:  0.8342483660130718
SGD Recall Score:  0.8342483660130718
SGD F1 Score:  0.8294909921789023


In [20]:
# fit = TSNE()
# u = fit.fit_transform(X_train.todense())

## KNN

In [21]:
knn = KNeighborsClassifier()
knn.fit(X_train, y_train)
predictions_KNN = knn.predict(X_test)
print("KNN Accuracy Score: ", accuracy_score(predictions_KNN, y_test))
print("KNN Recall Score: ", recall_score(predictions_KNN, y_test, average='weighted'))
print("KNN F1 Score: ", f1_score(predictions_KNN, y_test, average='weighted'))

KNN Accuracy Score:  0.8366013071895425
KNN Recall Score:  0.8366013071895425
KNN F1 Score:  0.8519283854876076


## Logistic Regression

In [22]:
lr = LogisticRegression(random_state=random)
lr.fit(X_train,y_train)
predictions_LR = lr.predict(X_test)
print("LR Accuracy Score: ", accuracy_score(predictions_LR, y_test))
print("LR Recall Score: ", recall_score(predictions_LR, y_test, average='weighted'))
print("LR F1 Score: ", f1_score(predictions_LR, y_test, average='weighted'))

LR Accuracy Score:  0.781437908496732
LR Recall Score:  0.781437908496732
LR F1 Score:  0.8260280735567747


## kmeans

In [23]:
true_k = 12
kmeans = KMeans(n_clusters=true_k, init='k-means++', max_iter=100, n_init=1).fit(X_train)