# K-fold cross validation
n = 11589, means k = 5

In [1]:
import pickle
import numpy as np
import pandas as pd
from sklearn import model_selection, naive_bayes, svm, tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, recall_score, classification_report
from sklearn.model_selection import train_test_split, RepeatedKFold, StratifiedKFold
from sklearn.cluster import KMeans
from sklearn.tree import export_graphviz
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import warnings
warnings.filterwarnings('ignore')

categories = pickle.load(open("darkweb/data/pekelbad/categorieen.pkl", "rb"))
tfvector = pickle.load(open("darkweb/data/pickle/tfidf_vectors.pkl", "rb"))

df = pd.DataFrame({"Category":categories})
df["CategoryID"] = df.Category.astype("category").cat.codes

In [2]:
X = tfvector
y = np.array(df['CategoryID'].tolist())
random = 42
#X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.33, random_state=random)

## Stratified k-fold
Stratification is the process of rearranging the data so as to ensure that each fold is a good representative of the whole. For example, in a binary classification problem where each class comprises of 50% of the data, it is best to arrange the data such that in every fold, each class comprises of about half the instances.

In [3]:
skf = StratifiedKFold(n_splits=10, random_state=random)
skf.get_n_splits(X, y)
for train_index, test_index in skf.split(X, y):
    #print("TRAIN:", train_index, "TEST:", test_index)
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

In [4]:
X_train

<10437x359989 sparse matrix of type '<class 'numpy.float64'>'
	with 2767770 stored elements in Compressed Sparse Row format>

### Linear SVC

In [5]:
SVM = svm.LinearSVC()
SVM.fit(X_train, y_train)
predictions_SVM = SVM.predict(X_test)
print("SVM Accuracy Score: ", accuracy_score(predictions_SVM, y_test))
print("SVM Recall Score: ", recall_score(predictions_SVM, y_test, average='weighted'))
print("SVM F1 Score: ", f1_score(predictions_SVM, y_test, average='weighted'))

SVM Accuracy Score:  0.8524305555555556
SVM Recall Score:  0.8524305555555556
SVM F1 Score:  0.8670946288577871


### Multinomial NB

In [6]:
naive = naive_bayes.MultinomialNB()
naive.fit(X_train, y_train)
predictions_NB = naive.predict(X_test)
print("Multinomial NB Accuracy Score: ", accuracy_score(predictions_NB, y_test))
print("Multinomial NB Recall Score: ", recall_score(predictions_NB, y_test, average='weighted'))
print("Multinomial NB F1 Score: ", f1_score(predictions_NB, y_test, average='weighted'))

Multinomial NB Accuracy Score:  0.7300347222222222
Multinomial NB Recall Score:  0.7300347222222222
Multinomial NB F1 Score:  0.7931126542363844


### Gaussian NB

In [None]:
naive = naive_bayes.GaussianNB()
naive.fit(X_train.todense(), y_train)
predictions_NB = naive.predict(X_test.todense())
print("Gaussian NB Accuracy Score: ", accuracy_score(predictions_NB, y_test))
print("Gaussian NB Recall Score: ", recall_score(predictions_NB, y_test, average='weighted'))
print("Gaussian NB F1 Score: ", f1_score(predictions_NB, y_test, average='weighted'))

### Decision tree

In [10]:
class_tree = tree.DecisionTreeClassifier(max_depth=3)
class_tree.fit(X_train, y_train)
predictions_DT = class_tree.predict(X_test)
print("Decision Tree Accuracy Score: ", accuracy_score(predictions_DT, y_test))
print("Decision Tree Recall Score: ", recall_score(predictions_DT, y_test, average='weighted'))
print("Decision Tree F1 Score: ", f1_score(predictions_DT, y_test, average='weighted'))

Decision Tree Accuracy Score:  0.6458333333333334
Decision Tree Recall Score:  0.6458333333333334
Decision Tree F1 Score:  0.7603772279927702


### SGD

In [11]:
sgd = SGDClassifier(loss='hinge', penalty='l2', alpha=1e-5, max_iter=1000, tol=1e-3, class_weight="balanced")
sgd = sgd.fit(X_train, y_train)
predictions_SGD = sgd.predict(X_test)
print("SGD Accuracy Score: ", accuracy_score(predictions_SGD, y_test))
print("SGD Recall Score: ", recall_score(predictions_SGD, y_test, average='weighted'))
print("SGD F1 Score: ", f1_score(predictions_SGD, y_test, average='weighted'))

SGD Accuracy Score:  0.8315972222222222
SGD Recall Score:  0.8315972222222222
SGD F1 Score:  0.823359385189021


### KNN

In [12]:
knn = KNeighborsClassifier(n_neighbors=12)
knn.fit(X_train, y_train)
predictions_KNN = knn.predict(X_test)
print("KNN Accuracy Score: ", accuracy_score(predictions_KNN, y_test))
print("KNN Recall Score: ", recall_score(predictions_KNN, y_test, average='weighted'))
print("KNN F1 Score: ", f1_score(predictions_KNN, y_test, average='weighted'))

KNN Accuracy Score:  0.8263888888888888
KNN Recall Score:  0.8263888888888888
KNN F1 Score:  0.8462218189517494


### Logistic regression

In [16]:
lr = LogisticRegression(random_state=random)
lr.fit(X_train,y_train)
predictions_LR = lr.predict(X_test)
print("LR Accuracy Score: ", accuracy_score(predictions_LR, y_test))
print("LR Recall Score: ", recall_score(predictions_LR, y_test, average='weighted'))
print("LR F1 Score: ", f1_score(predictions_LR, y_test, average='weighted'))

LR Accuracy Score:  0.8003472222222222
LR Recall Score:  0.8003472222222222
LR F1 Score:  0.832839441555754


### Random forest

In [17]:
rf = RandomForestClassifier(max_depth=10, random_state=random)
rf.fit(X_train,y_train)
predictions_RF = rf.predict(X_test)
print("RF Accuracy Score: ", accuracy_score(predictions_RF, y_test))
print("RF Recall Score: ", recall_score(predictions_RF, y_test, average='weighted'))
print("RF F1 Score: ", f1_score(predictions_RF, y_test, average='weighted'))

RF Accuracy Score:  0.6519097222222222
RF Recall Score:  0.6519097222222222
RF F1 Score:  0.7496451241732829


In [10]:
# scores = []
# best_svr = SVR(kernel='rbf')
# cv = KFold(n_splits=10, random_state=42, shuffle=False)
# for train_index, test_index in cv.split(X):
#     print("Train Index: ", train_index, "\n")
#     print("Test Index: ", test_index)

#     X_train, X_test, y_train, y_test = X[train_index], X[test_index], y[train_index], y[test_index]
#     best_svr.fit(X_train, y_train)
#     scores.append(best_svr.score(X_test, y_test))