In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from preprocessing import PreProcessor
import multiprocessing

from sklearn.model_selection import train_test_split
from sklearn import model_selection, naive_bayes, svm
from sklearn.linear_model import SGDClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score, f1_score, recall_score, classification_report, confusion_matrix

import warnings
warnings.filterwarnings('ignore')

In [2]:
pp = PreProcessor()
df = pd.read_csv('darkweb/Balanced_Sample_500.csv') #500 records of all categories
df.columns = ['recordID','Category','Item','categoryID']
df['Tokens'] = df.apply(lambda d: pp.preprocess(str(d['Item'])),axis = 1)
df.head()

Unnamed: 0,recordID,Category,Item,categoryID,Tokens
0,40127,Counterfeits/Watches,Emporio Armani - AR1610 Shell Case ceramic bra...,0,emporio armani ar shell case ceram bracelet re...
1,40126,Counterfeits/Watches,Cartier-Tank Ladies Brand: Cartier Series: Tan...,0,cartiertank ladi brand cartier seri tank gende...
2,40125,Counterfeits/Watches,Patek Philippe watch box ★ Patek Philippe - Wa...,0,patek philipp watch box patek philipp watch bo...
3,40130,Counterfeits/Watches,Breitling - NAVITIMER COSMONAUTE 【Replica】 Wat...,0,breitl navitim cosmonaut replica watch inform ...
4,40129,Counterfeits/Watches,Emporio Armani Men's AR0397 Dial color Gary Wa...,0,emporio armani men ar dial color gari watch re...


In [3]:
tf = TfidfVectorizer(min_df=20, ngram_range=(1, 2))

features = tf.fit_transform(df['Tokens'])
labels = df['Category']
random = 42 #to keep same results

X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.33, random_state=random)

## Training without hyperparameters

In [4]:
sgd = SGDClassifier(random_state=random)
sgd = sgd.fit(X_train, y_train)
predictions_SGD = sgd.predict(X_test)
print("SGD Accuracy Score: ", accuracy_score(predictions_SGD, y_test))
print("SGD Recall Score: ", recall_score(predictions_SGD, y_test, average='weighted'))
print("SGD F1 Score: ", f1_score(predictions_SGD, y_test, average='weighted'))

SGD Accuracy Score:  0.8864646464646465
SGD Recall Score:  0.8864646464646465
SGD F1 Score:  0.8898398849920407


In [6]:
sgd.get_params()

{'alpha': 0.0001,
 'average': False,
 'class_weight': None,
 'early_stopping': False,
 'epsilon': 0.1,
 'eta0': 0.0,
 'fit_intercept': True,
 'l1_ratio': 0.15,
 'learning_rate': 'optimal',
 'loss': 'hinge',
 'max_iter': None,
 'n_iter': None,
 'n_iter_no_change': 5,
 'n_jobs': None,
 'penalty': 'l2',
 'power_t': 0.5,
 'random_state': 42,
 'shuffle': True,
 'tol': None,
 'validation_fraction': 0.1,
 'verbose': 0,
 'warm_start': False}

## Training with tweaking

Init signature: SGDClassifier(loss='hinge', penalty='l2', alpha=0.0001, l1_ratio=0.15, fit_intercept=True, max_iter=None, tol=None, shuffle=True, verbose=0, epsilon=0.1, n_jobs=None, random_state=None, learning_rate='optimal', eta0=0.0, power_t=0.5, early_stopping=False, validation_fraction=0.1, n_iter_no_change=5, class_weight=None, warm_start=False, average=False, n_iter=None)

valid tweaking parameters: 
    - loss = l, varying the loss function could possibly change the way the algorithm calculates it.
    - penalty = p, this regularizes the data and with different methods it might bring sparsity to the model.
    - fit_intercept = fi, a boolean that checks whether the intercept should be estimated or not. 
    - warm_start = ws, reuse the solution of the previous call to fit as initialization, otherwise, just erase the previous solution.
    - random_state = random, fixed in order to preserve same scores.
    
 NOT VALID:   
    - max_iter = mi, the maximum number of passes over the training data (aka epochs).
 I tried training on this, but after taking too long (>1 hour) and reviewing the scores over the max_iter, I concluded that this did not matter as for all max_iters the scores stayed the same.

In [7]:
li = ['hinge', 'log', 'modified_huber','squared_hinge', 'perceptron','squared_loss','huber', 
        'epsilon_insensitive','squared_epsilon_insensitive']
pe = ['l2', 'l1', 'elasticnet']
fi = [True, False]
# mi = [5,25,100,1000]
ws = [True, False]

reports=[]

for l in li:
    for p in pe:
        for f in fi:
            for w in ws:
                sgd = SGDClassifier(loss=l, penalty=p,fit_intercept=f, warm_start=w,random_state=random)
                sgd = sgd.fit(X_train, y_train)
                predictions_SGD = sgd.predict(X_test)
                f1 = f1_score(predictions_SGD, y_test, average='weighted')
                acc = accuracy_score(predictions_SGD, y_test)
                rec = recall_score(predictions_SGD, y_test, average='weighted')
                print(f"SGD: loss={l}, penalty={p}, fit_intercept={f}, warmstart={w} // F1-score: {f1}")
                reports.append((l,p,f,w,f1,acc,rec))

SGD: loss=hinge, penalty=l2, fit_intercept=True, warmstart=True // F1-score: 0.8898398849920407
SGD: loss=hinge, penalty=l2, fit_intercept=True, warmstart=False // F1-score: 0.8898398849920407
SGD: loss=hinge, penalty=l2, fit_intercept=False, warmstart=True // F1-score: 0.8879557983012283
SGD: loss=hinge, penalty=l2, fit_intercept=False, warmstart=False // F1-score: 0.8879557983012283
SGD: loss=hinge, penalty=l1, fit_intercept=True, warmstart=True // F1-score: 0.8652178235044028
SGD: loss=hinge, penalty=l1, fit_intercept=True, warmstart=False // F1-score: 0.8652178235044028
SGD: loss=hinge, penalty=l1, fit_intercept=False, warmstart=True // F1-score: 0.8688517964214952
SGD: loss=hinge, penalty=l1, fit_intercept=False, warmstart=False // F1-score: 0.8688517964214952
SGD: loss=hinge, penalty=elasticnet, fit_intercept=True, warmstart=True // F1-score: 0.8865376491587846
SGD: loss=hinge, penalty=elasticnet, fit_intercept=True, warmstart=False // F1-score: 0.8865376491587846
SGD: loss=hinge

SGD: loss=huber, penalty=elasticnet, fit_intercept=True, warmstart=True // F1-score: 0.8401791458943367
SGD: loss=huber, penalty=elasticnet, fit_intercept=True, warmstart=False // F1-score: 0.8401791458943367
SGD: loss=huber, penalty=elasticnet, fit_intercept=False, warmstart=True // F1-score: 0.8583991169959624
SGD: loss=huber, penalty=elasticnet, fit_intercept=False, warmstart=False // F1-score: 0.8583991169959624
SGD: loss=epsilon_insensitive, penalty=l2, fit_intercept=True, warmstart=True // F1-score: 0.8678051545153808
SGD: loss=epsilon_insensitive, penalty=l2, fit_intercept=True, warmstart=False // F1-score: 0.8678051545153808
SGD: loss=epsilon_insensitive, penalty=l2, fit_intercept=False, warmstart=True // F1-score: 0.8721358397483031
SGD: loss=epsilon_insensitive, penalty=l2, fit_intercept=False, warmstart=False // F1-score: 0.8721358397483031
SGD: loss=epsilon_insensitive, penalty=l1, fit_intercept=True, warmstart=True // F1-score: 0.8517985781882758
SGD: loss=epsilon_insensit

In [8]:
results = pd.DataFrame(reports, columns = ['loss','penalty','fit_intercept','warm_start','F1-score','Accuracy','Recall'])
results.sort_values(by=['F1-score'])
results

Unnamed: 0,loss,penalty,fit_intercept,warm_start,F1-score,Accuracy,Recall
0,hinge,l2,True,True,0.889840,0.886465,0.886465
1,hinge,l2,True,False,0.889840,0.886465,0.886465
2,hinge,l2,False,True,0.887956,0.884242,0.884242
3,hinge,l2,False,False,0.887956,0.884242,0.884242
4,hinge,l1,True,True,0.865218,0.860000,0.860000
...,...,...,...,...,...,...,...
103,squared_epsilon_insensitive,l1,False,False,0.034330,0.031111,0.031111
104,squared_epsilon_insensitive,elasticnet,True,True,0.030413,0.029899,0.029899
105,squared_epsilon_insensitive,elasticnet,True,False,0.030413,0.029899,0.029899
106,squared_epsilon_insensitive,elasticnet,False,True,0.026104,0.025657,0.025657
