In [1]:
import os
import sys

project_root = os.path.abspath(os.path.join(os.getcwd(), '..'))

# Add the project root to sys.path if not already there
if project_root not in sys.path:
    sys.path.append(project_root)

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from src.data.data_loader import get_top_words, train_data, test_data, data_transformed
from src.features.build_features import build_X
from sklearn.naive_bayes import ComplementNB
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import RandomizedSearchCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score
from sklearn.compose import ColumnTransformer
from scipy.stats import uniform, loguniform
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.neighbors import KNeighborsClassifier
import joblib
import time

In [3]:
# load train and test data
newsgroups_train = train_data()
newsgroups_test = test_data()

In [4]:
df_train = data_transformed(data = newsgroups_train)
df_test = data_transformed(data = newsgroups_test)
df_train

Unnamed: 0,Target,Text,Target Article Category,Article Length
0,7,I was wondering if anyone out there could enli...,rec.autos,475
1,4,A fair number of brave souls who upgraded thei...,comp.sys.mac.hardware,530
2,4,"well folks, my mac plus finally gave up the gh...",comp.sys.mac.hardware,1659
3,1,\nDo you have Weitek's address/phone number? ...,comp.graphics,95
4,14,"From article <C5owCB.n3p@world.std.com>, by to...",sci.space,448
...,...,...,...,...
11309,13,DN> From: nyeda@cnsvax.uwec.edu (David Nye)\nD...,sci.med,1782
11310,4,"I have a (very old) Mac 512k and a Mac Plus, b...",comp.sys.mac.hardware,674
11311,3,I just installed a DX2-66 CPU in a clone mothe...,comp.sys.ibm.pc.hardware,581
11312,1,\nWouldn't this require a hyper-sphere. In 3-...,comp.graphics,311


In [5]:
model_list = []

In [6]:
# Complement Naive Bayes with custom feature
preprocessor = ColumnTransformer(
    transformers=[
        ('num_scaler', MinMaxScaler(), ['Article Length']), 
        ('text_vectorizer', TfidfVectorizer(), 'Text') 
    ],
    remainder='drop'
)

pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', ComplementNB())
])

param_distributions = {
    'preprocessor__text_vectorizer__max_features': [30000, 48000, 50000], 
    'preprocessor__text_vectorizer__stop_words': ['english'],
    'preprocessor__text_vectorizer__ngram_range': [(1, 1), (1, 2), (2, 2)], 
    'preprocessor__text_vectorizer__min_df': (1, 3, 5, 10), 
    'preprocessor__text_vectorizer__max_df': (0.2, 0.4, 0.6, 0.8, 1.0),
    'preprocessor__text_vectorizer__norm': ('l1', 'l2'),
    'classifier__alpha': np.logspace(-6, 6, 13)
}

random_search = RandomizedSearchCV(
    estimator=pipeline,
    param_distributions=param_distributions,
    n_iter=40,
    random_state=0,
    n_jobs=-1,
    verbose=1
)

t1 = time.perf_counter()
random_search.fit(df_train, df_train['Target Article Category'])
print('Time elapsed: {}'.format(time.perf_counter() - t1))
print(f"\nMean cross-validation accuracy with best parameters: {random_search.best_score_:.4f}")

model_metatdata = {
    'model': random_search,
    'metadata': {
        'time_to_train': time.perf_counter() - t1,
        'training_date': time.strftime("%Y-%m-%d %H:%M:%S"),
        'CV_score_best_model': random_search.best_score_,
        'model_type': '*Complement Naive Bayes',
        'hyperparameters': random_search.get_params()
    }
}

Fitting 5 folds for each of 40 candidates, totalling 200 fits
Time elapsed: 75.58800270000938

Mean cross-validation accuracy with best parameters: 0.7592


In [7]:
model_list.append(model_metatdata)

In [8]:
# Complement Naive Bayes
preprocessor = ColumnTransformer(
    transformers=[
        ('text_vectorizer', TfidfVectorizer(), 'Text') 
    ],
    remainder='drop'
)

pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', ComplementNB())
])

random_search = RandomizedSearchCV(
    estimator=pipeline,
    param_distributions=param_distributions,
    n_iter=40,
    random_state=0,
    n_jobs=-1,
    verbose=1
)

t1 = time.perf_counter()
random_search.fit(df_train, df_train['Target Article Category'])
print('Time elapsed: {}'.format(time.perf_counter() - t1))
print(f"\nMean cross-validation accuracy with best parameters: {random_search.best_score_:.4f}")

model_metatdata = {
    'model': random_search,
    'metadata': {
        'time_to_train': time.perf_counter() - t1,
        'training_date': time.strftime("%Y-%m-%d %H:%M:%S"),
        'CV_score_best_model': random_search.best_score_,
        'model_type': 'Complement Naive Bayes',
        'hyperparameters': random_search.get_params()
    }
}

Fitting 5 folds for each of 40 candidates, totalling 200 fits
Time elapsed: 71.12886850000359

Mean cross-validation accuracy with best parameters: 0.7591


In [9]:
model_list.append(model_metatdata)

In [10]:
# Logistic Regression Model with custom feature
preprocessor = ColumnTransformer(
    transformers=[
        ('num_scaler', MinMaxScaler(), ['Article Length']), 
        ('text_vectorizer', TfidfVectorizer(), 'Text') 
    ],
    remainder='drop'
)

pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(random_state=42, max_iter=1000))
])

param_distributions = {
    'preprocessor__text_vectorizer__max_features': [30000, 48000, 50000], 
    'preprocessor__text_vectorizer__stop_words': ['english'],
    'preprocessor__text_vectorizer__ngram_range': [(1, 1), (1, 2), (2, 2)], 
    'preprocessor__text_vectorizer__min_df': (1, 3, 5, 10), 
    'preprocessor__text_vectorizer__max_df': (0.2, 0.4, 0.6, 0.8, 1.0),
    'preprocessor__text_vectorizer__norm': ('l1', 'l2'),
    'classifier__C': uniform(loc=0.01, scale=100),
    'classifier__penalty': ['l1', 'l2'], 
    'classifier__solver': ['liblinear', 'saga']
}

random_search = RandomizedSearchCV(
    estimator=pipeline,
    param_distributions=param_distributions,
    n_iter=40,
    random_state=0,
    n_jobs=-1,
    verbose=1
)

t1 = time.perf_counter()
random_search.fit(df_train, df_train['Target Article Category'])
print('Time elapsed: {}'.format(time.perf_counter() - t1))
print(f"\nMean cross-validation accuracy with best parameters: {random_search.best_score_:.4f}")

model_metatdata = {
    'model': random_search,
    'metadata': {
        'time_to_train': time.perf_counter() - t1,
        'training_date': time.strftime("%Y-%m-%d %H:%M:%S"),
        'CV_score_best_model': random_search.best_score_,
        'model_type': '*Logistic Regression',
        'hyperparameters': random_search.get_params()
    }
}

Fitting 5 folds for each of 40 candidates, totalling 200 fits




Time elapsed: 1546.5834833999397

Mean cross-validation accuracy with best parameters: 0.7564


In [11]:
model_list.append(model_metatdata)

In [12]:
# Logistic Regression Model
preprocessor = ColumnTransformer(
    transformers=[ 
        ('text_vectorizer', TfidfVectorizer(), 'Text') 
    ],
    remainder='drop'
)

pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(random_state=42, max_iter=1000))
])

param_distributions = {
    'preprocessor__text_vectorizer__max_features': [30000, 48000, 50000], 
    'preprocessor__text_vectorizer__stop_words': ['english'],
    'preprocessor__text_vectorizer__ngram_range': [(1, 1), (1, 2), (2, 2)], 
    'preprocessor__text_vectorizer__min_df': (1, 3, 5, 10), 
    'preprocessor__text_vectorizer__max_df': (0.2, 0.4, 0.6, 0.8, 1.0),
    'preprocessor__text_vectorizer__norm': ('l1', 'l2'),
    'classifier__C': uniform(loc=0.01, scale=100),
    'classifier__penalty': ['l1', 'l2'], 
    'classifier__solver': ['liblinear', 'saga']
}

random_search = RandomizedSearchCV(
    estimator=pipeline,
    param_distributions=param_distributions,
    n_iter=40,
    random_state=0,
    n_jobs=-1,
    verbose=1
)

t1 = time.perf_counter()
random_search.fit(df_train, df_train['Target Article Category'])
print('Time elapsed: {}'.format(time.perf_counter() - t1))
print(f"\nMean cross-validation accuracy with best parameters: {random_search.best_score_:.4f}")

model_metatdata = {
    'model': random_search,
    'metadata': {
        'time_to_train': time.perf_counter() - t1,
        'training_date': time.strftime("%Y-%m-%d %H:%M:%S"),
        'CV_score_best_model': random_search.best_score_,
        'model_type': 'Logistic Regression',
        'hyperparameters': random_search.get_params()
    }
}

Fitting 5 folds for each of 40 candidates, totalling 200 fits




Time elapsed: 1585.1439401999814

Mean cross-validation accuracy with best parameters: 0.7565


In [13]:
model_list.append(model_metatdata)

In [14]:
# SVC classifier with custom feature
preprocessor = ColumnTransformer(
    transformers=[
        ('num_scaler', MinMaxScaler(), ['Article Length']), 
        ('text_vectorizer', TfidfVectorizer(), 'Text') 
    ],
    remainder='drop'
)

pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', LinearSVC(random_state=42, dual=False))
])

param_distributions = {
    'preprocessor__text_vectorizer__max_features': [30000, 48000, 50000], 
    'preprocessor__text_vectorizer__stop_words': ['english'],
    'preprocessor__text_vectorizer__ngram_range': [(1, 1), (1, 2), (2, 2)], 
    'preprocessor__text_vectorizer__min_df': (1, 3, 5, 10), 
    'preprocessor__text_vectorizer__max_df': (0.2, 0.4, 0.6, 0.8, 1.0),
    'preprocessor__text_vectorizer__norm': ('l1', 'l2'),
    'classifier__C': loguniform(0.01, 100),
    'classifier__loss': ['squared_hinge'],
    'classifier__penalty': ['l2'], 
    'classifier__tol': loguniform(1e-5, 1e-3)
}

random_search_ = RandomizedSearchCV(
    estimator=pipeline,
    param_distributions=param_distributions,
    n_iter=40,
    random_state=0,
    n_jobs=-1,
    verbose=1
)

t1 = time.perf_counter()
random_search.fit(df_train, df_train['Target Article Category'])
print('Time elapsed: {}'.format(time.perf_counter() - t1))
print(f"\nMean cross-validation accuracy with best parameters: {random_search.best_score_:.4f}")

model_metatdata = {
    'model': random_search,
    'metadata': {
        'time_to_train': time.perf_counter() - t1,
        'training_date': time.strftime("%Y-%m-%d %H:%M:%S"),
        'CV_score_best_model': random_search.best_score_,
        'model_type': '*SVM',
        'hyperparameters': random_search.get_params()
    }
}

Fitting 5 folds for each of 40 candidates, totalling 200 fits




Time elapsed: 1382.1953193000518

Mean cross-validation accuracy with best parameters: 0.7565


In [15]:
model_list.append(model_metatdata)

In [16]:
# SVC classifier
preprocessor = ColumnTransformer(
    transformers=[
        ('text_vectorizer', TfidfVectorizer(), 'Text') 
    ],
    remainder='drop'
)

pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', LinearSVC(random_state=42, dual=False))
])

param_distributions = {
    'preprocessor__text_vectorizer__max_features': [30000, 48000, 50000], 
    'preprocessor__text_vectorizer__stop_words': ['english'],
    'preprocessor__text_vectorizer__ngram_range': [(1, 1), (1, 2), (2, 2)], 
    'preprocessor__text_vectorizer__min_df': (1, 3, 5, 10), 
    'preprocessor__text_vectorizer__max_df': (0.2, 0.4, 0.6, 0.8, 1.0),
    'preprocessor__text_vectorizer__norm': ('l1', 'l2'),
    'classifier__C': loguniform(0.01, 100),
    'classifier__loss': ['squared_hinge'],
    'classifier__penalty': ['l2'], 
    'classifier__tol': loguniform(1e-5, 1e-3)
}

random_search = RandomizedSearchCV(
    estimator=pipeline,
    param_distributions=param_distributions,
    n_iter=40,
    random_state=0,
    n_jobs=-1,
    verbose=1
)

t1 = time.perf_counter()
random_search.fit(df_train, df_train['Target Article Category'])
print('Time elapsed: {}'.format(time.perf_counter() - t1))
print(f"\nMean cross-validation accuracy with best parameters: {random_search.best_score_:.4f}")

model_metatdata = {
    'model': random_search,
    'metadata': {
        'time_to_train': time.perf_counter() - t1,
        'training_date': time.strftime("%Y-%m-%d %H:%M:%S"),
        'CV_score_best_model': random_search.best_score_,
        'model_type': 'SVM',
        'hyperparameters': random_search.get_params()
    }
}

Fitting 5 folds for each of 40 candidates, totalling 200 fits
Time elapsed: 137.74850159999914

Mean cross-validation accuracy with best parameters: 0.7611


In [17]:
model_list.append(model_metatdata)

In [18]:
# KNN with custom features
preprocessor = ColumnTransformer(
    transformers=[
        ('num_scaler', MinMaxScaler(), ['Article Length']), 
        ('text_vectorizer', TfidfVectorizer(), 'Text') 
    ],
    remainder='drop'
)

pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier',KNeighborsClassifier())
])

param_distributions = {
    'preprocessor__text_vectorizer__max_features': [5000, 10000, 20000], 
    'preprocessor__text_vectorizer__stop_words': ['english'],
    'preprocessor__text_vectorizer__ngram_range': [(1, 1), (1, 2), (2, 2)], 
    'preprocessor__text_vectorizer__min_df': (1, 3, 5, 10), 
    'preprocessor__text_vectorizer__max_df': (0.2, 0.4, 0.6, 0.8, 1.0),
    'preprocessor__text_vectorizer__norm': ('l1', 'l2'),
    'classifier__n_neighbors': [50, 100, 200],
    'classifier__weights': ['uniform', 'distance'],
    'classifier__algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'], 
    'classifier__leaf_size': [10, 20, 30, 50, 100],
    'classifier__p': [1, 2]
}

random_search = RandomizedSearchCV(
    estimator=pipeline,
    param_distributions=param_distributions,
    n_iter=40,
    random_state=0,
    n_jobs=-1,
    verbose=1
)

t1 = time.perf_counter()
random_search.fit(df_train, df_train['Target Article Category'])
print('Time elapsed: {}'.format(time.perf_counter() - t1))
print(f"\nMean cross-validation accuracy with best parameters: {random_search.best_score_:.4f}")

model_metatdata = {
    'model': random_search,
    'metadata': {
        'time_to_train': time.perf_counter() - t1,
        'training_date': time.strftime("%Y-%m-%d %H:%M:%S"),
        'CV_score_best_model': random_search.best_score_,
        'model_type': '*KNN',
        'hyperparameters': random_search.get_params()
    }
}

Fitting 5 folds for each of 40 candidates, totalling 200 fits


 0.07574693        nan 0.16351447        nan 0.12754069 0.07380257
 0.09183259 0.08714843 0.0740679  0.12709946        nan 0.12338699
 0.07990102 0.09015352        nan        nan        nan 0.09342317
 0.07150438 0.14309729 0.0705323  0.08308275 0.0726533         nan
        nan 0.12930923 0.06602415 0.06063284        nan 0.09430675
 0.07406751 0.11896765 0.10570907 0.08652974]


Time elapsed: 89.87413700006437

Mean cross-validation accuracy with best parameters: 0.1635


In [19]:
model_list.append(model_metatdata)

In [20]:
# KNN 
preprocessor = ColumnTransformer(
    transformers=[
        ('text_vectorizer', TfidfVectorizer(), 'Text') 
    ],
    remainder='drop'
)

pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier',KNeighborsClassifier())
])

param_distributions = {
    'preprocessor__text_vectorizer__max_features': [5000, 10000, 20000], 
    'preprocessor__text_vectorizer__stop_words': ['english'],
    'preprocessor__text_vectorizer__ngram_range': [(1, 1), (1, 2), (2, 2)], 
    'preprocessor__text_vectorizer__min_df': (1, 3, 5, 10), 
    'preprocessor__text_vectorizer__max_df': (0.2, 0.4, 0.6, 0.8, 1.0),
    'preprocessor__text_vectorizer__norm': ('l1', 'l2'),
    'classifier__n_neighbors': [50, 100, 200],
    'classifier__weights': ['uniform', 'distance'],
    'classifier__algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'], 
    'classifier__leaf_size': [10, 20, 30, 50, 100],
    'classifier__p': [1, 2]
}

random_search = RandomizedSearchCV(
    estimator=pipeline,
    param_distributions=param_distributions,
    n_iter=40,
    random_state=0,
    n_jobs=-1,
    verbose=1
)

t1 = time.perf_counter()
random_search.fit(df_train, df_train['Target Article Category'])
print('Time elapsed: {}'.format(time.perf_counter() - t1))
print(f"\nMean cross-validation accuracy with best parameters: {random_search.best_score_:.4f}")

model_metatdata = {
    'model': random_search,
    'metadata': {
        'time_to_train': time.perf_counter() - t1,
        'training_date': time.strftime("%Y-%m-%d %H:%M:%S"),
        'CV_score_best_model': random_search.best_score_,
        'model_type': 'KNN',
        'hyperparameters': random_search.get_params()
    }
}

Fitting 5 folds for each of 40 candidates, totalling 200 fits


 0.06708523        nan 0.16201118        nan 0.17129238 0.05303168
 0.11949733 0.08370175 0.06717357 0.08511592        nan 0.07954727
 0.07919427 0.09094951        nan        nan        nan 0.06575925
 0.05329678 0.09307036 0.06735037 0.08714843 0.0529433         nan
        nan 0.1247129  0.07194522 0.05957191        nan 0.10553255
 0.05320848 0.1139295  0.11118899 0.09519132]


Time elapsed: 87.49200219998602

Mean cross-validation accuracy with best parameters: 0.1713




In [21]:
model_list.append(model_metatdata)

In [22]:
models_filename = 'multiple_classification_models.joblib'
joblib.dump(model_list, models_filename)

['multiple_classification_models.joblib']