In [None]:
# Mount Google Drive at /content/drive so files stored there are accessible
# from this runtime (only works inside Google Colab).
from google.colab import drive
drive.mount('/content/drive')

# Load data and visualize data

Get the path of project

In [None]:
import os

In [None]:
os.getcwd() 

In [None]:
project_root_path = os.getcwd()
project_root_path

In [None]:
# Build the data directory with os.path.join so the separator is correct on
# Linux/Colab as well as Windows. The trailing "" keeps a trailing separator
# because other cells append names directly to this string.
data_root_path = os.path.join(project_root_path, "_data", "")
data_root_path

In [None]:
# get the path of dataset; os.path.join picks the right separator for the
# current OS, and the trailing "" keeps the final separator so names can still
# be concatenated onto this string
data_path = os.path.join(data_root_path, "20_newsgroups", "")

Load data

In [None]:
from time import time

In [None]:
start_time = time()
data = []
labels = []
labels_count = {}
# every sub-folder of data_path is one category (label)
for category in os.listdir(data_path):
    category_dir = os.path.join(data_path, category)
    # only directories are categories; this skips stray files such as
    # .DS_Store instead of crashing in os.listdir below
    if not os.path.isdir(category_dir):
        continue
    # add label to dictionary
    labels_count[category] = 0
    for document in os.listdir(category_dir):
        document_path = os.path.join(category_dir, document)
        if not os.path.isfile(document_path):
            continue
        # undecodable bytes are dropped (errors="ignore") rather than aborting the load
        with open(document_path, "r", encoding="utf-8", errors="ignore") as textfile:
            data.append(textfile.read())
        # record the label and count it
        labels.append(category)
        labels_count[category] += 1
print("Time to load data: " + str(time() - start_time) + 's')

Load the dataset in DataFrame

In [None]:
import pandas as pd

In [None]:
# one row per document: raw text and the name of the folder it came from
df = pd.DataFrame({'text': data, 'label': labels})
df.head()

Count the number of each label

In [None]:
# count the number of each label
df['label'].value_counts()

In [None]:
# number of targets
df['label'].nunique()

View in Barchart

In [None]:
import matplotlib.pyplot as plt

# horizontal bar chart: one bar per category, bar length = number of documents
fig, ax = plt.subplots(figsize=(16, 9))
category_names = list(labels_count.keys())
category_sizes = list(labels_count.values())
ax.barh(category_names, category_sizes, color='grey')
# annotate every bar with its value, written just past the bar's end
for bar in ax.patches:
    bar_length = bar.get_width()
    plt.text(bar_length + 0.2, bar.get_y() + 0.5,
             str(round(bar_length, 2)),
             fontsize=10, fontweight='bold',
             color='grey')
ax.set_title("Number of data in different News type", loc='left')
plt.show()

Data is balanced
Next, we add a `number_of_words` column that holds the number of words in each text.

In [None]:
# Count the whitespace-separated words of each raw document
def _word_count(text):
    """Return how many whitespace-separated tokens str(text) contains."""
    return len(str(text).split())

df['number_of_words'] = df['text'].apply(_word_count)
df.head()

Check the basic stats of number of words, like maximum, minimum, average number of words

In [None]:
df['number_of_words'].describe()

So the text with the maximum number of words belongs to the electronics category.
Our dataset also contains some rows with no text at all, i.e. the number of words is 0. We will drop those rows.

In [None]:
# rows whose raw text contains no words at all
is_empty = df['number_of_words'] == 0
no_text = df[is_empty]
print(len(no_text))

# remove those rows from the working frame, in place
df.drop(index=no_text.index, inplace=True)

### Visualize the frequency distribution of number of words for each text extracted

In [None]:
import seaborn as sns

plt.style.use('ggplot')
plt.figure(figsize=(12,6))
# sns.distplot is deprecated; histplot draws the same 200-bin count histogram
sns.histplot(df['number_of_words'], kde=False, color="red", bins=200)
plt.title("Frequency distribution of number of words for each text extracted", size=20)
plt.show()

# Data Pre-Processing

### Clean the dataset

In [None]:
import re
import string

In [None]:
def clean_header(text):
    """Strip common newsgroup header lines from `text`.

    Removes every line fragment that starts with ``From:``, ``Subject:``,
    ``Archive-name:`` / ``archive-name:`` (plus any run of letters, digits,
    whitespace or hyphens immediately before it), ``Last-modified:`` or
    ``Version:``, up to and including the terminating newline. The patterns are
    not anchored to the start of the document, so matches anywhere are removed.

    Returns the text with those fragments deleted.
    """
    text = re.sub(r'(From:\s+[^\n]+\n)', '', text)
    text = re.sub(r'(Subject:[^\n]+\n)', '', text)
    # [Aa], not [A|a]: inside a character class '|' is a literal pipe
    text = re.sub(r'(([\sA-Za-z0-9\-]+)?[Aa]rchive-name:[^\n]+\n)', '', text)
    text = re.sub(r'(Last-modified:[^\n]+\n)', '', text)
    text = re.sub(r'(Version:[^\n]+\n)', '', text)

    return text

def clean_text(text):
    """Normalise one raw document into lower-case, letters-only words.

    Steps, in order: drop header lines, lower-case, remove ``[...]`` spans,
    URLs, e-mail addresses and ``<...>`` tags, replace punctuation with spaces,
    remove every word that contains a digit, blank out all remaining
    non-letters (this also covers newlines), then drop one-character words and
    collapse whitespace.

    Returns the cleaned words joined by single spaces ('' if nothing is left).
    """
    # remove header
    text = clean_header(text)
    # lower text
    text = text.lower()
    # remove text in square brackets
    text = re.sub(r'\[.*?\]', ' ', text)
    # remove link
    text = re.sub(r'https?://\S+|www\.\S+', ' ', text)
    # remove email
    text = re.sub(r'([a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+)', ' ', text)
    # remove HTML tag
    text = re.sub(r'<.*?>+', ' ', text)
    # remove punctuation
    text = re.sub('[%s]' % re.escape(string.punctuation), ' ', text)
    # remove words containing numbers; this has to happen BEFORE digits are
    # blanked out below, otherwise "abc123" would survive as "abc"
    text = re.sub(r'\w*\d\w*', ' ', text)
    # remove special characters (everything that is not a letter, incl. newlines)
    text = re.sub(r'[^a-zA-Z]', ' ', text)
    # drop single characters; split()/join also collapses runs of whitespace
    return ' '.join(word for word in text.split() if len(word) > 1)

In [None]:
start_time = time()
df['cleaned_text'] = df['text'].apply(clean_text)
print("Time to process data: " + str(time() - start_time) + 's')

In [None]:
df['cleaned_text'].head()

Count the number of words again

In [None]:
df['number_of_cleaned_words'] = df['cleaned_text'].apply(lambda x:len(str(x).split()))
df.head()

In [None]:
df['number_of_cleaned_words'].describe()

In [None]:
# rows left without any word after cleaning
is_empty_after_cleaning = df['number_of_cleaned_words'] == 0
no_text = df[is_empty_after_cleaning]
print(len(no_text))

# remove those rows from the working frame, in place
df.drop(index=no_text.index, inplace=True)

### Remove stop words

Let's convert our cleaned text into tokens

In [None]:
df['tokens'] = df['cleaned_text'].apply(lambda x: x.split())
df.head()

Stopwords are English words that do not add much meaning to a sentence.
They are very common, and we do not need them for classification,
so we can remove them.

In [None]:
# stopwords
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
print(ENGLISH_STOP_WORDS)

Check number of stopwords in library

In [None]:
len(ENGLISH_STOP_WORDS)

We are going to remove the stopwords from the text

In [None]:
# stop-word list used by remove_stopwords below
stop_words = ENGLISH_STOP_WORDS

def remove_stopwords(text):
    """Return the tokens of `text` (an iterable of words) that are not in `stop_words`."""
    kept = []
    for token in text:
        if token in stop_words:
            continue
        kept.append(token)
    return kept
df['stopwords_remove_tokens'] = df['tokens'].apply(remove_stopwords)
df.head()

### Lemmatization

In [None]:
import nltk
from nltk.stem import WordNetLemmatizer

# WordNetLemmatizer reads the WordNet corpus lazily; download it up front so a
# fresh kernel does not fail with LookupError on the first lemmatize() call.
nltk.download('wordnet', quiet=True)
nltk.download('omw-1.4', quiet=True)
lem = WordNetLemmatizer()

In [None]:
def lem_word(x):
    """Lemmatize every token of `x` as a verb (pos="v") and return them as a list."""
    lemmas = []
    for token in x:
        lemmas.append(lem.lemmatize(token, pos="v"))
    return lemmas

df['lemmatized_text'] = df['stopwords_remove_tokens'].apply(lem_word)
df.head()

### Combine the text

In [None]:
def combine_text(list_of_text):
    """Join an iterable of string tokens into one space-separated string."""
    return ' '.join(list_of_text)

df['final_text'] = df['lemmatized_text'].apply(lambda x : combine_text(x))
df.head()

In [None]:
df['final_number_of_words'] = df['final_text'].apply(lambda x:len(str(x).split()))
df.head()

In [None]:
df['final_number_of_words'].describe()

In [None]:
# rows left without any word after stop-word removal and lemmatization
is_empty_final = df['final_number_of_words'] == 0
no_text = df[is_empty_final]
print(len(no_text))

# remove those rows from the working frame, in place
df.drop(index=no_text.index, inplace=True)

### Visualize the frequency distribution of number of words for each text extracted

In [None]:
plt.style.use('ggplot')
plt.figure(figsize=(12,6))
# sns.distplot is deprecated; histplot draws the same 200-bin count histogram
sns.histplot(df['final_number_of_words'], kde=False, color="red", bins=200)
plt.title("Frequency distribution of number of words for each text extracted", size=20)
plt.show()

In [None]:
df.to_csv(data_root_path + "data.csv", index=False)

# Feature extraction

In [None]:
cd /content/drive/MyDrive/ML-Projects-MidTerm/

In [None]:
import os
data_path = os.getcwd() + '/data/data.csv'
data_path

In [None]:
import pandas as pd

df = pd.read_csv(data_path)

### Our text has been cleaned; we now convert the labels into numeric values using LabelEncoder()

In [None]:
from sklearn.preprocessing import LabelEncoder

In [None]:
# label_encoder object knows how to understand word labels.
label_encoder = LabelEncoder()
  
# Encode labels in column 'label'.
df['target']= label_encoder.fit_transform(df['label'])
  
df['target'].unique()

Variable

In [None]:
X = df['final_text']
y = df['target']

In [None]:
X.shape, y.shape

Split the data

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Fixed seed + stratification: the split is identical after a kernel restart
# and every class keeps the same proportion in the train and test parts.
X_data, X_test, y_data, y_test = train_test_split(X, y, test_size=0.25, random_state=42, stratify=y)

In [None]:
X_data.shape, X_test.shape, y_data.shape, y_test.shape

### TF - IDF

Tf-Idf stands for Term Frequency-Inverse Document Frequency. It is a technique to quantify a word in documents: we compute a weight for each word that signifies the importance of that word in the document and in the corpus.

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

TF-IDF

In [None]:
tfidf_vectorizer = TfidfVectorizer(analyzer='word',stop_words='english', max_df=0.8, min_df=3, sublinear_tf=True)
# tfidf_vectorizer = CountVectorizer(analyzer='word',stop_words='english', max_df=0.8, min_df=3)

In [None]:
# learn the vocabulary and IDF weights from the training split only,
# then apply the same fitted vectorizer to the test split
X_data = tfidf_vectorizer.fit_transform(X_data)
X_test = tfidf_vectorizer.transform(X_test)

In [None]:
X_data.shape, X_test.shape

Check the vocabulary

In [None]:
len(tfidf_vectorizer.vocabulary_)

After performing TF-IDF, we can see that the resulting matrix is very large, and computing with it is expensive in both time and memory. To handle this problem, we use truncated SVD (singular value decomposition), which reduces the dimensionality of the matrix while retaining most of the information in the original matrix.

In [None]:
from sklearn.decomposition import TruncatedSVD

In [None]:
# TruncatedSVD uses a randomized solver by default; seed it so the 300
# components (and every score computed from them) are reproducible.
svd = TruncatedSVD(n_components=300, random_state=42)

In [None]:
svd.fit(X_data)
X_data = svd.transform(X_data)
X_test = svd.transform(X_test)

In [None]:
X_data.shape, X_test.shape

# Build the model

Firstly, we create containers to save time and accuracy

In [None]:
# one slot per model; every tracked quantity starts at zero
training_time_container = dict.fromkeys(('linear_clf', 'knn', 'ann'), 0)
prediction_time_container = dict.fromkeys(('linear_clf', 'knn', 'ann'), 0)
# per-model score dicts are built separately so no two models share one dict
accuracy_container = {name: {'train': 0, 'val': 0, 'test': 0}
                      for name in ('linear_clf', 'knn', 'ann')}
f1_macro_container = {name: {'train': 0, 'val': 0, 'test': 0}
                      for name in ('linear_clf', 'knn', 'ann')}

In order to keep the code concise, we will use the same training and prediction function for all models, which greatly reduces our coding time.

In [None]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import Dense, Input, Dropout
from tensorflow.keras import models, optimizers
from tensorflow.keras.callbacks import EarlyStopping 
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, f1_score, classification_report
from sklearn.metrics import ConfusionMatrixDisplay
import matplotlib.pyplot as plt
from time import time
import numpy as np

# train_model function
def train_model(model, X_data, y_data, X_test, y_test, model_name, 
                is_neuralnet=False, epochs=3, batch_size=128, show_cm=False):       
    """Fit `model`, score it on train/validation/test data and record the results.

    `X_data`/`y_data` are split 80/20 (random_state=42) into a training and a
    validation part; `X_test`/`y_test` are used as given. Fit time, prediction
    time, accuracy and macro-F1 are stored under `model_name` in the
    module-level `training_time_container`, `prediction_time_container`,
    `accuracy_container` and `f1_macro_container` (rounded to 4 decimals), and
    the unrounded scores are printed.

    Parameters
    ----------
    model : object with ``fit`` and ``predict``. When `is_neuralnet` is true it
        is fitted Keras-style (validation data, epochs, batch size, early
        stopping on the training loss) and ``predict`` output is reduced with
        ``argmax`` over the last axis.
    X_data, y_data : features and labels used for the train/validation split.
    X_test, y_test : held-out features and labels.
    model_name : key of this model in the four result containers.
    is_neuralnet : select the Keras-style branch and plot the training curves.
    epochs, batch_size : forwarded to ``model.fit`` in the neural-net branch only.
    show_cm : if true, plot the test-set confusion matrix labelled with
        ``label_encoder.classes_``.

    Returns
    -------
    pandas.DataFrame
        The transposed ``classification_report`` of the test predictions.
    """
    X_train, X_val, y_train, y_val = train_test_split(X_data, y_data,
                                test_size=0.2, random_state=42)
    
    if is_neuralnet:
        earlystop = EarlyStopping(monitor='loss', min_delta=0, patience=3, verbose=0, mode='auto')
        # Fitting the model 
        start_train_time = time()
        history = model.fit(X_train, y_train, validation_data=(X_val, y_val),
                            epochs=epochs, batch_size=batch_size,callbacks=[earlystop])
        training_time_container[model_name] = round(time() - start_train_time,4)
        
        # predict() output is reduced to class indices with argmax; the
        # reduction stays inside the timed region
        start_predict_time = time()
        train_predictions = model.predict(X_train).argmax(axis=-1)
        val_predictions = model.predict(X_val).argmax(axis=-1)
        test_predictions = model.predict(X_test).argmax(axis=-1)
        prediction_time_container[model_name] = round(time() - start_predict_time,4)

        acc = history.history['accuracy']
        val_acc = history.history['val_accuracy']
        loss = history.history['loss']
        val_loss = history.history['val_loss']

        # x axis = epochs actually run (early stopping may end training early);
        # a separate name avoids shadowing the `epochs` parameter
        epoch_axis = range(len(acc))

        plt.plot(epoch_axis, acc, 'r', label='Training accuracy')
        plt.plot(epoch_axis, val_acc, 'b', label='Validation accuracy')
        plt.title('Training and validation accuracy')
        plt.legend()
        plt.figure()

        plt.plot(epoch_axis, loss, 'r', label='Training Loss')
        plt.plot(epoch_axis, val_loss, 'b', label='Validation Loss')
        plt.title('Training and validation loss')
        plt.legend()

        plt.show()
        
    else:
        # Fitting the model 
        start_train_time = time()
        model.fit(X_train, y_train)
        training_time_container[model_name] = round(time() - start_train_time,4)

        # Predicting the Train, Val and Test set results
        start_predict_time = time()
        train_predictions = model.predict(X_train)
        val_predictions = model.predict(X_val)
        test_predictions = model.predict(X_test)
        prediction_time_container[model_name] = round(time() - start_predict_time,4)
    
    # score every split the same way and store the rounded values
    split_data = {
        'train': (y_train, train_predictions),
        'val': (y_val, val_predictions),
        'test': (y_test, test_predictions),
    }
    acc_by_split = {}
    f1_by_split = {}
    for split, (y_true, y_pred) in split_data.items():
        acc_by_split[split] = accuracy_score(y_true, y_pred)
        f1_by_split[split] = f1_score(y_true, y_pred, average='macro')
        accuracy_container[model_name][split] = round(acc_by_split[split], 4)
        f1_macro_container[model_name][split] = round(f1_by_split[split], 4)
    
    print("Training accuracy: " + str(acc_by_split['train']*100) + "%")
    print("Validation accuracy: " + str(acc_by_split['val']*100) + "%")
    print("Testing accuracy: " + str(acc_by_split['test']*100) + "%")
    print("Training F1-macro: ", str(f1_by_split['train']*100) + "%")
    # report the validation score here (the training score was printed before)
    print("Validation F1-macro: ", str(f1_by_split['val']*100) + "%")
    print("Testing F1-macro: ", str(f1_by_split['test']*100) + "%")
    print("Training time: " + str(training_time_container[model_name]) + 's')
    print("Prediction time: " + str(prediction_time_container[model_name]) + 's')

    
    # classification report
    df_report = pd.DataFrame(classification_report(y_test, test_predictions, output_dict=True)).transpose()
    
    # plot the confusion matrix
    if show_cm:
        cm = confusion_matrix(y_test, test_predictions)
        disp = ConfusionMatrixDisplay(confusion_matrix=cm,
                                      display_labels=label_encoder.classes_)

        plt.style.use('default')
        fig, ax = plt.subplots(figsize=(10, 10))
        disp = disp.plot(xticks_rotation='vertical', ax=ax, cmap='summer')

        plt.show()
    return df_report

# K-nearest Neighbor

In [None]:
from sklearn.neighbors import KNeighborsClassifier

In [None]:
# On TF-IDF
train_model(KNeighborsClassifier() , X_data, y_data, X_test, y_test, model_name='knn',show_cm=True)

GridSearch for K nearest Neighbor Algorithm

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import hamming_loss, make_scorer

# candidate hyper-parameters: odd k from 1 to 13 and both weighting schemes
k_range = list(range(1,15,2))
weight_options = ["uniform", "distance"]

param_grid = {'n_neighbors': k_range, 'weights': weight_options}
knn = KNeighborsClassifier()

# cross-validated search scored by macro-F1; train scores are kept as well
grid = GridSearchCV(knn, param_grid, scoring='f1_macro', return_train_score=True)
grid.fit(X_data, y_data)

# full CV table, then the best score, parameters and refitted estimator
results = pd.DataFrame(grid.cv_results_)
for summary in (results, grid.best_score_, grid.best_params_, grid.best_estimator_):
    print(summary)


In [None]:
score_metric="f1_macro"
best = np.argmax(results.mean_test_score.values)
cv = 5
print("\n------Plotting Cross-Validated Grid Search Results--------\n")
def plot_Xvalidated_grid(results, grid_classifier, score_metric="f1_macro", cv=None):
    """Plot per-fold and mean test scores for every grid-search candidate.

    Parameters
    ----------
    results : DataFrame built from ``grid_classifier.cv_results_``; must contain
        ``mean_test_score`` and ``split<k>_test_score`` columns.
    grid_classifier : the fitted search object; its ``cv_results_['params']``
        provide the x tick labels.
    score_metric : metric name used in the legend text.
    cv : number of folds to plot. Defaults to ``grid_classifier.n_splits_``
        when available, else 5.

    The best candidate (highest ``mean_test_score`` in `results`) is derived
    from the arguments, not from module-level variables, so the function gives
    the right picture for whichever search it is called with.
    """
    if len(results) == 0:
        raise ValueError("results is empty: there is nothing to plot")
    if cv is None:
        cv = getattr(grid_classifier, 'n_splits_', 5)
    best = int(np.argmax(results.mean_test_score.values))
    split_columns = ['split%d_test_score' % fold for fold in range(cv)]

    plt.figure(figsize=(10, 3))
    plt.xlim(-1, len(results))

    for i, (_, row) in enumerate(results.iterrows()):
        scores = row[split_columns]
        # grey triangles: individual fold scores of candidate i
        marker_cv, = plt.plot([i] * cv, scores, '^', c='gray', markersize=5,
                                  alpha=.5)
        # hollow black triangle: mean over folds
        marker_mean, = plt.plot(i, row.mean_test_score, 'v', c='none', alpha=1,
                                    markersize=10, markeredgecolor='k')
        if i == best:
            # red circle around the best mean score
            marker_best, = plt.plot(i, row.mean_test_score, 'o', c='red',
                                        fillstyle="none", alpha=1, markersize=20,
                                        markeredgewidth=3)

    plt.xticks(range(len(results)), [str(x).strip("{}").replace("'", "") for x
                            in grid_classifier.cv_results_['params']],rotation=90)
    plt.ylabel("Validation score")
    plt.xlabel("Parameter settings")
    plt.legend([marker_cv, marker_mean, marker_best],
                   ["cv "+score_metric, "mean "+score_metric, "best parameter setting"],
                   loc="best")
    plt.show()

plot_Xvalidated_grid(results,grid)

In [None]:
train_model(grid.best_estimator_, X_data, y_data, X_test, y_test, model_name='knn', show_cm=True)

# Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

In [None]:
# standardise each feature with statistics learned from the training data
# only, then apply the same scaling to the test data
scaler = StandardScaler()
X_data_scaler = scaler.fit_transform(X_data)
X_test_scaler = scaler.transform(X_test)


In [None]:
# On TF-IDF
model = LogisticRegression(max_iter=1000, multi_class='ovr')
train_model(model , X_data_scaler, y_data, X_test_scaler, y_test, model_name='linear_clf',show_cm=True)

GridSearch for Linear Classifier Algorithm

In [None]:
# values of C to try, from 1e-4 to 1e2 in powers of ten
C_options = [0.0001, 0.001, 0.01, 0.1, 1, 10, 100]

param_grid = {'C': C_options}
logistic_reg = LogisticRegression(max_iter=1000, penalty='l2', multi_class='ovr')

# cross-validated search scored by macro-F1; train scores are kept as well
grid = GridSearchCV(logistic_reg, param_grid, scoring='f1_macro', return_train_score=True)
grid.fit(X_data_scaler, y_data)

# full CV table, then the best score, parameters and refitted estimator
results = pd.DataFrame(grid.cv_results_)
for summary in (results, grid.best_score_, grid.best_params_, grid.best_estimator_):
    print(summary)

In [None]:
score_metric="f1_macro"
best = np.argmax(results.mean_test_score.values)
cv = 5
print("\n------Plotting Cross-Validated Grid Search Results--------\n")
def plot_Xvalidated_grid(results, grid_classifier, score_metric="f1_macro", cv=None):
    """Plot per-fold and mean test scores for every grid-search candidate.

    Parameters
    ----------
    results : DataFrame built from ``grid_classifier.cv_results_``; must contain
        ``mean_test_score`` and ``split<k>_test_score`` columns.
    grid_classifier : the fitted search object; its ``cv_results_['params']``
        provide the x tick labels.
    score_metric : metric name used in the legend text.
    cv : number of folds to plot. Defaults to ``grid_classifier.n_splits_``
        when available, else 5.

    The best candidate (highest ``mean_test_score`` in `results`) is derived
    from the arguments, not from module-level variables, so the function gives
    the right picture for whichever search it is called with.
    """
    if len(results) == 0:
        raise ValueError("results is empty: there is nothing to plot")
    if cv is None:
        cv = getattr(grid_classifier, 'n_splits_', 5)
    best = int(np.argmax(results.mean_test_score.values))
    split_columns = ['split%d_test_score' % fold for fold in range(cv)]

    plt.figure(figsize=(10, 3))
    plt.xlim(-1, len(results))

    for i, (_, row) in enumerate(results.iterrows()):
        scores = row[split_columns]
        # grey triangles: individual fold scores of candidate i
        marker_cv, = plt.plot([i] * cv, scores, '^', c='gray', markersize=5,
                                  alpha=.5)
        # hollow black triangle: mean over folds
        marker_mean, = plt.plot(i, row.mean_test_score, 'v', c='none', alpha=1,
                                    markersize=10, markeredgecolor='k')
        if i == best:
            # red circle around the best mean score
            marker_best, = plt.plot(i, row.mean_test_score, 'o', c='red',
                                        fillstyle="none", alpha=1, markersize=20,
                                        markeredgewidth=3)

    plt.xticks(range(len(results)), [str(x).strip("{}").replace("'", "") for x
                            in grid_classifier.cv_results_['params']],rotation=90)
    plt.ylabel("Validation score")
    plt.xlabel("Parameter settings")
    plt.legend([marker_cv, marker_mean, marker_best],
                   ["cv "+score_metric, "mean "+score_metric, "best parameter setting"],
                   loc="best")
    plt.show()

plot_Xvalidated_grid(results,grid)

In [None]:
train_model(grid.best_estimator_, X_data_scaler, y_data, X_test_scaler, y_test, model_name='linear_clf', show_cm=True)

# Artificial Neural Network

In [None]:
def create_ann_model(input_shape, num_classes=20):
    """Build and compile a fully connected softmax classifier.

    Architecture: input of width `input_shape` -> Dense(512, relu) ->
    Dense(256, relu) -> Dense(256, relu) -> Dense(`num_classes`, softmax).
    Compiled with Adam, sparse categorical cross-entropy and the 'accuracy'
    metric.

    Parameters
    ----------
    input_shape : number of input features (an int, not a tuple).
    num_classes : width of the softmax output layer; defaults to 20, the value
        that was previously hard-coded.

    Returns the compiled Keras model.
    """
    input_layer = Input(shape=(input_shape,))
    layer = Dense(512, activation='relu')(input_layer)
    layer = Dense(256, activation='relu')(layer)
    layer = Dense(256, activation='relu')(layer)
    output_layer = Dense(num_classes, activation='softmax')(layer)
    
    classifier = models.Model(input_layer, output_layer)
    classifier.compile(optimizer=optimizers.Adam(), loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    
    return classifier

In [None]:
train_model(create_ann_model(X_data.shape[1]), X_data, y_data.values, X_test, y_test.values, model_name = 'ann', is_neuralnet=True, epochs=3,show_cm=True)

Add Dropout layer

In [None]:
def create_ann_model(input_shape, num_classes=20, dropout_rate=0.5):
    """Build and compile the fully connected classifier with dropout.

    Architecture: input of width `input_shape` -> Dense(512, relu) -> Dropout
    -> Dense(256, relu) -> Dropout -> Dense(256, relu) ->
    Dense(`num_classes`, softmax). Compiled with Adam, sparse categorical
    cross-entropy and the 'accuracy' metric. This definition replaces any
    earlier `create_ann_model` in the notebook.

    Parameters
    ----------
    input_shape : number of input features (an int, not a tuple).
    num_classes : width of the softmax output layer; defaults to 20, the value
        that was previously hard-coded.
    dropout_rate : rate used by both Dropout layers; defaults to 0.5, the value
        that was previously hard-coded.

    Returns the compiled Keras model.
    """
    input_layer = Input(shape=(input_shape,))
    layer = Dense(512, activation='relu')(input_layer)
    layer = Dropout(dropout_rate)(layer)
    layer = Dense(256, activation='relu')(layer)
    layer = Dropout(dropout_rate)(layer)
    layer = Dense(256, activation='relu')(layer)
    output_layer = Dense(num_classes, activation='softmax')(layer)
    
    classifier = models.Model(input_layer, output_layer)
    classifier.compile(optimizer=optimizers.Adam(), loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    
    return classifier

In [None]:
train_model(create_ann_model(X_data.shape[1]), X_data, y_data.values, X_test, y_test.values, model_name = 'ann', is_neuralnet=True, epochs=7,show_cm=True)

# Result

In [None]:
def convert_result(training_time_container, prediction_time_container, accuracy_container,
                   f1_container=None):
    """Collect the per-model results into one summary DataFrame.

    Parameters
    ----------
    training_time_container : dict, model name -> training time in seconds.
        Its keys determine which models appear and in what order.
    prediction_time_container : dict, model name -> prediction time in seconds.
    accuracy_container : dict, model name -> {'train', 'val', 'test'} accuracy.
    f1_container : dict, model name -> {'train', 'val', 'test'} macro-F1.
        Defaults to the module-level `f1_macro_container`.

    Returns
    -------
    pandas.DataFrame
        One row per model; scores are multiplied by 100 and times are rendered
        as strings with an 's' suffix.
    """
    if f1_container is None:
        f1_container = f1_macro_container
    columns = ["Model name", "Train Score", "Validation Score", "Test Score","F1 macro train", "F1 macro val", "F1 macro test", "Training Time", "Prediction Time"]
    # Build all rows first and create the frame once: DataFrame.append was
    # removed in pandas 2.0 and copied the whole frame on every call.
    records = [
        {
            "Model name": model,
            "Train Score": accuracy_container[model]['train']*100,
            "Validation Score": accuracy_container[model]['val']*100,
            "Test Score": accuracy_container[model]['test']*100,
            "F1 macro train": f1_container[model]['train']*100,
            "F1 macro val": f1_container[model]['val']*100,
            "F1 macro test": f1_container[model]['test']*100,
            "Training Time": str(training_time_container[model]) + 's',
            "Prediction Time": str(prediction_time_container[model]) + 's'
        }
        for model in training_time_container
    ]
    return pd.DataFrame(records, columns=columns)


: 

In [None]:
convert_result(training_time_container, prediction_time_container, accuracy_container)

In [None]:
import plotly.graph_objects as go

In [None]:
data = list(accuracy_container.values())
model_names = list(accuracy_container.keys())
# one bar trace per split; each bar is labelled with the value it shows
accuracy_bars = []
for split in ('train', 'val', 'test'):
    accuracy_bars.append(
        go.Bar(name=split, x=model_names,
               y=[scores[split] for scores in data],
               text=[scores[split] for scores in data],
               textposition='auto'))
fig = go.Figure(data=accuracy_bars)

fig.update_layout(autosize=True, plot_bgcolor='rgb(275, 275, 275)',
                  title="Comparison of Accuracy Scores of different classifiers",
                  xaxis_title="Machine Learning Models",
                  yaxis_title="Accuracy Scores",
                  barmode='group')

# outline only the first ('train') trace
first_trace = fig.data[0]
first_trace.marker.line.width = 3
first_trace.marker.line.color = "white"
fig

In [None]:
data = list(f1_macro_container.values())
model_names = list(f1_macro_container.keys())
# one bar trace per split; each bar is labelled with the value it shows
f1_bars = []
for split in ('train', 'val', 'test'):
    f1_bars.append(
        go.Bar(name=split, x=model_names,
               y=[scores[split] for scores in data],
               text=[scores[split] for scores in data],
               textposition='auto'))
fig = go.Figure(data=f1_bars)

fig.update_layout(autosize=True, plot_bgcolor='rgb(275, 275, 275)',
                  title="Comparison of Marco-Average F1-score of different classifiers",
                  xaxis_title="Machine Learning Models",
                  yaxis_title="Macro-F1 Scores",
                  barmode='group')

# outline only the first ('train') trace
first_trace = fig.data[0]
first_trace.marker.line.width = 3
first_trace.marker.line.color = "white"
fig

In [None]:
# one bar per model, coloured by its position in the container
model_names = list(training_time_container.keys())
train_times = list(training_time_container.values())
time_bar = go.Bar(x=model_names, y=train_times,
                  marker={'color': np.arange(len(train_times))},
                  text=train_times, textposition='auto')
fig = go.Figure(data=[time_bar])

fig.update_layout(autosize=True, plot_bgcolor='rgb(275, 275, 275)',
                  title="Comparison of Training Time of different classifiers",
                  xaxis_title="Machine Learning Models",
                  yaxis_title="Training time in seconds")

# black outline around the bars
bar_trace = fig.data[0]
bar_trace.marker.line.width = 3
bar_trace.marker.line.color = "black"
fig

In [None]:
# one bar per model, coloured by its position in the container
model_names = list(prediction_time_container.keys())
predict_times = list(prediction_time_container.values())
time_bar = go.Bar(x=model_names, y=predict_times,
                  marker={'color': np.arange(len(predict_times))},
                  text=predict_times, textposition='auto')
fig = go.Figure(data=[time_bar])

fig.update_layout(autosize=True, plot_bgcolor='rgb(275, 275, 275)',
                  title="Comparison of Prediction Time of different classifiers",
                  xaxis_title="Machine Learning Models",
                  yaxis_title="Prediction time in seconds")

# black outline around the bars
bar_trace = fig.data[0]
bar_trace.marker.line.width = 3
bar_trace.marker.line.color = "black"
fig