<a href="https://colab.research.google.com/github/modinho22/cyberbullying-detection/blob/main/notebook1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:

# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES
# TO THE CORRECT LOCATION (/kaggle/input) IN YOUR NOTEBOOK,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

# Number of bytes requested from the HTTP response per read while downloading.
CHUNK_SIZE = 40960
# Comma-separated list of "<directory>:<url-encoded download URL>" entries.
# NOTE: the URL is a signed link (X-Goog-Date=20240731T185208Z, X-Goog-Expires=259200 seconds),
# so it stops working once that window has passed and must be regenerated from Kaggle.
DATA_SOURCE_MAPPING = 'suspicious-communication-on-social-platforms:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-data-sets%2F1382245%2F2293133%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240731%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240731T185208Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D1243d022fc4dbab8a8c369fdf6980d3bce2694be3f04f9d03e31b8944fc5f5446e49aa294d20ba4fc0d9b5d48445e6305197673c71ac21eb0e572138147f366ea72af45a89222d24b2849d565d5ca9c966849af58fc247f266972f5579b6d7b08d31831f1cf3092c5943caef4c1b6b3d0de560dc180307c8493eb726f5a9e3add7eca8583fcc2310b5b3f6c3d6dd5c15ed6702c60d61db1d0d828730047d2a4aff94128f6936f139db8783dab1cc169675d61f3986b6fe07280aec10f15fcdd1dc113bcfbb09e709dd64978c3d47e829bd072deeea4bad35e6b5ccdd35f2548b2c635118e84f8f09eee0c5a03bfb2d9e013d71087f3f3f21056d44864f2cde87'

# Directories that reproduce Kaggle's layout inside this notebook environment.
KAGGLE_INPUT_PATH='/kaggle/input'
KAGGLE_WORKING_PATH='/kaggle/working'
# NOTE(review): KAGGLE_SYMLINK is not referenced anywhere in the visible code.
KAGGLE_SYMLINK='kaggle'

# IPython shell escape (not plain Python): unmount a previous input mount, discarding any error output.
!umount /kaggle/input/ 2> /dev/null
# Start from an empty input directory; a missing directory is not treated as an error.
shutil.rmtree('/kaggle/input', ignore_errors=True)
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

# Create ../input and ../working symlinks pointing at the directories above.
# A link that already exists (FileExistsError) is left as it is.
try:
  os.symlink(KAGGLE_INPUT_PATH, os.path.join("..", 'input'), target_is_directory=True)
except FileExistsError:
  pass
try:
  os.symlink(KAGGLE_WORKING_PATH, os.path.join("..", 'working'), target_is_directory=True)
except FileExistsError:
  pass

# Download every data source listed in DATA_SOURCE_MAPPING into
# KAGGLE_INPUT_PATH/<directory> and unpack it there (zip or tar archive).
for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    # Each entry has the form "<directory>:<url-encoded download URL>".
    directory, download_url_encoded = data_source_mapping.split(':')
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            total_length = fileres.headers['content-length']
            print(f'Downloading {directory}, {total_length} bytes compressed')
            # Content-Length is optional; without it the progress bar simply
            # stays empty instead of crashing on int(None).
            expected_bytes = int(total_length) if total_length else 0
            dl = 0
            data = fileres.read(CHUNK_SIZE)
            while len(data) > 0:
                dl += len(data)
                tfile.write(data)
                done = min(50, int(50 * dl / expected_bytes)) if expected_bytes else 0
                sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                sys.stdout.flush()
                data = fileres.read(CHUNK_SIZE)
            # Push buffered bytes to disk: tarfile.open() below re-opens the
            # temporary file by name and could otherwise read a truncated archive.
            tfile.flush()
            if filename.endswith('.zip'):
                with ZipFile(tfile) as zfile:
                    zfile.extractall(destination_path)
            else:
                # Bind the archive to its own name so the tarfile module is
                # not overwritten by the opened archive object.
                with tarfile.open(tfile.name) as tar_archive:
                    tar_archive.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError:
        # Signed download URLs expire, which is the most likely HTTP failure.
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
        continue
    except OSError:
        print(f'Failed to load {download_url} to path {destination_path}')
        continue

print('Data source import complete.')


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
#reading the csv file from the local Kaggle input directory filled by the download cell above
#(despite its name, `url` holds a filesystem path, not a web link)
url = '/kaggle/input/suspicious-communication-on-social-platforms/Suspicious Communication on Social Platforms.csv'
df = pd.read_csv(url)

#looking at the first five rows using .head() function
#NOTE(review): this is not the last expression of the cell, so the notebook will probably not display it - confirm
df.head()

#importing natural language Toolkit - A tool to preprocesses/clean text
import nltk

In [None]:
#Displaying the nltk version that is installed in the current environment
#Warning: install all the libraries required for this project before running the notebook
nltk.__version__

# Preprocessing

In [None]:
df.shape

In [None]:
#keeping only the rows whose 'tagging' value can be parsed as a number
#pd.to_numeric(..., errors='coerce') turns unparseable values (example: None, NotAvailable, ValueMissing)
#into NaN in a temporary Series; .notnull() builds the mask that drops those rows from df
#the 'tagging' column itself is not converted here
df=df[pd.to_numeric(df['tagging'], errors='coerce').notnull()]

In [None]:
#Droping all rows that contain Nan value
df = df.dropna(axis=0)

In [None]:
#If the shape is the same as before, there were no missing values in our dataset
df.shape

In [None]:
import string
from nltk import pos_tag#pos_tag is a tool that tags the part of speech to the word(POS = Part of Speech)
#example: tagging the word 'drinking' as verb

#function for removing punctuations
def tokenize_remove_punctuation(text):
  """Split *text* on single spaces and strip punctuation from every token.

  Every character listed in ``string.punctuation`` is removed, so a token
  such as 'reading?' becomes 'reading'. One output token is produced per
  input token; a token that is empty or consists only of punctuation
  becomes the empty string ''.

  Args:
    text: the sentence to tokenize (a str).

  Returns:
    A list of str tokens in their original order.
  """
  # Translation table that deletes every punctuation character in one pass.
  strip_punctuation = str.maketrans("", "", string.punctuation)
  # Always emit a str: empty tokens (e.g. from consecutive spaces) used to
  # leak through as empty list objects because the re-join only happened
  # inside the per-character loop.
  return [word.translate(strip_punctuation) for word in text.split(" ")]


In [None]:
# using a sample sentence to see whether the function works well or not
# NOTE: We have only created the functions so far. We haven't done anything with our dataset yet.
trial_text = tokenize_remove_punctuation("hello @anyone reading? wt is the name of am in that this  ??!@")
trial_text

In [None]:
import nltk
#downloads the list of stopwords
nltk.download('stopwords')

#'stopwords' is a list of very common words that carry almost no meaning on their own
#example : 'I am a boy' is reduced to 'boy'
#words like 'I', 'am', 'a' contribute very little to the meaning of the sentence

In [None]:
#storing all the stopwords in the list named 'stopwords'
stopwords = nltk.corpus.stopwords.words('english')  #storing only english stopwords , there are stopwords for other language also such as chinese and french

# Function to remove all the stopwords from the sentence
def remove_stopwords(text):
  """Return the tokens of *text* that are not in the module-level ``stopwords`` list.

  The relative order of the remaining tokens is unchanged.
  """
  return [token for token in text if token not in stopwords]

In [None]:
# using a sample sentence to see whether the funtion works well or not
remove_stopwords(trial_text)

In [None]:
#tagging all the words according o their part of speech
def pos_tagging(text):
    """Tag every token in *text* with its part of speech.

    Args:
        text: a list of word tokens.

    Returns:
        The list of (token, tag) pairs produced by ``nltk.pos_tag``. If
        tagging fails, the error is printed and an empty list is returned so
        that callers can still iterate over the result.
    """
    try:
        return nltk.pos_tag(text)
    except Exception as e:  # was misspelled 'Excepton', which raised NameError instead
        print(e)
        return []

In [None]:
from nltk.corpus import wordnet

#wordnet provides the part-of-speech constants (ADJ, VERB, NOUN, ADV); get_wordnet maps a POS tag onto one of them
def get_wordnet(pos_tag):
  """Map a part-of-speech tag onto the matching wordnet constant.

  Tags starting with 'J', 'V' and 'R' give wordnet.ADJ, wordnet.VERB and
  wordnet.ADV respectively; every other tag (including those starting with
  'N') gives wordnet.NOUN.
  """
  prefix_to_constant = (('J', 'ADJ'), ('V', 'VERB'), ('R', 'ADV'))
  for prefix, constant_name in prefix_to_constant:
    if pos_tag.startswith(prefix):
      return getattr(wordnet, constant_name)
  # Nouns and all unrecognised tags share the same result.
  return wordnet.NOUN

In [None]:
from nltk.stem import WordNetLemmatizer
#WordNetLemmatizer is a tool that converts a word into its dictionary form (lemma)
#Example: cats(word) is converted into cat(lemma)

#Now we will create a function that uses all the functions that we have created above

def clean_text(text):
  """Normalise one comment into a space-separated string of lemmas.

  Steps, in order: convert to str and lower-case, tokenize and strip
  punctuation, drop tokens containing digits, drop stopwords, drop empty
  tokens, POS-tag, lemmatize using the tag, drop one-letter words, join.

  Args:
    text: the raw comment; any value is accepted because it is passed
      through str() first.

  Returns:
    The cleaned comment as a single str (possibly empty).
  """
  tokens = tokenize_remove_punctuation(str(text).lower())
  # remove words containing numericals
  tokens = [word for word in tokens if not any(c.isdigit() for c in word)]
  tokens = remove_stopwords(tokens)
  # remove empty tokens
  tokens = [t for t in tokens if len(t) > 0]
  # pos_tagging may yield nothing usable when tagging fails; treat that as "no tokens"
  pos_tags = pos_tagging(tokens) or []
  # one lemmatizer per call instead of one per token
  lemmatizer = WordNetLemmatizer()
  lemmas = [lemmatizer.lemmatize(word, get_wordnet(tag)) for word, tag in pos_tags]
  # remove words with only one letter
  lemmas = [t for t in lemmas if len(t) > 1]
  return " ".join(lemmas)

In [None]:
#'averaged_perceptron_tagger' is the pre-trained part-of-speech model used by nltk.pos_tag
#it has to be downloaded before pos_tagging() / clean_text() can tag words
nltk.download('averaged_perceptron_tagger')

#Downloading the wordnet tool
nltk.download('wordnet')

In [None]:

# using a sample sentence to see whether the funtion works well or not
clean_text("What is y0ur name? THis is a cat!! 12?")

In [None]:
df['tagging'].value_counts(normalize=True).plot(kind='bar', title='Ratio of observations')
#ploting number of '1' and '0' in the 'tagging' colomn of the dataset

#As we can see the difference in number of '1' and '0' is not very large hence we call it as a balanced dataset
#NOTE:It's very important to provide a balanced dataset for creating the model

In [None]:
#This line of code converts the 'tagging' labels into plain integers
#example: converting 1.0, "1", "1.0" into integer 1
#example: converting 0.0, "0", "0.0" into integer 0
#pd.to_numeric parses numbers and numeric strings alike; the previous astype(str).astype(int)
#round-trip raised ValueError for any value rendered as "1.0"
df['tagging'] = pd.to_numeric(df['tagging']).astype(int)

In [None]:
df.info()

In [None]:
df.shape

In [None]:
#reseting the index of rows
# Index gets unordered if we drop some rows in our dataset(Example while using dropna() function)
df.reset_index(inplace = True, drop = True)

In [None]:
#this line of code will now be used to pply the functions on each sentences in the 'comments' column
#This will take time as it will use the 'clean_text' function on all the sentences in our dataset
#the .map() function applies the function at each sentences in the 'comments' column
df['Processed_Comment'] = df['comments'].map(clean_text)

In [None]:
#Splitting dataset for training and testing(80:20)

from sklearn.model_selection import train_test_split
#train_test_split is a function that splits the dataset into two parts
#80%(for training the model) and 20%(for testing the model)
#This function returns 4 values
# 1 'Processed_Comment' for training
# 2 'Processed_Comment' for testing
# 3 'tagging' for training
# 4 'tagging' for testing
X_train, X_test, y_train, y_test = train_test_split(df['Processed_Comment'],
                                                    df['tagging'],
                                                    random_state=42,test_size=0.20)

#random_state seeds the shuffling done before the split, so the same split is produced on every run
#test_size=0.20 means that 20% of the dataset is to be allocated for testing of the model

In [None]:
#Creating a bag of words from training data
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
count_vector = CountVectorizer()
#fit_transform learns the vocabulary from the training comments only;
#transform then reuses that same vocabulary for the test comments
X_train = count_vector.fit_transform(X_train)
X_test = count_vector.transform(X_test)

#vectorizing here means counting how often each vocabulary word occurs in each comment
#the counts are raw frequencies; they do not by themselves say how much a word indicates cyberbullying
#the result is a sparse matrix with one row per comment and one column per vocabulary word

In [None]:
#gives the number of columns in the vectorized table
len(count_vector.vocabulary_)

In [None]:
#Importing all the terms by which we get accuracy of our model
#NOTE:we haven't trained our model yet
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score
#creates a confusion matrix
from sklearn.metrics import confusion_matrix
data =[] #Empty list created to add the accuracy terms of each model that we are going to train

In [None]:
# Importing all the necessary models
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.datasets import load_iris
import pandas as pd
import numpy as np

# Train on the cyberbullying data prepared in the cells above: X_train / X_test
# are the CountVectorizer bag-of-words matrices and y_train / y_test are the
# 'tagging' labels. This cell used to load the Iris sample dataset and
# overwrite those four variables (and `data`), which meant every model below
# was tuned, scored and saved on Iris instead of on the comments.

# Search space of every candidate model, as (name, estimator, hyper-parameter grid).
_search_spaces = [
    ('LinearSVC',
     LinearSVC(max_iter=1000000, random_state=42),
     {'C': [0.1, 1, 5, 10, 20]}),
    ('MultinomialNB',
     MultinomialNB(),
     {'alpha': np.linspace(0.5, 1.5, 6),
      'fit_prior': [True, False]}),
    ('LogisticRegression',
     LogisticRegression(random_state=42, max_iter=1000000),
     {'C': [1, 5, 10],
      'solver': ['newton-cg', 'lbfgs', 'liblinear']}),
    ('KNeighborsClassifier',
     KNeighborsClassifier(),
     {'n_neighbors': [5, 9, 11, 23],
      'weights': ['uniform', 'distance'],
      'metric': ['euclidean', 'manhattan', 'minkowski']}),
    ('RandomForestClassifier',
     RandomForestClassifier(random_state=42),
     {'n_estimators': [10, 50, 100],
      'criterion': ['gini', 'entropy'],
      'max_depth': [None, 10, 20, 30]}),
    ('GradientBoostingClassifier',
     GradientBoostingClassifier(random_state=42),
     {'n_estimators': [50, 100, 200],
      'learning_rate': [0.01, 0.1, 0.2],
      'max_depth': [3, 5, 7]}),
    ('AdaBoostClassifier',
     AdaBoostClassifier(random_state=42),
     {'n_estimators': [50, 100, 200],
      'learning_rate': [0.01, 0.1, 1]}),
    ('DecisionTreeClassifier',
     DecisionTreeClassifier(random_state=42),
     {'criterion': ['gini', 'entropy'],
      'max_depth': [None, 10, 20, 30],
      'min_samples_split': [2, 5, 10]}),
    ('MLPClassifier',
     MLPClassifier(random_state=42, max_iter=2000),
     {'hidden_layer_sizes': [(50, 50), (100,)],
      'activation': ['tanh', 'relu'],
      'solver': ['sgd', 'adam'],
      'alpha': [0.0001, 0.001],
      'learning_rate_init': [0.001, 0.01, 0.1]}),
    ('ExtraTreesClassifier',
     ExtraTreesClassifier(random_state=42),
     {'n_estimators': [10, 50, 100],
      'criterion': ['gini', 'entropy'],
      'max_depth': [None, 10, 20, 30]}),
]

# name -> {'model': estimator, 'params': grid}, in the order listed above
model_params = {
    name: {'model': estimator, 'params': grid}
    for name, estimator, grid in _search_spaces
}

# Run a 5-fold grid search for every model and remember the winning configuration
scores = []

for model_name, mp in model_params.items():
    clf = GridSearchCV(estimator=mp['model'], param_grid=mp['params'], cv=5, return_train_score=False)
    clf.fit(X_train, y_train)
    # one summary entry per model
    best_entry = dict(model=model_name, best_score=clf.best_score_, best_params=clf.best_params_)
    scores.append(best_entry)

# Table of the best cross-validation score and parameters per model
best_columns = ['model', 'best_score', 'best_params']
df4 = pd.DataFrame(scores, columns=best_columns)
print(df4)


In [None]:
# Import necessary libraries
from sklearn.metrics import accuracy_score, recall_score, f1_score, precision_score

# Candidate models and the hyper-parameter grid searched for each of them
# (same content as the dictionary built in the previous cell).
model_params = {
    'LinearSVC': dict(
        model=LinearSVC(max_iter=1000000, random_state=42),
        params=dict(C=[0.1, 1, 5, 10, 20]),
    ),
    'MultinomialNB': dict(
        model=MultinomialNB(),
        params=dict(
            alpha=np.linspace(0.5, 1.5, 6),
            fit_prior=[True, False],
        ),
    ),
    'LogisticRegression': dict(
        model=LogisticRegression(random_state=42, max_iter=1000000),
        params=dict(
            C=[1, 5, 10],
            solver=['newton-cg', 'lbfgs', 'liblinear'],
        ),
    ),
    'KNeighborsClassifier': dict(
        model=KNeighborsClassifier(),
        params=dict(
            n_neighbors=[5, 9, 11, 23],
            weights=['uniform', 'distance'],
            metric=['euclidean', 'manhattan', 'minkowski'],
        ),
    ),
    'RandomForestClassifier': dict(
        model=RandomForestClassifier(random_state=42),
        params=dict(
            n_estimators=[10, 50, 100],
            criterion=['gini', 'entropy'],
            max_depth=[None, 10, 20, 30],
        ),
    ),
    'GradientBoostingClassifier': dict(
        model=GradientBoostingClassifier(random_state=42),
        params=dict(
            n_estimators=[50, 100, 200],
            learning_rate=[0.01, 0.1, 0.2],
            max_depth=[3, 5, 7],
        ),
    ),
    'AdaBoostClassifier': dict(
        model=AdaBoostClassifier(random_state=42),
        params=dict(
            n_estimators=[50, 100, 200],
            learning_rate=[0.01, 0.1, 1],
        ),
    ),
    'DecisionTreeClassifier': dict(
        model=DecisionTreeClassifier(random_state=42),
        params=dict(
            criterion=['gini', 'entropy'],
            max_depth=[None, 10, 20, 30],
            min_samples_split=[2, 5, 10],
        ),
    ),
    'MLPClassifier': dict(
        model=MLPClassifier(random_state=42, max_iter=2000),
        params=dict(
            hidden_layer_sizes=[(50, 50), (100,)],
            activation=['tanh', 'relu'],
            solver=['sgd', 'adam'],
            alpha=[0.0001, 0.001],
            learning_rate_init=[0.001, 0.01, 0.1],
        ),
    ),
    'ExtraTreesClassifier': dict(
        model=ExtraTreesClassifier(random_state=42),
        params=dict(
            n_estimators=[10, 50, 100],
            criterion=['gini', 'entropy'],
            max_depth=[None, 10, 20, 30],
        ),
    ),
}

# Results of the hyper-parameter search (one dict per model) and the
# per-model rows of train/test evaluation metrics.
scores = []
data = []

for model_name, mp in model_params.items():
    clf = GridSearchCV(mp['model'], mp['params'], cv=5, return_train_score=False)
    clf.fit(X_train, y_train)

    # Best estimator from GridSearchCV (refitted on the whole training set)
    best_model = clf.best_estimator_

    # Predictions for training and testing datasets
    predictions_train = best_model.predict(X_train)
    predictions_test = best_model.predict(X_test)

    # Storing the best parameters and scores
    scores.append({
        'model': model_name,
        'best_score': clf.best_score_,
        'best_params': clf.best_params_
    })

    # Row layout: model name, then accuracy / recall / F1 / precision for the
    # training data followed by the same four metrics for the testing data.
    # scikit-learn metrics take (y_true, y_pred): with the arguments reversed,
    # precision and recall trade places, so the true labels must come first.
    temp = [model_name]
    for y_true, y_pred in ((y_train, predictions_train), (y_test, predictions_test)):
        temp.append(accuracy_score(y_true, y_pred))
        temp.append(recall_score(y_true, y_pred, average='macro'))
        temp.append(f1_score(y_true, y_pred, average='macro'))
        temp.append(precision_score(y_true, y_pred, average='macro'))

    # Storing all the accuracy terms in 'data' list
    data.append(temp)

# Creating a DataFrame for best parameters and their cross-validation scores
df4 = pd.DataFrame(scores, columns=['model', 'best_score', 'best_params'])
print(df4)

# Creating a DataFrame for accuracy terms
columns = ['Model', 'Train Accuracy', 'Train Recall', 'Train F1', 'Train Precision', 'Test Accuracy', 'Test Recall', 'Test F1', 'Test Precision']
df5 = pd.DataFrame(data, columns=columns)
print(df5)


# Summary

In [None]:
#Summary table: one row per trained ML model, one column per train/test metric
summary_columns = [
    'Algorithm',
    'Accuracy Score : Train',
    'Recall Score : Train',
    'F1-Score :Train',
    'Precision Score :Train',
    'Accuracy Score : Test',
    'Recall Score : Test',
    'F1-Score : Test',
    'Precision Score : Test',
]
result = pd.DataFrame(data, columns=summary_columns).reset_index(drop=True)

In [None]:
result

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Model names shared by every panel's x axis
algorithms = result['Algorithm']

# 2 x 2 grid: one panel per metric
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(16, 12))
plt.subplots_adjust(hspace=0.4)

# Bar colours for the two data splits
train_color = 'skyblue'
test_color = 'orange'

# (axis, train column, test column, train legend label, test legend label, panel title)
panels = [
    (axes[0, 0], 'Accuracy Score : Train', 'Accuracy Score : Test',
     'Train Accuracy', 'Test Accuracy', 'Accuracy Scores'),
    (axes[0, 1], 'Recall Score : Train', 'Recall Score : Test',
     'Train Recall', 'Test Recall', 'Recall Scores'),
    (axes[1, 0], 'F1-Score :Train', 'F1-Score : Test',
     'Train F1-Score', 'Test F1-Score', 'F1-Scores'),
    (axes[1, 1], 'Precision Score :Train', 'Precision Score : Test',
     'Train Precision', 'Test Precision', 'Precision Scores'),
]

for ax, train_column, test_column, train_label, test_label, title in panels:
    # The test bars are drawn on top of the train bars on the same axis
    sns.barplot(x=algorithms, y=result[train_column], ax=ax, color=train_color, label=train_label)
    sns.barplot(x=algorithms, y=result[test_column], ax=ax, color=test_color, label=test_label)
    ax.set_title(title)
    ax.set_ylabel('Score')
    ax.legend()

# Displaying the plot
plt.tight_layout()
plt.show()


# Saving models in .pkl file

In [None]:
import pickle
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC, SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier

# Classifiers to train, keyed by the display name that is also used as the .pkl file name
clfs = {
    "Random Forest": RandomForestClassifier(random_state=42),
    "Gradient Boosting": GradientBoostingClassifier(random_state=42),
    "AdaBoost": AdaBoostClassifier(random_state=42),
    "LightGBM": LGBMClassifier(random_state=42),
    "XGBoost": XGBClassifier(eval_metric="mlogloss", random_state=42),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Support Vector Machine": SVC(random_state=42),
    "Naive Bayes": MultinomialNB(),
    "Multilayer Perceptron": MLPClassifier(random_state=42, max_iter=1000),
    "KNeighborsClassifier": KNeighborsClassifier(),
}

# Uses X_train, X_test, y_train, y_test as left behind by the cells above.

# (name, test accuracy) pairs in training order
accuracies = []

for name, clf in clfs.items():
    # Fit with the default hyper-parameters given above
    clf.fit(X_train, y_train)

    # Persist the fitted model as '<name>.pkl' in the current directory
    pickle_path = f'{name}.pkl'
    with open(pickle_path, 'wb') as file:
        pickle.dump(clf, file)

    # Mean accuracy on the held-out data
    accuracies.append((name, clf.score(X_test, y_test)))

# Report once every model has been trained and saved
for name, accuracy in accuracies:
    print(f'Model: {name}, Accuracy: {accuracy:.4f}')
