# YouTube Spam stack model

In [6]:
import pandas as pd
import zipfile
import pickle

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix, classification_report


In [7]:
z= zipfile.ZipFile("youtube+spam+collection (1).zip")
Psy = pd.read_csv(z.open("Youtube01-Psy.csv"))
Katy = pd.read_csv(z.open("Youtube02-KatyPerry.csv"))
LMFAO = pd.read_csv(z.open("Youtube03-LMFAO.csv"))
Eminem = pd.read_csv(z.open("Youtube04-Eminem.csv"))
Shakira = pd.read_csv(z.open("Youtube05-Shakira.csv"))


In [8]:
data = pd.concat([Psy, Katy, LMFAO, Eminem, Shakira])
data.drop(["COMMENT_ID", "AUTHOR", "DATE"], axis=1, inplace=True)


In [9]:
X_train, X_test, y_train, y_test = train_test_split(data["CONTENT"], data["CLASS"])


In [10]:
tfidf_vect = TfidfVectorizer(use_idf=True, lowercase= True)
X_train_tfidf = tfidf_vect.fit_transform(X_train)
X_train_tfidf.shape

(1467, 3746)

In [11]:
def save_tfidf_vocabulary(tfidf_vectorizer, filename):
    with open(filename, 'wb') as vocab_file:
        pickle.dump(tfidf_vectorizer.vocabulary_, vocab_file)

# Save the TF-IDF vocabulary to a file
save_tfidf_vocabulary(tfidf_vect, 'tfidf_vocabulary.pkl')


In [12]:
import joblib

# Save the TF-IDF vectorizer to a pickle file
joblib.dump(tfidf_vect, 'tfidf_vectorizer.pkl')

# Verify that the vectorizer has been saved
print("TF-IDF vectorizer saved to tfidf_vectorizer.pkl")


TF-IDF vectorizer saved to tfidf_vectorizer.pkl


# Classification models

In [13]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import matthews_corrcoef
from sklearn.metrics import f1_score


# Decision tree

In [14]:
from sklearn.tree import DecisionTreeClassifier

# Create and train the Decision Tree classifier
dt = DecisionTreeClassifier(max_depth=5)
dt.fit(X_train_tfidf, y_train)

# Make predictions
y_train_pred = dt.predict(X_train_tfidf)
y_test_pred = dt.predict(tfidf_vect.transform(X_test))

# Training set performance
dt_train_accuracy = accuracy_score(y_train, y_train_pred)
dt_train_mcc = matthews_corrcoef(y_train, y_train_pred)
dt_train_f1 = f1_score(y_train, y_train_pred, average='weighted')

# Test set performance
dt_test_accuracy = accuracy_score(y_test, y_test_pred)
dt_test_mcc = matthews_corrcoef(y_test, y_test_pred)
dt_test_f1 = f1_score(y_test, y_test_pred, average='weighted')

print('Model performance for Training set')
print('- Accuracy: %s' % dt_train_accuracy)
print('- MCC: %s' % dt_train_mcc)
print('- F1 score: %s' % dt_train_f1)
print('----------------------------------')
print('Model performance for Test set')
print('- Accuracy: %s' % dt_test_accuracy)
print('- MCC: %s' % dt_test_mcc)
print('- F1 score: %s' % dt_test_f1)

Model performance for Training set
- Accuracy: 0.9120654396728016
- MCC: 0.837202504494492
- F1 score: 0.9115440939602026
----------------------------------
Model performance for Test set
- Accuracy: 0.8936605316973415
- MCC: 0.7992257112749366
- F1 score: 0.8934291190139685


# Random forest

In [15]:
from sklearn.ensemble import RandomForestClassifier

# Create and train the Random Forest classifier
rf = RandomForestClassifier(n_estimators=10)
rf.fit(X_train_tfidf, y_train)

# Make predictions
y_train_pred = rf.predict(X_train_tfidf)
y_test_pred = rf.predict(tfidf_vect.transform(X_test))

# Training set performance
rf_train_accuracy = accuracy_score(y_train, y_train_pred)
rf_train_mcc = matthews_corrcoef(y_train, y_train_pred)
rf_train_f1 = f1_score(y_train, y_train_pred, average='weighted')

# Test set performance
rf_test_accuracy = accuracy_score(y_test, y_test_pred)
rf_test_mcc = matthews_corrcoef(y_test, y_test_pred)
rf_test_f1 = f1_score(y_test, y_test_pred, average='weighted')

print('Model performance for Training set')
print('- Accuracy: %s' % rf_train_accuracy)
print('- MCC: %s' % rf_train_mcc)
print('- F1 score: %s' % rf_train_f1)
print('----------------------------------')
print('Model performance for Test set')
print('- Accuracy: %s' % rf_test_accuracy)
print('- MCC: %s' % rf_test_mcc)
print('- F1 score: %s' % rf_test_f1)

Model performance for Training set
- Accuracy: 0.9938650306748467
- MCC: 0.9878030529094353
- F1 score: 0.9938654925584773
----------------------------------
Model performance for Test set
- Accuracy: 0.9406952965235174
- MCC: 0.8848498924374916
- F1 score: 0.9407265506491967


# Neural network

In [16]:
from sklearn.neural_network import MLPClassifier

# Create and train the MLP classifier
mlp = MLPClassifier(alpha=1, max_iter=1000)
mlp.fit(X_train_tfidf, y_train)

# Make predictions
y_train_pred = mlp.predict(X_train_tfidf)
y_test_pred = mlp.predict(tfidf_vect.transform(X_test))

# Training set performance
mlp_train_accuracy = accuracy_score(y_train, y_train_pred)
mlp_train_mcc = matthews_corrcoef(y_train, y_train_pred)
mlp_train_f1 = f1_score(y_train, y_train_pred, average='weighted')

# Test set performance
mlp_test_accuracy = accuracy_score(y_test, y_test_pred)
mlp_test_mcc = matthews_corrcoef(y_test, y_test_pred)
mlp_test_f1 = f1_score(y_test, y_test_pred, average='weighted')

print('Model performance for Training set')
print('- Accuracy: %s' % mlp_train_accuracy)
print('- MCC: %s' % mlp_train_mcc)
print('- F1 score: %s' % mlp_train_f1)
print('----------------------------------')
print('Model performance for Test set')
print('- Accuracy: %s' % mlp_test_accuracy)
print('- MCC: %s' % mlp_test_mcc)
print('- F1 score: %s' % mlp_test_f1)

Model performance for Training set
- Accuracy: 0.9918200408997955
- MCC: 0.9836699249289926
- F1 score: 0.9918205199162735
----------------------------------
Model performance for Test set
- Accuracy: 0.934560327198364
- MCC: 0.8694245748054071
- F1 score: 0.9346019875697238


# Support vector machine (Radial basis function kernel)

In [17]:
from sklearn.svm import SVC

# Create and train the SVM classifier with RBF kernel
svm_rbf = SVC(gamma=2, C=1)
svm_rbf.fit(X_train_tfidf, y_train)

# Make predictions
y_train_pred = svm_rbf.predict(X_train_tfidf)
y_test_pred = svm_rbf.predict(tfidf_vect.transform(X_test))

# Training set performance
svm_rbf_train_accuracy = accuracy_score(y_train, y_train_pred)
svm_rbf_train_mcc = matthews_corrcoef(y_train, y_train_pred)
svm_rbf_train_f1 = f1_score(y_train, y_train_pred, average='weighted')

# Test set performance
svm_rbf_test_accuracy = accuracy_score(y_test, y_test_pred)
svm_rbf_test_mcc = matthews_corrcoef(y_test, y_test_pred)
svm_rbf_test_f1 = f1_score(y_test, y_test_pred, average='weighted')

print('Model performance for Training set')
print('- Accuracy: %s' % svm_rbf_train_accuracy)
print('- MCC: %s' % svm_rbf_train_mcc)
print('- F1 score: %s' % svm_rbf_train_f1)
print('----------------------------------')
print('Model performance for Test set')
print('- Accuracy: %s' % svm_rbf_test_accuracy)
print('- MCC: %s' % svm_rbf_test_mcc)
print('- F1 score: %s' % svm_rbf_test_f1)

Model performance for Training set
- Accuracy: 0.9986366734832992
- MCC: 0.9972759961766186
- F1 score: 0.9986366367265247
----------------------------------
Model performance for Test set
- Accuracy: 0.9243353783231084
- MCC: 0.8520646375719324
- F1 score: 0.9243752542765614


# Naive Bayes

In [18]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, matthews_corrcoef, f1_score

# Create and train the Naive Bayes classifier
naive_bayes = MultinomialNB()
naive_bayes.fit(X_train_tfidf, y_train)

# Make predictions
y_train_pred = naive_bayes.predict(X_train_tfidf)
y_test_pred = naive_bayes.predict(tfidf_vect.transform(X_test))

# Training set performance
naive_bayes_train_accuracy = accuracy_score(y_train, y_train_pred)
naive_bayes_train_mcc = matthews_corrcoef(y_train, y_train_pred)
naive_bayes_train_f1 = f1_score(y_train, y_train_pred, average='weighted')

# Test set performance
naive_bayes_test_accuracy = accuracy_score(y_test, y_test_pred)
naive_bayes_test_mcc = matthews_corrcoef(y_test, y_test_pred)
naive_bayes_test_f1 = f1_score(y_test, y_test_pred, average='weighted')

print('Model performance for Training set')
print('- Accuracy: %s' % naive_bayes_train_accuracy)
print('- MCC: %s' % naive_bayes_train_mcc)
print('- F1 score: %s' % naive_bayes_train_f1)
print('----------------------------------')
print('Model performance for Test set')
print('- Accuracy: %s' % naive_bayes_test_accuracy)
print('- MCC: %s' % naive_bayes_test_mcc)
print('- F1 score: %s' % naive_bayes_test_f1)



Model performance for Training set
- Accuracy: 0.9706884798909339
- MCC: 0.9414860284894111
- F1 score: 0.9706813922050835
----------------------------------
Model performance for Test set
- Accuracy: 0.9100204498977505
- MCC: 0.8210115644271592
- F1 score: 0.9097130493697667


# Build Stacked model

In [19]:
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression

# Define a list of estimators
estimator_list = [
    ('svm_rbf', svm_rbf),
    ('dt', dt),
    ('rf', rf),
    ('mlp', mlp),
    ('naive_bayes', naive_bayes)
]

# Build the stacked model with a final Logistic Regression estimator
stack_model = StackingClassifier(
    estimators=estimator_list, final_estimator=LogisticRegression()
)

# Train the stacked model
stack_model.fit(X_train_tfidf, y_train)

# Make predictions
y_train_pred = stack_model.predict(X_train_tfidf)
y_test_pred = stack_model.predict(tfidf_vect.transform(X_test))

# Training set model performance
stack_model_train_accuracy = accuracy_score(y_train, y_train_pred)
stack_model_train_mcc = matthews_corrcoef(y_train, y_train_pred)
stack_model_train_f1 = f1_score(y_train, y_train_pred, average='weighted')

# Test set model performance
stack_model_test_accuracy = accuracy_score(y_test, y_test_pred)
stack_model_test_mcc = matthews_corrcoef(y_test, y_test_pred)
stack_model_test_f1 = f1_score(y_test, y_test_pred, average='weighted')

print('Model performance for Training set')
print('- Accuracy: %s' % stack_model_train_accuracy)
print('- MCC: %s' % stack_model_train_mcc)
print('- F1 score: %s' % stack_model_train_f1)
print('----------------------------------')
print('Model performance for Test set')
print('- Accuracy: %s' % stack_model_test_accuracy)
print('- MCC: %s' % stack_model_test_mcc)
print('- F1 score: %s' % stack_model_test_f1)

Model performance for Training set
- Accuracy: 0.9993183367416496
- MCC: 0.998637106019357
- F1 score: 0.9993183278695598
----------------------------------
Model performance for Test set
- Accuracy: 0.9468302658486708
- MCC: 0.893335346823719
- F1 score: 0.9468302658486708


In [20]:
# Add Naive Bayes metrics to the dictionaries
acc_train_list = {
    'svm_rbf': svm_rbf_train_accuracy,
    'dt': dt_train_accuracy,
    'rf': rf_train_accuracy,
    'mlp': mlp_train_accuracy,
    'naive_bayes': naive_bayes_train_accuracy,  # Add Naive Bayes accuracy
    'stack': stack_model_train_accuracy
}

mcc_train_list = {
    'svm_rbf': svm_rbf_train_mcc,
    'dt': dt_train_mcc,
    'rf': rf_train_mcc,
    'mlp': mlp_train_mcc,
    'naive_bayes': naive_bayes_train_mcc,  # Add Naive Bayes MCC
    'stack': stack_model_train_mcc
}

f1_train_list = {
    'svm_rbf': svm_rbf_train_f1,
    'dt': dt_train_f1,
    'rf': rf_train_f1,
    'mlp': mlp_train_f1,
    'naive_bayes': naive_bayes_train_f1,  # Add Naive Bayes F1 score
    'stack': stack_model_train_f1
}


In [21]:
mcc_train_list

{'svm_rbf': 0.9972759961766186,
 'dt': 0.837202504494492,
 'rf': 0.9878030529094353,
 'mlp': 0.9836699249289926,
 'naive_bayes': 0.9414860284894111,
 'stack': 0.998637106019357}

In [22]:
acc_df = pd.DataFrame.from_dict(acc_train_list, orient='index', columns=['Accuracy'])
mcc_df = pd.DataFrame.from_dict(mcc_train_list, orient='index', columns=['MCC'])
f1_df = pd.DataFrame.from_dict(f1_train_list, orient='index', columns=['F1'])

# Concatenate the DataFrames into a single DataFrame
df = pd.concat([acc_df, mcc_df, f1_df], axis=1)

# Display the resulting DataFrame
print(df)

             Accuracy       MCC        F1
svm_rbf      0.998637  0.997276  0.998637
dt           0.912065  0.837203  0.911544
rf           0.993865  0.987803  0.993865
mlp          0.991820  0.983670  0.991821
naive_bayes  0.970688  0.941486  0.970681
stack        0.999318  0.998637  0.999318


In [23]:
import joblib

# Save the stacking model to a pickle file
joblib.dump(stack_model, 'stacked_model.pkl')

# Verify that the model has been saved
print("Stacking model saved to stacked_model.pkl")


Stacking model saved to stacked_model.pkl


In [24]:
import matplotlib.pyplot as plt

# Increase the size of the plots
fig, axes = plt.subplots(nrows=3, ncols=1, figsize=(10, 15))

# Plot Accuracy
axes[0].bar(df.index, df['Accuracy'], color='skyblue')
axes[0].set_xlabel('Classifier')
axes[0].set_ylabel('Accuracy')
axes[0].set_title('Accuracy Comparison')
axes[0].tick_params(axis='x', rotation=45)

# Plot MCC
axes[1].bar(df.index, df['MCC'], color='lightgreen')
axes[1].set_xlabel('Classifier')
axes[1].set_ylabel('MCC')
axes[1].set_title('MCC Comparison')
axes[1].tick_params(axis='x', rotation=45)

# Plot F1-score
axes[2].bar(df.index, df['F1'], color='lightcoral')
axes[2].set_xlabel('Classifier')
axes[2].set_ylabel('F1-score')
axes[2].set_title('F1-score Comparison')
axes[2].tick_params(axis='x', rotation=45)

# Adjust spacing between plots
plt.tight_layout()

# Show the plots
plt.show()




ModuleNotFoundError: No module named 'matplotlib'

In [None]:
# Load the model from the pickle file
loaded_model = joblib.load('stacked_model.pkl')

# The comment you want to make a prediction for
comment = ["subscribe here"]

# Transform the comment into a TF-IDF vector
comment_tfidf = tfidf_vect.transform(comment)

# Use the loaded model to make predictions
predictions = loaded_model.predict(comment_tfidf)

# Print the predictions
print(predictions)


[1]


In [None]:
import joblib

# Save the stacking model to a pickle file
joblib.dump(tfidf_vect, 'tfidf_vect.pkl')

# Verify that the model has been saved
print("tfidf_vect saved to tfidf_vect.pkl")

tfidf_vect saved to tfidf_vect.pkl
