In [None]:
import numpy as np
import pandas as pd

In [None]:
# Load the raw dataset from the working directory, decoding the file as latin1.
df = pd.read_csv('kaggle dataset.csv', encoding='latin1')

In [None]:
df.info()

In [None]:
df.shape

In [None]:
df.head()

In [None]:
# Use a 1-based row index. Assigning a fresh RangeIndex (rather than
# `df.index + 1`) keeps the cell idempotent: re-running it no longer shifts
# the labels by one more each time.
df.index = pd.RangeIndex(start=1, stop=len(df) + 1)
df.head()

In [None]:
# Remove the columns listed below. Rebinding `df` with errors="ignore"
# (instead of an in-place drop) lets this cell be re-run without a KeyError
# once the columns are already gone.
unnecessary_columns = ["Unnamed: 2", "Unnamed: 3", "Unnamed: 4"]
df = df.drop(columns=unnecessary_columns, errors="ignore")

In [None]:
df.shape

In [None]:
df.info()

In [None]:
# Map the terse source column names to descriptive ones.
readable_columns = dict(v1="Label", v2="Content")
df = df.rename(columns=readable_columns)

In [None]:
df.info()

In [None]:
df.sample(5)

In [None]:
df.isnull().sum()

In [None]:
df.duplicated().sum()

In [None]:
# Remove fully duplicated rows; the first occurrence is the one retained
# (keep='first' is the pandas default).
df = df.drop_duplicates()

In [None]:
df.duplicated().sum()

In [None]:
df.info()

In [None]:
df.shape

In [None]:
# Preprocessing: convert labels to binary (0 for ham, 1 for spam).
# Already-encoded 0/1 values map to themselves, so re-running this cell is a
# no-op instead of turning every label into 0.
label_codes = {'ham': 0, 'spam': 1, 0: 0, 1: 1}
encoded_labels = df['Label'].map(label_codes)
if encoded_labels.isna().any():
    # Fail loudly rather than silently counting an unexpected label as ham.
    unexpected = df.loc[encoded_labels.isna(), 'Label'].unique().tolist()
    raise ValueError(f"Unexpected label values: {unexpected}")
df['Label'] = encoded_labels.astype('int64')

In [None]:
df.sample(5)

In [None]:
df.head()

In [None]:
# Number of rows per class (1 = spam, 0 = ham).
spam_count = df['Label'].eq(1).sum()
ham_count = df['Label'].eq(0).sum()

print("Total Spam Count: {}".format(spam_count))
print("Total Ham Count: {}".format(ham_count))

In [None]:
df['Label'].value_counts()

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Pie chart of the class shares; autopct="%0.2f" prints each share with two decimals.
colors = ['#00ff00', '#ff0000']
# value_counts() orders by frequency, not by label. Reindexing to [0, 1] pins
# the order so the counts always line up with the hard-coded HAM/SPAM labels
# and colors.
label_counts = df['Label'].value_counts().reindex([0, 1], fill_value=0)
plt.pie(label_counts, labels=['HAM', 'SPAM'], autopct="%0.2f", colors=colors)
plt.show()

In [None]:
# The data is not balanced: it leans heavily towards ham.

In [None]:
import nltk


In [None]:
!pip install nltk

In [None]:
# Tokenizer data used by nltk.sent_tokenize / nltk.word_tokenize.
nltk.download('punkt')
# Recent NLTK releases load the tokenizer from the 'punkt_tab' resource
# instead; request it as well so tokenization works on either version.
nltk.download('punkt_tab')

In [None]:
# Per-message length features computed from the raw 'Content' text:
# sentence count, token count and character count.
df['Sentence Count'] = df['Content'].map(lambda msg: len(nltk.sent_tokenize(msg)))
df['Word Count'] = df['Content'].map(lambda msg: len(nltk.word_tokenize(msg)))
df['Character Count'] = df['Content'].map(len)

In [None]:
df.info()

In [None]:
df.sample(5)

In [None]:
df[df['Label'] == 0][['Sentence Count','Word Count','Character Count']].describe()

In [None]:
df[df['Label'] == 1][['Sentence Count','Word Count','Character Count']].describe()

In [None]:
 df[['Sentence Count','Word Count','Character Count']].describe()

In [None]:
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Overlay the sentence-count histograms of ham (green) and spam (red).
plt.figure(figsize=(12, 8))
sns.histplot(df.loc[df['Label'] == 0, 'Sentence Count'], color='#00ff00')
sns.histplot(df.loc[df['Label'] == 1, 'Sentence Count'], color='#ff0000')

In [None]:
# Overlay the word-count histograms of ham (green) and spam (red).
plt.figure(figsize=(12, 4))
sns.histplot(df.loc[df['Label'] == 0, 'Word Count'], color='#00ff00')
sns.histplot(df.loc[df['Label'] == 1, 'Word Count'], color='#ff0000')

In [None]:
# Overlay the character-count histograms of ham (green) and spam (red).
plt.figure(figsize=(10, 4))
sns.histplot(df.loc[df['Label'] == 0, 'Character Count'], color='#00ff00')
sns.histplot(df.loc[df['Label'] == 1, 'Character Count'], color='#ff0000')

In [None]:
sns.pairplot(df,hue='Label')

In [None]:
# Pie chart of the class shares; autopct="%0.2f" prints each share with two decimals.
colors = ['#00ff00', '#ff0000']
# value_counts() orders by frequency, not by label. Reindexing to [0, 1] pins
# the order so the counts always line up with the hard-coded HAM/SPAM labels
# and colors.
label_counts = df['Label'].value_counts().reindex([0, 1], fill_value=0)
plt.pie(label_counts, labels=['HAM', 'SPAM'], autopct="%0.2f", colors=colors)
plt.show()

In [None]:
# Separate the DataFrame into two groups based on the 'Label' column
df_0 = df[df['Label'] == 0]  # rows with Label 0 (ham)
df_1 = df[df['Label'] == 1]  # rows with Label 1 (spam)

# Size of the smaller group: both classes are cut down to this many rows
n = min(len(df_0), len(df_1))

# Downsample without replacement. The fixed random_state makes the balanced
# sample -- and every result computed from it below -- reproducible.
df_0_balanced = df_0.sample(n, replace=False, random_state=42)
# Only class 0 used to be downsampled, which left the data unbalanced whenever
# class 1 was the larger group. A group that already has n rows is kept as is.
df_1_balanced = df_1 if len(df_1) == n else df_1.sample(n, replace=False, random_state=42)

# Concatenate the two equally sized groups; from here on `df` is the balanced frame
df_balanced = pd.concat([df_0_balanced, df_1_balanced])
df = df_balanced
# Print the balanced DataFrame
print(df_balanced)

In [None]:
df['Label'].value_counts()

In [None]:
# Pie chart of the class shares; autopct="%0.2f" prints each share with two decimals.
colors = ['#00ff00', '#ff0000']
# With equally sized classes the frequency-based order of value_counts() is a
# tie, so the hard-coded labels could end up on the wrong wedge. Reindexing to
# [0, 1] pins ham first and spam second.
label_counts = df['Label'].value_counts().reindex([0, 1], fill_value=0)
plt.pie(label_counts, labels=['HAM', 'SPAM'], autopct="%0.2f", colors=colors)
plt.show()

In [None]:
#3

In [None]:
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()
ps.stem('loving')

In [None]:
from nltk.corpus import stopwords
nltk.download('stopwords')


In [None]:
import string

def transform_text(text):
    """Normalize a raw message into a space-separated string of stems.

    The text is lower-cased and tokenized with ``nltk.word_tokenize``; only
    alphanumeric tokens that are not English stop words are kept, and each
    surviving token is reduced with the Porter stemmer ``ps``.

    Args:
        text: The raw message string.

    Returns:
        The stemmed tokens joined by single spaces (an empty string when no
        token survives the filters).
    """
    # Build the stop-word set once per call. The previous version called
    # stopwords.words('english') again for every single token and searched
    # the resulting list linearly.
    stop_words = set(stopwords.words('english'))

    tokens = nltk.word_tokenize(text.lower())

    # isalnum() already rejects tokens containing punctuation, so the separate
    # string.punctuation membership test could never exclude anything further.
    kept = [token for token in tokens if token.isalnum() and token not in stop_words]

    return " ".join(ps.stem(token) for token in kept)

In [None]:
#Test
# Positional lookup: after de-duplication and random downsampling the index
# label 69 is not guaranteed to exist, so df['Content'][69] could raise KeyError.
df['Content'].iloc[69]

In [None]:
# Positional lookup: the index label 69 may have been removed by
# de-duplication or downsampling, which would raise KeyError.
transform_text(df['Content'].iloc[69])

In [None]:
# Store the normalized (stop-word-free, stemmed) version of every message.
df['Revised Content'] = df['Content'].map(transform_text)

In [None]:
df.head(5)

In [None]:
!pip3 install wordcloud

In [None]:
from wordcloud import WordCloud
wc = WordCloud(width=500,height=500,min_font_size=10,background_color='white')

In [None]:
spam_wc = wc.generate(df[df['Label'] == 1]['Revised Content'].str.cat(sep=" "))

In [None]:
plt.figure(figsize=(15,6))
plt.imshow(spam_wc)

In [None]:
from collections import Counter

In [None]:
# Join every preprocessed spam message ('Revised Content' where 'Label' == 1)
spam_text = df.loc[df['Label'] == 1, 'Revised Content'].str.cat(sep=" ")

# Token frequencies over all spam messages
word_counts = Counter(spam_text.split())

# The ten most frequent tokens as (word, count) pairs
top_words = word_counts.most_common(10)

# Unzip the pairs into parallel sequences for plotting
words, frequencies = zip(*top_words)

# Bar plot of the frequencies
fig, ax = plt.subplots(figsize=(15, 6))
ax.bar(words, frequencies, color='skyblue')
ax.set_xlabel('Words')
ax.set_ylabel('Frequency')
ax.set_title('Top 10 Words in Spam Text')
ax.tick_params(axis='x', labelrotation=45)
plt.show()

In [None]:
# Build the ham cloud on its own WordCloud instance. wc.generate() returns the
# `wc` object itself, so reusing `wc` here made ham_wc and spam_wc the very
# same object and replaced the spam cloud with the ham one.
ham_wc = WordCloud(width=500, height=500, min_font_size=10, background_color='white').generate(
    df[df['Label'] == 0]['Revised Content'].str.cat(sep=" ")
)
plt.figure(figsize=(15,6))
plt.imshow(ham_wc)

In [None]:
# Join every preprocessed ham message ('Revised Content' where 'Label' == 0)
ham_text = df[df['Label'] == 0]['Revised Content'].str.cat(sep=" ")

# Create a Counter object to count word frequencies
word_counts = Counter(ham_text.split())

# Get the most common words and their frequencies
top_words = word_counts.most_common(10)  # You can adjust the number based on your preference

# Extract words and frequencies for plotting
words, frequencies = zip(*top_words)

# Create a bar plot
plt.figure(figsize=(15, 6))
plt.bar(words, frequencies, color='skyblue')
plt.xlabel('Words')
plt.ylabel('Frequency')
# This chart shows ham tokens; the title previously said "Spam" (copy-paste slip).
plt.title('Top 10 Words in Ham Text')
plt.xticks(rotation=45)
plt.show()

In [None]:
#4

In [None]:
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
cv = CountVectorizer()
tfidf = TfidfVectorizer(max_features=3000)

In [None]:
X = tfidf.fit_transform(df['Revised Content']).toarray()

In [None]:
X.shape

In [None]:
y = df['Label'].values

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Split the preprocessed text first, then fit TF-IDF on the training part only.
# Fitting the vectorizer on the full corpus before splitting lets test-set
# vocabulary and document frequencies leak into the training features.
# The same test_size / random_state yields the same row partition as before.
text_train, text_test, y_train, y_test = train_test_split(
    df['Revised Content'], y, test_size=0.2, random_state=2
)
X_train = tfidf.fit_transform(text_train).toarray()
X_test = tfidf.transform(text_test).toarray()

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB,MultinomialNB,BernoulliNB
from sklearn.metrics import accuracy_score,confusion_matrix,precision_score, classification_report

In [None]:
# Baseline classifiers compared below.
# random_state pins the decision tree's internal randomness so its scores are
# repeatable from run to run; the other estimators are used with defaults.
dt = DecisionTreeClassifier(max_depth=5, random_state=2)
knn = KNeighborsClassifier()
gnb = GaussianNB()
mnb = MultinomialNB()
bnb = BernoulliNB()

In [None]:
# Fit `dt` on the training split and score it on the held-out split
dt.fit(X_train, y_train)
y_pred_dt = dt.predict(X_test)
cm_dt = confusion_matrix(y_test, y_pred_dt)
accuracy_dt = accuracy_score(y_test, y_pred_dt)

# Print accuracy, precision, and confusion matrix for Decision Trees
print("Decision Trees:")
print("Accuracy:", accuracy_dt)  # reuse the value computed above
print("Precision:", precision_score(y_test, y_pred_dt))
print("Confusion Matrix:")
print(cm_dt)

# Generate classification report for Decision Trees
report_dt = classification_report(y_test, y_pred_dt)
print("\nClassification Report for Decision Trees:\n", report_dt)

# Heatmap of the confusion matrix, titled like the other models' plots
plt.figure(figsize=(8, 6))
sns.heatmap(cm_dt, annot=True, fmt="d", cmap="Blues", xticklabels=['Ham', 'Spam'], yticklabels=['Ham', 'Spam'])
plt.title('Confusion Matrix - Decision Trees')
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.show()

In [None]:
# K-Nearest Neighbors (KNN): fit a default-configured model and score it on the test split
knn = KNeighborsClassifier().fit(X_train, y_train)
y_pred_knn = knn.predict(X_test)
accuracy_knn = accuracy_score(y_test, y_pred_knn)
cm_knn = confusion_matrix(y_test, y_pred_knn)

print("K-Nearest Neighbors:")
print("Accuracy:", accuracy_knn)
print("Precision:", precision_score(y_test, y_pred_knn))
print("Confusion Matrix:")
print(cm_knn)

report_knn = classification_report(y_test, y_pred_knn)
print("\nClassification Report for K-Nearest Neighbors:\n", report_knn)

# Confusion-matrix heatmap (rows: true label, columns: predicted label)
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    cm_knn, annot=True, fmt="d", cmap="Blues",
    xticklabels=['Ham', 'Spam'], yticklabels=['Ham', 'Spam'], ax=ax,
)
ax.set(title='Confusion Matrix - K-Nearest Neighbors', xlabel='Predicted Label', ylabel='True Label')
plt.show()

In [None]:
# Gaussian Naive Bayes (GNB): fit a default-configured model and score it on the test split
gnb = GaussianNB().fit(X_train, y_train)
y_pred_gnb = gnb.predict(X_test)
accuracy_gnb = accuracy_score(y_test, y_pred_gnb)
cm_gnb = confusion_matrix(y_test, y_pred_gnb)

print("Gaussian Naive Bayes:")
print("Accuracy:", accuracy_gnb)
print("Precision:", precision_score(y_test, y_pred_gnb))
print("Confusion Matrix:")
print(cm_gnb)

report_gnb = classification_report(y_test, y_pred_gnb)
print("\nClassification Report for Gaussian Naive Bayes:\n", report_gnb)

# Confusion-matrix heatmap (rows: true label, columns: predicted label)
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    cm_gnb, annot=True, fmt="d", cmap="Blues",
    xticklabels=['Ham', 'Spam'], yticklabels=['Ham', 'Spam'], ax=ax,
)
ax.set(title='Confusion Matrix - Gaussian Naive Bayes', xlabel='Predicted Label', ylabel='True Label')
plt.show()

In [None]:
# Multinomial Naive Bayes (MNB): fit a default-configured model and score it on the test split
mnb = MultinomialNB().fit(X_train, y_train)
y_pred_mnb = mnb.predict(X_test)
accuracy_mnb = accuracy_score(y_test, y_pred_mnb)
cm_mnb = confusion_matrix(y_test, y_pred_mnb)

print("Multinomial Naive Bayes:")
print("Accuracy:", accuracy_mnb)
print("Precision:", precision_score(y_test, y_pred_mnb))
print("Confusion Matrix:")
print(cm_mnb)

report_mnb = classification_report(y_test, y_pred_mnb)
print("\nClassification Report for Multinomial Naive Bayes:\n", report_mnb)

# Confusion-matrix heatmap (rows: true label, columns: predicted label)
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    cm_mnb, annot=True, fmt="d", cmap="Blues",
    xticklabels=['Ham', 'Spam'], yticklabels=['Ham', 'Spam'], ax=ax,
)
ax.set(title='Confusion Matrix - Multinomial Naive Bayes', xlabel='Predicted Label', ylabel='True Label')
plt.show()

In [None]:
# Bernoulli Naive Bayes (BNB): fit a default-configured model and score it on the test split
bnb = BernoulliNB().fit(X_train, y_train)
y_pred_bnb = bnb.predict(X_test)
accuracy_bnb = accuracy_score(y_test, y_pred_bnb)
cm_bnb = confusion_matrix(y_test, y_pred_bnb)

print("Bernoulli Naive Bayes:")
print("Accuracy:", accuracy_bnb)
print("Precision:", precision_score(y_test, y_pred_bnb))
print("Confusion Matrix:")
print(cm_bnb)

report_bnb = classification_report(y_test, y_pred_bnb)
print("\nClassification Report for Bernoulli Naive Bayes:\n", report_bnb)

# Confusion-matrix heatmap (rows: true label, columns: predicted label)
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    cm_bnb, annot=True, fmt="d", cmap="Blues",
    xticklabels=['Ham', 'Spam'], yticklabels=['Ham', 'Spam'], ax=ax,
)
ax.set(title='Confusion Matrix - Bernoulli Naive Bayes', xlabel='Predicted Label', ylabel='True Label')
plt.show()

In [None]:
import matplotlib.pyplot as plt

# Test accuracy of each baseline model, in the same order as `models`
accuracies = [accuracy_dt, accuracy_knn, accuracy_gnb, accuracy_mnb, accuracy_bnb]
models = ['Decision Trees', 'K-Nearest Neighbors', 'Gaussian Naive Bayes', 'Multinomial Naive Bayes', 'Bernoulli Naive Bayes']
colors = ['blue', 'green', 'orange', 'purple', 'red']

fig, ax = plt.subplots(figsize=(15, 12))
bars = ax.bar(models, accuracies, color=colors)

# One legend entry per bar
ax.legend(bars, models, loc='upper left')

ax.set_title('Model Comparison - Accuracy')
ax.set_xlabel('Models')
ax.set_ylabel('Accuracy')
ax.set_ylim(0.0, 1.0)  # accuracy lives in [0, 1]; show the full range
plt.show()


In [None]:
from sklearn.model_selection import GridSearchCV

# Define the hyperparameter grid with 10 values for each hyperparameter
# (10 x 10 x 10 = 1000 combinations, each evaluated with 5-fold CV)
param_grid = {
    'max_depth': [3, 5, 7, 9, 11, 13, 15, 17, 19, 21],
    'min_samples_split': [2, 5, 10, 15, 20, 25, 30, 35, 40, 45],
    'min_samples_leaf': [1, 2, 4, 6, 8, 10, 12, 14, 16, 18]
}

# Use GridSearchCV to find the best hyperparameters.
# n_jobs=-1 spreads the independent fits over all CPU cores; the search
# result is unaffected, only the wall-clock time.
grid_search = GridSearchCV(dt, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train, y_train)

# Get the best hyperparameters
best_params = grid_search.best_params_
print("Best Hyperparameters:", best_params)

# No need to use set_params, the best model is already fitted
best_dt = grid_search.best_estimator_
y_pred_best_dt = best_dt.predict(X_test)

# Evaluate the performance of the best model
accuracy_best_dt = accuracy_score(y_test, y_pred_best_dt)
precision_best_dt = precision_score(y_test, y_pred_best_dt)
cm_best_dt = confusion_matrix(y_test, y_pred_best_dt)

# Print evaluation metrics for the best model
print("\nBest Decision Tree Model:")
print("Accuracy:", accuracy_best_dt)
print("Precision:", precision_best_dt)
print("Confusion Matrix:")
print(cm_best_dt)

# Visualize the confusion matrix for the best model
plt.figure(figsize=(8, 6))
sns.heatmap(cm_best_dt, annot=True, fmt="d", cmap="Blues", xticklabels=['Ham', 'Spam'], yticklabels=['Ham', 'Spam'])
plt.title('Confusion Matrix (Best Model)')
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.show()


In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV

# K-Nearest Neighbors (KNN)
knn = KNeighborsClassifier()

# Define the hyperparameter grid for KNN
knn_param_grid = {
    'n_neighbors': [3, 5, 7, 9, 11],
    'weights': ['uniform', 'distance'],
    'p': [1, 2]  # 1 for Manhattan distance, 2 for Euclidean distance
}

# Use GridSearchCV to find the best hyperparameters for KNN.
# n_jobs=-1 runs the independent fits on all CPU cores; the selected
# parameters are unaffected, only the wall-clock time.
knn_grid_search = GridSearchCV(knn, knn_param_grid, cv=5, scoring='accuracy', n_jobs=-1)
knn_grid_search.fit(X_train, y_train)

# Get the best hyperparameters for KNN
best_knn_params = knn_grid_search.best_params_
print("Best Hyperparameters for K-Nearest Neighbors:", best_knn_params)

# Use the best KNN model for predictions
best_knn = knn_grid_search.best_estimator_
y_pred_best_knn = best_knn.predict(X_test)

# Evaluate the performance of the best KNN model
accuracy_best_knn = accuracy_score(y_test, y_pred_best_knn)
precision_best_knn = precision_score(y_test, y_pred_best_knn)
cm_best_knn = confusion_matrix(y_test, y_pred_best_knn)

# Print evaluation metrics for the best KNN model
print("\nBest K-Nearest Neighbors Model:")
print("Accuracy:", accuracy_best_knn)
print("Precision:", precision_best_knn)
print("Confusion Matrix:")
print(cm_best_knn)

# Visualize the confusion matrix for the best KNN model
plt.figure(figsize=(8, 6))
sns.heatmap(cm_best_knn, annot=True, fmt="d", cmap="Blues", xticklabels=['Ham', 'Spam'], yticklabels=['Ham', 'Spam'])
plt.title('Confusion Matrix - Best K-Nearest Neighbors Model')
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.show()


In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

# K-Nearest Neighbors (KNN)
knn = KNeighborsClassifier()

# Define the hyperparameter grid for KNN (wider n_neighbors range)
knn_param_grid = {
    'n_neighbors': [1, 3, 5, 7, 9, 11, 13, 15, 17, 19],
    'weights': ['uniform', 'distance'],
    'p': [1, 2]  # 1 for Manhattan distance, 2 for Euclidean distance
}

# Use GridSearchCV to find the best hyperparameters for KNN.
# n_jobs=-1 runs the independent fits on all CPU cores; the selected
# parameters are unaffected, only the wall-clock time.
knn_grid_search = GridSearchCV(knn, knn_param_grid, cv=5, scoring='accuracy', n_jobs=-1)
knn_grid_search.fit(X_train, y_train)

# Print the mean cross-validated accuracy for each hyperparameter combination
results = pd.DataFrame(knn_grid_search.cv_results_)
for index, row in results.iterrows():
    print("Parameters:", row['params'])
    print("Mean Accuracy:", row['mean_test_score'])
    print("=====================================")

# Get the best hyperparameters for KNN
best_knn_params = knn_grid_search.best_params_
print("\nBest Hyperparameters for K-Nearest Neighbors:", best_knn_params)

# Use the best KNN model for predictions
best_knn = knn_grid_search.best_estimator_
y_pred_best_knn = best_knn.predict(X_test)

# Evaluate the performance of the best KNN model
accuracy_best_knn = accuracy_score(y_test, y_pred_best_knn)
precision_best_knn = precision_score(y_test, y_pred_best_knn)
cm_best_knn = confusion_matrix(y_test, y_pred_best_knn)

# Print evaluation metrics for the best KNN model
print("\nBest K-Nearest Neighbors Model:")
print("Accuracy:", accuracy_best_knn)
print("Precision:", precision_best_knn)
print("Confusion Matrix:")
print(cm_best_knn)

# Visualize the confusion matrix for the best KNN model
plt.figure(figsize=(8, 6))
sns.heatmap(cm_best_knn, annot=True, fmt="d", cmap="Blues", xticklabels=['Ham', 'Spam'], yticklabels=['Ham', 'Spam'])
plt.title('Confusion Matrix - Best K-Nearest Neighbors Model')
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.show()

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

# X_train holds the TF-IDF training features (the vectorizer is capped at
# max_features=3000).
# Create a DecisionTreeClassifier instance; random_state pins the tree's
# internal randomness so the search and the final model are repeatable.
dt = DecisionTreeClassifier(random_state=2)

# Define the hyperparameter grid with 10 values for each hyperparameter
# (10 x 10 x 10 = 1000 combinations, each evaluated with 5-fold CV)
param_grid = {
    'max_depth': [3, 5, 7, 9, 11, 13, 15, 17, 19, 21],
    'min_samples_split': [2, 5, 10, 15, 20, 25, 30, 35, 40, 45],
    'min_samples_leaf': [1, 2, 4, 6, 8, 10, 12, 14, 16, 18]
}

# Use GridSearchCV to find the best hyperparameters.
# n_jobs=-1 spreads the independent fits over all CPU cores.
grid_search = GridSearchCV(dt, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train, y_train)

# Print the mean cross-validated accuracy for each hyperparameter combination
results = pd.DataFrame(grid_search.cv_results_)
for index, row in results.iterrows():
    print("Parameters:", row['params'])
    print("Mean Accuracy:", row['mean_test_score'])
    print("=====================================")

# Get the best hyperparameters
best_params = grid_search.best_params_
print("\nBest Hyperparameters:", best_params)

# GridSearchCV refits the winning configuration on the whole training set by
# default (refit=True), so the fitted best_estimator_ is used directly instead
# of building and training a second, separately randomized tree.
best_dt = grid_search.best_estimator_

# Make predictions on the test set
y_pred_best_dt = best_dt.predict(X_test)

# Evaluate the performance of the best model
accuracy_best_dt = accuracy_score(y_test, y_pred_best_dt)
precision_best_dt = precision_score(y_test, y_pred_best_dt)
cm_best_dt = confusion_matrix(y_test, y_pred_best_dt)

# Print evaluation metrics for the best model
print("\nBest Decision Tree Model:")
print("Accuracy:", accuracy_best_dt)
print("Precision:", precision_best_dt)
print("Confusion Matrix:")
print(cm_best_dt)

# Visualize the confusion matrix for the best model
plt.figure(figsize=(8, 6))
sns.heatmap(cm_best_dt, annot=True, fmt="d", cmap="Blues", xticklabels=['Ham', 'Spam'], yticklabels=['Ham', 'Spam'])
plt.title('Confusion Matrix (Best Model)')
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.show()


In [None]:
# Multinomial Naive Bayes (MNB): refit and re-score so its metrics are
# available for the comparison with the tuned models
mnb = MultinomialNB().fit(X_train, y_train)
y_pred_mnb = mnb.predict(X_test)
accuracy_mnb = accuracy_score(y_test, y_pred_mnb)
cm_mnb = confusion_matrix(y_test, y_pred_mnb)

print("Multinomial Naive Bayes:")
print("Accuracy:", accuracy_mnb)
print("Precision:", precision_score(y_test, y_pred_mnb))
print("Confusion Matrix:")
print(cm_mnb)

report_mnb = classification_report(y_test, y_pred_mnb)
print("\nClassification Report for Multinomial Naive Bayes:\n", report_mnb)

# Confusion-matrix heatmap (rows: true label, columns: predicted label)
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    cm_mnb, annot=True, fmt="d", cmap="Blues",
    xticklabels=['Ham', 'Spam'], yticklabels=['Ham', 'Spam'], ax=ax,
)
ax.set(title='Confusion Matrix - Multinomial Naive Bayes', xlabel='Predicted Label', ylabel='True Label')
plt.show()

In [None]:
import matplotlib.pyplot as plt

# Test accuracy per model, in the same order as `models`
accuracy_scores = [accuracy_mnb, accuracy_best_dt, accuracy_best_knn]

# Test precision per model, same order
precision_scores = [
    precision_score(y_test, y_pred_mnb),
    precision_best_dt,
    precision_best_knn,
]

# Display names of the compared models
models = ['Multinomial Naive Bayes', 'Best Decision Tree', 'Best K-Nearest Neighbors']

# Accuracy bar chart
fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(models, accuracy_scores, color=['blue', 'green', 'orange'])
ax.set_title('Model Comparison - Accuracy')
ax.set_xlabel('Models')
ax.set_ylabel('Accuracy')
ax.set_ylim(0, 1)  # full [0, 1] range so the bars are comparable
plt.show()


In [None]:
# Precision bar chart for the same three models
fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(models, precision_scores, color=['blue', 'green', 'orange'])
ax.set_title('Model Comparison - Precision')
ax.set_xlabel('Models')
ax.set_ylabel('Precision')
ax.set_ylim(0, 1)  # full [0, 1] range so the bars are comparable
plt.show()

In [None]:
# `loaded_vectorizer` is only assigned in a later cell, so on a fresh
# top-to-bottom run this line raised NameError. `tfidf` is the vectorizer
# fitted earlier in this notebook and the one the models were trained with.
print("Vectorizer Vocabulary Size:", len(tfidf.vocabulary_))


In [None]:
from sklearn.feature_extraction.text import CountVectorizer
import joblib

# Function for text transformation
# NOTE(review): this re-defines the transform_text declared earlier in the
# notebook; the two definitions must be kept in sync.
def transform_text(text):
    """Normalize a raw message into a space-separated string of stems.

    The text is lower-cased and tokenized with ``nltk.word_tokenize``; only
    alphanumeric tokens that are not English stop words are kept, and each
    surviving token is reduced with the Porter stemmer ``ps``.

    Args:
        text: The raw message string.

    Returns:
        The stemmed tokens joined by single spaces (an empty string when no
        token survives the filters).
    """
    # Build the stop-word set once per call. The previous version called
    # stopwords.words('english') again for every single token and searched
    # the resulting list linearly.
    stop_words = set(stopwords.words('english'))

    tokens = nltk.word_tokenize(text.lower())

    # isalnum() already rejects tokens containing punctuation, so the separate
    # string.punctuation membership test could never exclude anything further.
    kept = [token for token in tokens if token.isalnum() and token not in stop_words]

    return " ".join(ps.stem(token) for token in kept)

# Persist the TF-IDF vectorizer fitted in this session, then load it back for
# prediction. Nothing in this notebook wrote 'vectorizer.joblib' before it was
# read, and a stale file from another run would have a different vocabulary
# (same width, different columns) and silently misalign the features with the
# models trained above.
VECTORIZER_PATH = 'vectorizer.joblib'
joblib.dump(tfidf, VECTORIZER_PATH)
loaded_vectorizer = joblib.load(VECTORIZER_PATH)

# df_balanced is the frame `df` was rebound to, so it already carries the
# preprocessed 'Revised Content' column; only vectorization is needed here
X_text = loaded_vectorizer.transform(df_balanced['Revised Content'])

# Decision Tree model
prediction_dt = best_dt.predict(X_text)

# K-Nearest Neighbors model
prediction_knn = best_knn.predict(X_text)

# Multinomial Naive Bayes model
prediction_mnb = mnb.predict(X_text)

# Function to preprocess and predict
def predict_spam_or_ham(text):
    """Classify one raw message with the three trained models.

    The message is normalized with ``transform_text``, vectorized with
    ``loaded_vectorizer`` and passed to ``best_dt``, ``best_knn`` and ``mnb``.

    Args:
        text: The raw message string.

    Returns:
        A dict with the keys 'DecisionTree', 'KNearestNeighbors' and
        'MultinomialNaiveBayes', each mapped to 'Spam' or 'Ham'.
    """
    features = loaded_vectorizer.transform([transform_text(text)])

    def _verdict(model):
        # A predicted class of 1 means spam; anything else is reported as ham.
        if model.predict(features)[0] == 1:
            return 'Spam'
        return 'Ham'

    return {
        'DecisionTree': _verdict(best_dt),
        'KNearestNeighbors': _verdict(best_knn),
        'MultinomialNaiveBayes': _verdict(mnb),
    }

# Example usage: classify one sample message and print each model's verdict
text_to_predict = "Check out this amazing offer! You've won a prize!"
predictions = predict_spam_or_ham(text_to_predict)

print("Text:", text_to_predict)
print("Predictions:")
for model, result in predictions.items():
    print("{}: {}".format(model, result))
