In [14]:
import numpy as np
import pandas as pd
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier


In [2]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Hp\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
data = pd.read_csv('./spam.csv')

In [16]:
data['Message'] = data['Message'].apply(lambda x: x.replace('\r\n' , ' '))

In [17]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Category  5572 non-null   object
 1   Message   5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [18]:
# Ruduce number of terms i.e steamer would be steam or running would be run
steamer = PorterStemmer()
# steamer.stem("Running")
corpus = []

stopwords_set = set(stopwords.words('english'))

for i in range(len(data)):
    text = data['Message'].iloc[i].lower()
    text = text.translate(str.maketrans('','',string.punctuation)).split()
    text = [steamer.stem(word) for word in text if word not in stopwords_set]
    text = ' '.join(text)
    corpus.append(text)

In [24]:
vectorizer = CountVectorizer()

X = vectorizer.fit_transform(corpus).toarray()
y = data.Category

X_train ,X_test, y_train , y_test = train_test_split(X , y ,test_size=0.2)
# X_train, X_test, y_train, y_test = train_test_split(data['Message'], data['Label'], test_size=0.3, random_state=42)  # Replace placeholders and add random state



In [25]:
clf = RandomForestClassifier(n_jobs=1)

clf.fit(X_train , y_train)

In [72]:
clf.score(X_test , y_test)

from sklearn.metrics import accuracy_score

y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)


Accuracy: 0.9856459330143541


In [76]:
emailToClassify = data.Message.values[8]
emailToClassify

'winner valu network custom select receivea £900 prize reward claim call 09061701461 claim code kl341 valid 12 hour'

In [77]:
emailText = emailToClassify.lower().translate(str.maketrans('' , '' , string.punctuation)).split()
emailText = [steamer.stem(word) for word in text if word not in stopwords_set]
emailText = ' '.join(emailText)

emailCorpus = [emailText]

XEmail = vectorizer.transform(emailCorpus)

In [78]:
clf.predict(XEmail)

array(['ham'], dtype='<U4')

In [82]:
data.Category.iloc[33]

'ham'

In [71]:
# Assuming emailToClassify is the new email content
emailText = emailToClassify.lower().translate(str.maketrans('', '', string.punctuation)).split()
emailText = [steamer.stem(word) for word in emailText if word not in stopwords_set]
emailText = ' '.join(emailText)
emailCorpus = [emailText]

XEmail = vectorizer.transform(emailCorpus)
prediction = clf.predict(XEmail)[0]  # Assuming you only want to classify one email

if prediction == 1:
  print("This email is classified as spam.")
else:
  print("This email is classified as non-spam.")


This email is classified as non-spam.


In [83]:
import pandas as pd
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Load data
data = pd.read_csv('./spam.csv')  # Replace "your_data.csv" with your actual file path

# Pre-processing
def preprocess_text(text):
  """
  Preprocesses text data for spam classification.

  Args:
      text: The text to preprocess.

  Returns:
      The preprocessed text.
  """
  text = text.lower()
  text = text.translate(str.maketrans('', '', string.punctuation)).split()
  stopwords_set = set(stopwords.words('english'))
  stemmer = PorterStemmer()
  text = [stemmer.stem(word) for word in text if word not in stopwords_set]
  return ' '.join(text)

data['Message'] = data['Message'].apply(preprocess_text)

# Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(data['Message'], data['Category'], test_size=0.3, random_state=42)

# Feature Extraction
vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

# Define Classifiers
classifiers = {
    "Random Forest": RandomForestClassifier(n_jobs=1),
    "Naive Bayes": MultinomialNB()
}

# Train Classifiers
for name, clf in classifiers.items():
  clf.fit(X_train, y_train)

# Evaluate Classifiers
for name, clf in classifiers.items():
  y_pred = clf.predict(X_test)
  accuracy = accuracy_score(y_test, y_pred)
  precision = precision_score(y_test, y_pred, pos_label="spam")  # Assuming "spam" is positive
  recall = recall_score(y_test, y_pred, pos_label="spam")
  f1 = f1_score(y_test, y_pred, pos_label="spam")

  print(f"\n{name} Accuracy:", accuracy)
  print(f"{name} Precision:", precision)
  print(f"{name} Recall:", recall)
  print(f"{name} F1-score:", f1)

# Classify a New Email
def classify_new_email(email_text):
  """
  Classifies a new email as spam or ham.

  Args:
      email_text: The text of the new email.

  Returns:
      A string indicating the classification ("spam" or "ham").
  """
  email_text = preprocess_text(email_text)
  email_corpus = [email_text]
  X_email = vectorizer.transform(email_corpus)

  # Choose the best performing classifier based on your evaluation results
  best_clf = classifiers["Random Forest"]  # Replace with the best performing classifier

  prediction = best_clf.predict(X_email)[0]
  if prediction == 1:  # Assuming "spam" is encoded as 1
    return "spam"
  else:
    return "ham"





Random Forest Accuracy: 0.9778708133971292
Random Forest Precision: 1.0
Random Forest Recall: 0.8348214285714286
Random Forest F1-score: 0.9099756690997567

Naive Bayes Accuracy: 0.9856459330143541
Naive Bayes Precision: 0.9629629629629629
Naive Bayes Recall: 0.9285714285714286
Naive Bayes F1-score: 0.9454545454545454


In [91]:
# Example Usage: Replace "your_new_email" with the actual email content
new_email = "fuck off man pay me 100 # 563 i will repay as s56oon as possuble."
classification = classify_new_email(new_email)
print(f"\nNew Email Classification: {classification}")


New Email Classification: ham
