In [4]:
import re

import joblib
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split


In [6]:
# Read the email dataset from CSV; later cells read its 'text' column (email body)
# and its 'label' column (target).
csv_path = '/content/email_text.csv'  # Replace with your dataset path
data = pd.read_csv(csv_path)

# Preview the first few rows
preview = data.head()
print(preview)


   label                                               text
0      0  user id enrondlr pw bnawebescapenumber origina...
1      0  hi chris tonight we are rolling out a new repo...
2      0  rika r these new original message from thomas ...
3      0  john gerald we are currently trading under gtc...
4      0  gerald and stacy attached is a worksheet for a...


In [9]:
def preprocess_email(text):
    """Normalize raw email text into lowercase, letters-only, single-spaced text.

    Steps, in order: strip HTML tags, strip URLs, drop every character that is
    not an ASCII letter or whitespace, lowercase, and collapse whitespace.

    Args:
        text: Raw email body. ``None`` and float NaN (how pandas represents a
            missing cell) are treated as an empty email; any other non-string
            value is converted with ``str()`` before cleaning.

    Returns:
        str: The cleaned text, or ``''`` when nothing usable remains.
    """
    # Missing cells arrive as None/NaN; re.sub would raise TypeError on them.
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        text = str(text)

    # Remove HTML tags. `[^>]*` also spans newlines (which `.` does not), so
    # tags wrapped across several lines are stripped as well.
    text = re.sub(r'<[^>]*>', '', text)
    # Remove URLs
    text = re.sub(r'http\S+|www\S+', '', text)
    # Remove special characters and numbers
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Convert to lowercase
    text = text.lower()
    # Remove extra spaces
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# Run every raw email body in the 'text' column through preprocess_email and
# store the result in a new 'cleaned_text' column.
cleaned_series = data['text'].apply(preprocess_email)
data['cleaned_text'] = cleaned_series


In [10]:
# Initialize TF-IDF Vectorizer, capping the vocabulary at 5000 terms
vectorizer = TfidfVectorizer(max_features=5000)

# Transform the cleaned text into a TF-IDF matrix. The result is kept as the
# SciPy sparse matrix that fit_transform returns: densifying it with .toarray()
# would allocate rows x 5000 float64 values (about 2.2 GB for the ~55k emails
# this notebook reports), nearly all zeros, and the train_test_split and
# RandomForestClassifier calls in the following cells accept sparse input.
# NOTE(review): the vectorizer is fitted on the full dataset before the
# train/test split, so the vocabulary and IDF weights are learned from test
# rows too; fit on the training rows only for a leakage-free evaluation.
X = vectorizer.fit_transform(data['cleaned_text'])

# Target variable
y = data['label']


In [11]:
# Hold out 20% of the rows for testing; random_state is fixed so the split is repeatable
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

# Report the dimensions of both feature matrices
for caption, matrix in (("Training data shape:", X_train), ("Testing data shape:", X_test)):
    print(caption, matrix.shape)


Training data shape: (44060, 5000)
Testing data shape: (11015, 5000)


In [12]:
# Build a 100-tree random forest with a fixed seed and fit it on the training
# split in a single expression; fit() returns the estimator itself.
model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

print("Model training completed!")


Model training completed!


In [13]:
# Predict labels for the held-out test rows
y_pred = model.predict(X_test)

# Score the predictions against the true labels: overall accuracy first,
# then the per-class classification report.
test_accuracy = accuracy_score(y_test, y_pred)
report_text = classification_report(y_test, y_pred)
print("Accuracy:", test_accuracy)
print("Classification Report:\n", report_text)


Accuracy: 0.9740354062641852
Classification Report:
               precision    recall  f1-score   support

           0       0.97      0.98      0.98      6067
           1       0.98      0.97      0.97      4948

    accuracy                           0.97     11015
   macro avg       0.97      0.97      0.97     11015
weighted avg       0.97      0.97      0.97     11015



In [14]:
# Persist both fitted artifacts: the prediction cell reloads the vectorizer
# together with the model to turn new emails into features.
# (joblib is imported with the other imports in the first cell.)

# Save the model
joblib.dump(model, 'phishing_model.pkl')

# Save the vectorizer
joblib.dump(vectorizer, 'vectorizer.pkl')

print("Model and vectorizer saved!")


Model and vectorizer saved!


In [15]:
# Restore the persisted classifier and its TF-IDF vectorizer from disk.
# joblib.load unpickles arbitrary objects, so only load files you trust.
loaded_model, loaded_vectorizer = (
    joblib.load(artifact) for artifact in ('phishing_model.pkl', 'vectorizer.pkl')
)

# Sample message to classify
new_email = "Urgent! Update your account details to avoid suspension."

# Clean the message with the same preprocess_email used for the training text,
# then turn it into a dense one-row feature array.
cleaned_email = preprocess_email(new_email)
email_features = loaded_vectorizer.transform([cleaned_email]).toarray()

# Classify the message and print the predicted label
prediction = loaded_model.predict(email_features)
print("Prediction (1=Phishing, 0=Legitimate):", prediction[0])


Prediction (1=Phishing, 0=Legitimate): 0
