In [1]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Sample dataset (spam=1, non-spam=0)
data = [
    ("Buy now and win a prize", 1),
    ("Congratulations, you won!", 1),
    ("Limited offer, act fast", 1),
    ("Normal email content", 0),
    ("Meeting scheduled for tomorrow", 0),
    ("Your invoice is attached", 0)
]

# Split data into texts and labels
texts, labels = zip(*data)

# Convert text data into TF-IDF features
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(texts)

# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, labels, test_size=0.3, random_state=42)

# Train the RandomForestClassifier
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)

# Make predictions
y_pred = clf.predict(X_test)

# Print accuracy
print("Accuracy:", accuracy_score(y_test, y_pred))

# Classify new text samples
new_texts = ["Buy now and win a prize", "Normal email content", "Congratulations, you won!"]
new_X = vectorizer.transform(new_texts)
predictions = clf.predict(new_X)

# Print predictions
for text, pred in zip(new_texts, predictions):
    print(f"'{text}' is classified as {'Spam' if pred == 1 else 'Not Spam'}")


Accuracy: 0.0
'Buy now and win a prize' is classified as Not Spam
'Normal email content' is classified as Not Spam
'Congratulations, you won!' is classified as Not Spam
