In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the SMS spam dataset from the working directory.
# The file is decoded as latin-1 (presumably because it is not valid UTF-8 — TODO confirm).
spam_data = pd.read_csv("spam.csv", encoding='latin-1')

In [3]:
spam_data.head()

Unnamed: 0,Target,Text,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [4]:
# Label-based column slice: keep every column from the first one up to and
# including "Text" (.loc slices are inclusive on both ends). This drops the
# trailing "Unnamed: *" columns, which are empty in the preview above.
spam_data = spam_data.loc[:,:"Text"]

In [5]:
spam_data.head()

Unnamed: 0,Target,Text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [6]:
# Tiny sample (first 5 rows of the full dataset) as plain Python lists,
# convenient for quick experiments.
spam_data_small_text = spam_data["Text"].head().tolist()
spam_data_small_target = spam_data["Target"].head().tolist()

In [7]:
spam_data_small_text

['Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...',
 'Ok lar... Joking wif u oni...',
 "Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's",
 'U dun say so early hor... U c already then say...',
 "Nah I don't think he goes to usf, he lives around here though"]

In [8]:
spam_data_small_target

['ham', 'ham', 'spam', 'ham', 'ham']

In [None]:
# Bag of words approach

In [9]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [10]:
# Bag-of-words features. Both vectorizers share the same settings:
#   stop_words='english' -> drop common English function words ("the", "and", ...)
#   min_df=0.005         -> keep only terms that occur in at least 0.5% of the
#                           messages, which keeps the number of columns small.
# NOTE(review): both vectorizers are fitted on the full dataset, i.e. before the
# train/test split further down, so the vocabulary and IDF weights also see the
# rows that later end up in the test set.
messages = spam_data["Text"]

# Raw term counts. Stored under its own name so the count matrix is no longer
# silently overwritten by the TF-IDF matrix below.
cv = CountVectorizer(stop_words='english', min_df=0.005)
count_output = cv.fit_transform(messages)

# TF-IDF weighted features: this is the matrix the rest of the notebook uses.
tfidf = TfidfVectorizer(stop_words='english', min_df=0.005)
output = tfidf.fit_transform(messages)

In [11]:
# Dense feature table: one row per message, one column per vocabulary term.
# Column names are taken from `tfidf`, the vectorizer that produced `output`,
# rather than from `cv`, so the names and the values can never drift apart if
# the two vectorizers are ever configured differently.
# `output` is deliberately not rebound to the dense array: this cell can then
# be re-run without failing on a second `.toarray()` call.
columns = tfidf.get_feature_names_out()
X = pd.DataFrame(output.toarray(), columns=columns)

# Target labels, in the same row order as X.
y = spam_data["Target"]

In [None]:
# Create new features
# Number of digit characters in each message

In [12]:
def fn_count_digits(msg):
    """Return the number of characters in ``msg`` for which ``str.isdigit()`` is true."""
    digit_total = 0
    for ch in msg:
        if ch.isdigit():
            digit_total += 1
    return digit_total

# Digit count per message: the counting helper is applied element-wise to the text column.
count_digits = spam_data["Text"].apply(fn_count_digits)

In [13]:
# Append the per-message digit count as one extra feature column next to the
# vectorizer columns (pandas aligns the Series to X by index label on assignment).
X["count_digits"] = count_digits

In [None]:
# Train Test Split

In [14]:
from sklearn.model_selection import train_test_split

In [15]:
# Hold out 33% of the rows for evaluation. random_state pins the shuffle so the
# same rows land in train/test on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [None]:
# Model Building

In [16]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

In [17]:
# Random forest with default hyper-parameters. random_state is pinned (same seed
# as the train/test split) so that re-running the notebook rebuilds the same
# forest and therefore reproduces the same metrics and sample predictions.
clf = RandomForestClassifier(random_state=42)
clf.fit(X_train, y_train)

In [18]:
# Predicted labels for the held-out test rows.
preds = clf.predict(X_test)

In [19]:
# Per-class precision / recall / F1 on the held-out test set. The report is a
# pre-formatted string, so it is printed rather than displayed as a bare value.
print(classification_report(y_test, preds))

              precision    recall  f1-score   support

         ham       0.99      0.99      0.99      1587
        spam       0.97      0.94      0.95       252

    accuracy                           0.99      1839
   macro avg       0.98      0.97      0.97      1839
weighted avg       0.99      0.99      0.99      1839



In [None]:
# Testing on sample examples

In [20]:
# Two hand-written messages used as a quick sanity check of the trained model:
# the first reads like a prize/spam text, the second like an ordinary personal message.
test_samples = ["WINNER! Congratulations on winning a free gift. Send message to 58585 to redeem your gift",
                "Hey gimme a call me on 78996797 when you are free. Need to talk something important"]

In [21]:
test_samples

['WINNER! Congratulations on winning a free gift. Send message to 58585 to redeem your gift',
 'Hey gimme a call me on 78996797 when you are free. Need to talk something important']

In [22]:
# Build the feature rows for the samples the same way the training features were built.
# 1) TF-IDF weights from the already-fitted `tfidf` (transform only, never re-fit here).
test_samples_tfidf = tfidf.transform(test_samples)

# 2) Dense table whose column names come from the same vectorizer that produced
#    the values (`tfidf`, not `cv`), so names and values cannot get out of sync.
test_samples_output = pd.DataFrame(
    test_samples_tfidf.toarray(),
    columns=tfidf.get_feature_names_out(),
)

# 3) The hand-crafted digit-count feature, appended last as in the training table.
test_samples_output['count_digits'] = [fn_count_digits(msg) for msg in test_samples]

In [23]:
test_samples_output.shape

(2, 288)

In [24]:
# Hard class label for each sample.
test_samples_preds = clf.predict(test_samples_output)
# Class probabilities for each sample; the columns follow the order of clf.classes_.
test_samples_probs = clf.predict_proba(test_samples_output)

In [25]:
# Show the predicted labels first, then the per-class probabilities (one row per sample).
print(test_samples_preds)
print(test_samples_probs)

['spam' 'spam']
[[0.33625 0.66375]
 [0.42    0.58   ]]
