In [None]:
pip install wordcloud nltk sklearn pandas matplotlib seaborn xgboost

Collecting xgboost
  Downloading xgboost-3.0.5-py3-none-win_amd64.whl.metadata (2.1 kB)
Downloading xgboost-3.0.5-py3-none-win_amd64.whl (56.8 MB)
   ---------------------------------------- 0.0/56.8 MB ? eta -:--:--
   ---------------------------------------- 0.5/56.8 MB 3.4 MB/s eta 0:00:17
   -- ------------------------------------- 4.2/56.8 MB 14.0 MB/s eta 0:00:04
   ------- -------------------------------- 10.5/56.8 MB 20.4 MB/s eta 0:00:03
   ------------ --------------------------- 17.8/56.8 MB 25.0 MB/s eta 0:00:02
   ------------------ --------------------- 26.7/56.8 MB 29.2 MB/s eta 0:00:02
   ---------------------- ----------------- 32.0/56.8 MB 29.0 MB/s eta 0:00:01
   ---------------------------- ----------- 40.4/56.8 MB 30.5 MB/s eta 0:00:01
   --------------------------------- ------ 47.7/56.8 MB 31.3 MB/s eta 0:00:01
   ---------------------------------------  56.4/56.8 MB 32.6 MB/s eta 0:00:01
   ---------------------------------------- 56.8/56.8 MB 31.8 MB/s eta 0:00


[notice] A new release of pip is available: 25.1.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [38]:
# Importing neccessary libraries
import numpy as np                  # For numerical operations
import matplotlib.pyplot as plt     # For plotting graphs
import pandas as pd                 # For data manipulation
%matplotlib inline

# Importing the wordcloud library
from wordcloud import WordCloud

# Importing NLTK libraries for natural language processing
import nltk
from nltk.corpus import stopwords # For stopwords

# Downloading NLTK data
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')





[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\PC\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\PC\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\PC\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [39]:
df = pd.read_csv('spam.csv') # Reading the CSV file
df.head() # Displaying the first few rows of the dataframe

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [40]:
df.drop(columns = ['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], inplace=True) # Dropping unnecessary columns
df.head() # Displaying the first few rows of the dataframe after dropping columns

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [41]:
df.rename(columns = {'v1':'target', 'v2':'text'}, inplace=True) # Renaming columns for better understanding
df.head() # Displaying the first few rows of the dataframe after renaming columns

Unnamed: 0,target,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


Data Preprocessing

In [42]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
df['target'] = le.fit_transform(df['target']) # Encoding target labels (ham:0, spam:1)
df.head() # Displaying the first few rows of the dataframe after encoding target labels

Unnamed: 0,target,text
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [43]:
df.duplicated().sum() # Checking for duplicate rows

np.int64(403)

In [44]:
df.shape # Displaying the shape of the dataframe after removing duplicates

(5572, 2)

In [45]:
df = df.drop_duplicates(keep='first') # Dropping duplicate rows
df.shape # Displaying the shape of the dataframe after removing duplicates

(5169, 2)

Feature Engineering

In [46]:
# Importing PorterStemmer for stemming
from nltk.stem.porter import PorterStemmer

# Importing String library
import string

# Creatingg an instance of PorterStemmer
ps = PorterStemmer()


In [47]:
# Lower case transfermation and text preprocessing function
def transform_text(text):
    
    text = text.lower() # Converting text to lowercase
    
    text = nltk.word_tokenize(text) # Tokenizing the text
    
    y = []
    
    for i in text:
        if i.isalnum(): # Removing special characters
            y.append(i)
    
    text = y[:]
    
    y.clear()
    
    for i in text:
        if i not in stopwords.words('english') and i not in string.punctuation: # Removing stopwords and punctuation
            y.append(i)
    
    text = y[:]
    y.clear()
    
    for i in text:
        y.append(ps.stem(i)) # Stemming the words
    
    return " ".join(y) # Joining the words back to form

In [48]:
transform_text("Hello everyone, I am learning Natural Language Processing. This is an example of text preprocessing!!!") # Testing the transform_text function

'hello everyon learn natur languag process exampl text preprocess'

In [49]:
df['transformed_text'] = df['text'].apply(transform_text) # Applying the transform_text function to the text column
df.head() # Displaying the first few rows of the dataframe after text transformation

Unnamed: 0,target,text,transformed_text
0,0,"Go until jurong point, crazy.. Available only ...",go jurong point crazi avail bugi n great world...
1,0,Ok lar... Joking wif u oni...,ok lar joke wif u oni
2,1,Free entry in 2 a wkly comp to win FA Cup fina...,free entri 2 wkli comp win fa cup final tkt 21...
3,0,U dun say so early hor... U c already then say...,u dun say earli hor u c alreadi say
4,0,"Nah I don't think he goes to usf, he lives aro...",nah think goe usf live around though


In [50]:
from sklearn.feature_extraction.text import CountVectorizer # For Bag of Words model
from sklearn.feature_extraction.text import TfidfVectorizer # For TF-IDF model

tfidf = TfidfVectorizer(max_features=500) # Creating an instance of TfidfVectorizer with a maximum of 500 features



In [51]:

x = tfidf.fit_transform(df['transformed_text']).toarray() # Fitting and transforming the transformed_text column
y = df['target'].values # Target variable

Train Test Split

In [52]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=2) # Splitting the data into training and testing sets

Model Training

In [55]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from xgboost import XGBClassifier

In [56]:
svc = SVC() # Creating an instance of SVC
knc = KNeighborsClassifier() # Creating an instance of KNeighborsClassifier
mnb = MultinomialNB() # Creating an instance of MultinomialNB
dtc = DecisionTreeClassifier() # Creating an instance of DecisionTreeClassifier
lrc = LogisticRegression() # Creating an instance of LogisticRegression
rfc = RandomForestClassifier() # Creating an instance of RandomForestClassifier
abc = AdaBoostClassifier() # Creating an instance of AdaBoostClassifier
gbdt = GradientBoostingClassifier() # Creating an instance of GradientBoostingClassifier
bc = BaggingClassifier() # Creating an instance of BaggingClassifier
etc = ExtraTreesClassifier() # Creating an instance of ExtraTreesClassifier
xgb = XGBClassifier() # Creating an instance of XGBClassifier

In [57]:
clfs = {
    'SVC': svc,
    'KNeighborsClassifier': knc,
    'MultinomialNB': mnb,
    'DecisionTreeClassifier': dtc,
    'LogisticRegression': lrc,
    'RandomForestClassifier': rfc,
    'AdaBoostClassifier': abc,
    'GradientBoostingClassifier': gbdt,
    'BaggingClassifier': bc,
    'ExtraTreesClassifier': etc,
    'XGBClassifier': xgb
}

Model Evaluation

In [58]:
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score

def train_classifier(clf, x_train, y_train, x_test, y_test):
    clf.fit(x_train, y_train) # Fitting the classifier on the training data
    y_pred = clf.predict(x_test) # Predicting the target variable on the test data
    accuracy = accuracy_score(y_test, y_pred) # Calculating accuracy
    precision = precision_score(y_test, y_pred) # Calculating precision
    recall = recall_score(y_test, y_pred) # Calculating recall
    f1 = f1_score(y_test, y_pred) # Calculating f1 score
    return accuracy, precision, recall, f1 # Returning the evaluation metrics

In [59]:
accuracy_scores = []
precision_scores = []
recall_scores = []
f1_scores = []
model_names = []

for name, clf in clfs.items():
    accuracy, precision, recall, f1 = train_classifier(clf, x_train, y_train, x_test, y_test) # Training the classifier and getting the evaluation metrics
    model_names.append(name) # Appending the model name
    accuracy_scores.append(accuracy) # Appending the accuracy score
    precision_scores.append(precision) # Appending the precision score
    recall_scores.append(recall) # Appending the recall score
    f1_scores.append(f1) # Appending the f1 score

performance_df = pd.DataFrame({'Model': model_names, 'Accuracy': accuracy_scores, 'Precision': precision_scores, 'Recall': recall_scores, 'F1 Score': f1_scores}) # Creating a dataframe to display the performance of each model
performance_df # Displaying the performance dataframe

Unnamed: 0,Model,Accuracy,Precision,Recall,F1 Score
0,SVC,0.974855,0.982759,0.826087,0.897638
1,KNeighborsClassifier,0.927466,1.0,0.456522,0.626866
2,MultinomialNB,0.970986,0.965517,0.811594,0.88189
3,DecisionTreeClassifier,0.950677,0.827068,0.797101,0.811808
4,LogisticRegression,0.966151,0.963964,0.775362,0.859438
5,RandomForestClassifier,0.972921,0.943548,0.847826,0.89313
6,AdaBoostClassifier,0.923598,0.873418,0.5,0.635945
7,GradientBoostingClassifier,0.96325,0.946429,0.768116,0.848
8,BaggingClassifier,0.959381,0.893443,0.789855,0.838462
9,ExtraTreesClassifier,0.975822,0.952,0.862319,0.904943
