In [132]:
import nltk
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import re


from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


In [133]:
# nltk.download('wordnet')
# nltk.download('stopwords')
# nltk.download('punkt')

In [134]:
df = pd.read_csv('spam.csv', encoding="ISO-8859-1")
df.head()


Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [135]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   v1          5572 non-null   object
 1   v2          5572 non-null   object
 2   Unnamed: 2  50 non-null     object
 3   Unnamed: 3  12 non-null     object
 4   Unnamed: 4  6 non-null      object
dtypes: object(5)
memory usage: 217.8+ KB


In [136]:
df = df.loc[:, ~df.columns.str.contains('^Unnamed')]

In [137]:
df.rename(columns={'v1': 'target', 'v2': 'text'}, inplace=True)
df.head()

Unnamed: 0,target,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [138]:
def clean_data(text):
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)# remove punctuation
    text = re.sub(r"http\S+|www\S+|https\S+", '', text, flags=re.MULTILINE)# remove URLs
    text = re.sub(r'\@\w+|\#','', text) # remove mentions and hashtags
    text = re.sub(r'\d+', '', text)  # remove numbers
    # lemmatizer = WordNetLemmatizer()
    stop_words = set(stopwords.words('english'))
    text = ' '.join([word for word in text.split() if word not in stop_words])
    ps = PorterStemmer()
    text = ' '.join([ps.stem(word) for word in text.split()])
    return text

In [139]:
df['text'] = df['text'].apply(clean_data)

In [140]:
df.head()

Unnamed: 0,target,text
0,ham,go jurong point crazi avail bugi n great world...
1,ham,ok lar joke wif u oni
2,spam,free entri wkli comp win fa cup final tkt st m...
3,ham,u dun say earli hor u c alreadi say
4,ham,nah dont think goe usf live around though


In [141]:
category_counts = df['target'].value_counts()
print(category_counts)

target
ham     4825
spam     747
Name: count, dtype: int64


In [142]:
mapper = {'ham': 0, 'spam': 1}
df['target'] = df['target'].map(mapper)
category_counts = df['target'].value_counts()
print(category_counts)

target
0    4825
1     747
Name: count, dtype: int64


In [144]:
df['num_characters'] = df['text'].apply(len)
df['num_words'] = df['text'].apply(lambda text: len(word_tokenize(text)))
df['num_sentences'] = df['text'].apply(lambda text: len(sent_tokenize(text)))

LookupError: 
**********************************************************************
  Resource [93mpunkt_tab[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('punkt_tab')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtokenizers/punkt_tab/english/[0m

  Searched in:
    - 'C:\\Users\\my/nltk_data'
    - 'c:\\Users\\my\\anaconda3\\nltk_data'
    - 'c:\\Users\\my\\anaconda3\\share\\nltk_data'
    - 'c:\\Users\\my\\anaconda3\\lib\\nltk_data'
    - 'C:\\Users\\my\\AppData\\Roaming\\nltk_data'
    - 'C:\\nltk_data'
    - 'D:\\nltk_data'
    - 'E:\\nltk_data'
**********************************************************************


In [None]:
vectorizer = TfidfVectorizer()

X_text = vectorizer.fit_transform(df['text'])
X_features = df[['num_characters', 'num_words', 'num_sentences']].values
X = np.hstack((X_text.toarray(), X_features))

KeyError: "None of [Index(['num_characters', 'num_words', 'num_sentences'], dtype='object')] are in the [columns]"

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, df['target'], test_size=0.2, random_state=42)

In [None]:
base_models = [
    ('dt', DecisionTreeClassifier(max_depth=5, random_state=42)),  # Decision Tree
    ('knn', KNeighborsClassifier(n_neighbors=5)),                 # KNN
    ('svm', SVC(probability=True, kernel='linear', random_state=42))  # SVM
]

# Define the meta-learner (final model)
meta_learner = LogisticRegression()

# Create the Stacking model
stacking_model = StackingClassifier(
    estimators=base_models,  # Base models
    final_estimator=meta_learner,  # Meta model
    passthrough=False,  # Use original features + predictions from base models
    cv=5  # Cross-validation
)

# Train the Stacking model on the training data
stacking_model.fit(X_train, y_train)

ValueError: could not convert string to float: 'n     n   h e   e   b   l l   h e r e         c h e c k   e   u   n       h l f   n k e'