In [13]:
import pandas as pd
from collections import Counter
def load_data(file_path):
    """
    Load data from a CSV file into a pandas DataFrame, trying common encodings.
    
    Parameters:
    file_path (str): The path to the CSV file.
    
    Returns:
    pd.DataFrame: DataFrame containing the loaded data, or None if loading fails.
    """
    encodings_to_try = ['utf-8', 'latin1', 'ISO-8859-1', 'cp1252']
    
    for encoding in encodings_to_try:
        try:
            data = pd.read_csv(file_path, encoding=encoding)
            print(f"Successfully loaded data with encoding: {encoding}")
            return data
        except UnicodeDecodeError:
            print(f"Failed to load with encoding: {encoding}")
            continue
        except Exception as e:
            print(f"An unexpected error occurred: {e}")
            return None
            
    print("Could not load the file with any of the attempted encodings.")
    return None

# Attempt to load the data with the updated function
df = load_data('spam.csv')

# If successful, you can then display the DataFrame
if df is not None:
    print("\nDataFrame Head:")
    print(df.head())
    print("\nDataFrame Info:")
    df.info()

Successfully loaded data with encoding: utf-8

DataFrame Head:
   Spam                                            Message Unnamed: 2  \
0   ham  Go until jurong point, crazy.. Available only ...        NaN   
1   ham                      Ok lar... Joking wif u oni...        NaN   
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...        NaN   
3   ham  U dun say so early hor... U c already then say...        NaN   
4   ham  Nah I don't think he goes to usf, he lives aro...        NaN   

  Unnamed: 3 Unnamed: 4  
0        NaN        NaN  
1        NaN        NaN  
2        NaN        NaN  
3        NaN        NaN  
4        NaN        NaN  

DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Spam        5572 non-null   object
 1   Message     5572 non-null   object
 2   Unnamed: 2  50 non-null     object
 3   Unnamed: 3  12

In [14]:
df = df[["Spam", "Message"]]  # Adjusting the DataFrame to keep only relevant columns
df 

Unnamed: 0,Spam,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will �_ b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [15]:
import nltk 
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

import string 
import re

# Download necessary NLTK resources
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

stopwords = set(stopwords.words('english'))  # Get English stopwords
porterStemmer = PorterStemmer()  # Initialize the stemmer


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [16]:
def preprocess_text(text):

    tokens = word_tokenize(text.lower())  # Tokenize and convert to lowercase
    tokens = [word for word in tokens if word.isalpha()]  # Remove punctuation and non-alphabetic characters
    tokens = [token for token in tokens if token not in string.punctuation]  # Remove punctuation
    
    tokens = [token for token in tokens if token not in stopwords]  # Remove

    
    tokens = [porterStemmer.stem(token) for token in tokens]  # Stem the tokens
     
    processed_text = ' '.join(tokens)  # Join tokens back into a single string

    processed_text = re.sub(r'http\S+|www\S+|https\S+', '', processed_text, flags=re.MULTILINE)  # Remove URLs
    processed_text = re.sub(r'\@\w+|\#', '', processed_text)  # Remove mentions and hashtags
    processed_text = re.sub(r'\d+', '', processed_text)  # Remove digits
    processed_text = re.sub(r'\s+', ' ', processed_text).strip()  # Remove extra whitespace
    return processed_text




In [17]:
df['Processed_Message'] = df['Message'].apply(preprocess_text)  # Apply preprocessing to the 'Message' column

In [18]:
df

Unnamed: 0,Spam,Message,Processed_Message
0,ham,"Go until jurong point, crazy.. Available only ...",go jurong point crazi avail bugi n great world...
1,ham,Ok lar... Joking wif u oni...,ok lar joke wif u oni
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,free entri wkli comp win fa cup final tkt may ...
3,ham,U dun say so early hor... U c already then say...,u dun say earli hor u c alreadi say
4,ham,"Nah I don't think he goes to usf, he lives aro...",nah think goe usf live around though
...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,time tri contact u pound prize claim easi call...
5568,ham,Will �_ b going to esplanade fr home?,b go esplanad fr home
5569,ham,"Pity, * was in mood for that. So...any other s...",piti mood suggest
5570,ham,The guy did some bitching but I acted like i'd...,guy bitch act like interest buy someth els nex...


In [19]:
df['Spam'].value_counts()


Spam
ham     4825
spam     747
Name: count, dtype: int64

In [28]:
X = df['Processed_Message']  # Features
y = df['Spam'].map({'ham':0,'spam':1 })  # Target variable
from sklearn.feature_extraction.text import TfidfVectorizer
# 1. Initialize TfidfVectorizer
# You can adjust parameters like max_features, min_df, max_df as needed
tfidf_vectorizer = TfidfVectorizer(max_features=5000) # Example: consider top 5000 words

# 2. Transform the text data into numerical TF-IDF features
X_tfidf = tfidf_vectorizer.fit_transform(X)

# Now X_tfidf is a sparse matrix of numerical features.
# SMOTE can work with sparse matrices.

from imblearn.over_sampling import SMOTE
# 3. Apply SMOTE to the numerical (TF-IDF) data
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_tfidf, y)

print(f"Resampled dataset shape: {Counter(y_resampled)}")


Resampled dataset shape: Counter({0: 4825, 1: 4825})


In [29]:
# !pip install -U scikit-learn imbalanced-learn
# !pip uninstall -y imbalance-learn
# !pip install -U imbalanced-learn

In [30]:
# import sklearn
# import imblearn

# print("scikit-learn version:", sklearn.__version__)
# print("imblearn version:", imblearn.__version__)


In [31]:
from sklearn.model_selection import train_test_split
# 4. Split the resampled data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

In [32]:
from keras.models import Sequential
from keras.layers import Dense, Dropout


model = Sequential()
model.add(Dense(128, activation='relu', input_shape=(X_train.shape[1],)))
model.add(Dropout(0.5))  # Add dropout for regularization
model.add(Dense(64, activation='relu'))
model.add(Dense(32, activation='relu'))  # Assuming 32 classes for the output layer
model.add(Dense(1, activation='sigmoid'))  # For binary classification (spam or not spam)

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [33]:
model.fit(X_train, y_train, epochs=15, batch_size=32, validation_data=(X_test, y_test))

Epoch 1/15
[1m242/242[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 9ms/step - accuracy: 0.7629 - loss: 0.4191 - val_accuracy: 0.9850 - val_loss: 0.0412
Epoch 2/15
[1m242/242[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 8ms/step - accuracy: 0.9881 - loss: 0.0286 - val_accuracy: 0.9902 - val_loss: 0.0246
Epoch 3/15
[1m242/242[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 8ms/step - accuracy: 0.9954 - loss: 0.0125 - val_accuracy: 0.9912 - val_loss: 0.0261
Epoch 4/15
[1m242/242[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 7ms/step - accuracy: 0.9965 - loss: 0.0074 - val_accuracy: 0.9896 - val_loss: 0.0298
Epoch 5/15
[1m242/242[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 9ms/step - accuracy: 0.9985 - loss: 0.0049 - val_accuracy: 0.9922 - val_loss: 0.0257
Epoch 6/15
[1m242/242[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 8ms/step - accuracy: 0.9980 - loss: 0.0043 - val_accuracy: 0.9834 - val_loss: 0.0569
Epoch 7/15
[1m242/242[0m 

<keras.src.callbacks.history.History at 0x23377cda590>

In [34]:
massage = "Congratulations! You've won a free ticket to Bahamas!"

# Preprocess the new message
new_message_processed = preprocess_text(massage)
new_message_tfidf = tfidf_vectorizer.transform([new_message_processed])
# Predict using the trained model
prediction = model.predict(new_message_tfidf)
if prediction[0][0] > 0.5:
    print("The message is classified as spam.")
else:
    print("The message is classified as ham (not spam).")
    

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 344ms/step
The message is classified as spam.


In [35]:
model.evaluate(X_test, y_test)

[1m61/61[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.9908 - loss: 0.0413


[0.04281359910964966, 0.9901554584503174]