In [1]:
import pandas as pd
from collections import Counter
def load_data(file_path):
    """
    Load data from a CSV file into a pandas DataFrame, trying common encodings.
    
    Parameters:
    file_path (str): The path to the CSV file.
    
    Returns:
    pd.DataFrame: DataFrame containing the loaded data, or None if loading fails.
    """
    encodings_to_try = ['utf-8', 'latin1', 'ISO-8859-1', 'cp1252']
    
    for encoding in encodings_to_try:
        try:
            data = pd.read_csv(file_path, encoding=encoding)
            print(f"Successfully loaded data with encoding: {encoding}")
            return data
        except UnicodeDecodeError:
            print(f"Failed to load with encoding: {encoding}")
            continue
        except Exception as e:
            print(f"An unexpected error occurred: {e}")
            return None
            
    print("Could not load the file with any of the attempted encodings.")
    return None

# Attempt to load the data with the updated function
df = load_data('spam.csv')

# If successful, you can then display the DataFrame
if df is not None:
    print("\nDataFrame Head:")
    print(df.head())
    print("\nDataFrame Info:")
    df.info()

Successfully loaded data with encoding: utf-8

DataFrame Head:
   Spam                                            Message Unnamed: 2  \
0   ham  Go until jurong point, crazy.. Available only ...        NaN   
1   ham                      Ok lar... Joking wif u oni...        NaN   
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...        NaN   
3   ham  U dun say so early hor... U c already then say...        NaN   
4   ham  Nah I don't think he goes to usf, he lives aro...        NaN   

  Unnamed: 3 Unnamed: 4  
0        NaN        NaN  
1        NaN        NaN  
2        NaN        NaN  
3        NaN        NaN  
4        NaN        NaN  

DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Spam        5572 non-null   object
 1   Message     5572 non-null   object
 2   Unnamed: 2  50 non-null     object
 3   Unnamed: 3  12

In [2]:
df = df[["Spam", "Message"]]  # Adjusting the DataFrame to keep only relevant columns
df 

Unnamed: 0,Spam,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will �_ b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [3]:
import nltk 
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

import string 
import re

# Download necessary NLTK resources
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

stopwords = set(stopwords.words('english'))  # Get English stopwords
porterStemmer = PorterStemmer()  # Initialize the stemmer


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [4]:
def preprocess_text(text):

    tokens = word_tokenize(text.lower())  # Tokenize and convert to lowercase
    tokens = [word for word in tokens if word.isalpha()]  # Remove punctuation and non-alphabetic characters
    tokens = [token for token in tokens if token not in string.punctuation]  # Remove punctuation
    
    tokens = [token for token in tokens if token not in stopwords]  # Remove

    
    tokens = [porterStemmer.stem(token) for token in tokens]  # Stem the tokens
     
    processed_text = ' '.join(tokens)  # Join tokens back into a single string

    processed_text = re.sub(r'http\S+|www\S+|https\S+', '', processed_text, flags=re.MULTILINE)  # Remove URLs
    processed_text = re.sub(r'\@\w+|\#', '', processed_text)  # Remove mentions and hashtags
    processed_text = re.sub(r'\d+', '', processed_text)  # Remove digits
    processed_text = re.sub(r'\s+', ' ', processed_text).strip()  # Remove extra whitespace
    return processed_text




In [5]:
df['Processed_Message'] = df['Message'].apply(preprocess_text)  # Apply preprocessing to the 'Message' column

In [6]:
df

Unnamed: 0,Spam,Message,Processed_Message
0,ham,"Go until jurong point, crazy.. Available only ...",go jurong point crazi avail bugi n great world...
1,ham,Ok lar... Joking wif u oni...,ok lar joke wif u oni
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,free entri wkli comp win fa cup final tkt may ...
3,ham,U dun say so early hor... U c already then say...,u dun say earli hor u c alreadi say
4,ham,"Nah I don't think he goes to usf, he lives aro...",nah think goe usf live around though
...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,time tri contact u pound prize claim easi call...
5568,ham,Will �_ b going to esplanade fr home?,b go esplanad fr home
5569,ham,"Pity, * was in mood for that. So...any other s...",piti mood suggest
5570,ham,The guy did some bitching but I acted like i'd...,guy bitch act like interest buy someth els nex...


In [7]:
df['Spam'].value_counts()


Spam
ham     4825
spam     747
Name: count, dtype: int64

In [None]:
X = df['Processed_Message']  # Features
y = df['Spam'].map({'ham':0,'spam':1 })  # Target variable
from imblearn.over_sampling import RandomOverSampler

ros = RandomOverSampler(random_state=42)
X_resampled, y_resampled = ros.fit_resample(pd.DataFrame(X), y)  # Wrap X in a DataFrame if it's a Series

print(f"Original: {Counter(y)}, Resampled: {Counter(y_resampled)}")

Original: Counter({0: 4825, 1: 747}), Resampled: Counter({0: 4825, 1: 4825})


In [9]:
# !pip install -U scikit-learn imbalanced-learn
# !pip uninstall -y imbalance-learn
# !pip install -U imbalanced-learn

In [10]:
# import sklearn
# import imblearn

# print("scikit-learn version:", sklearn.__version__)
# print("imblearn version:", imblearn.__version__)


In [50]:
from sklearn.model_selection import train_test_split
# 4. Split the resampled data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

X_train.shape, X_test.shape, y_train.shape, y_test.shape

((7720, 1), (1930, 1), (7720,), (1930,))

In [51]:
model_name = 'bert-base-uncased'  # Example model name, adjust as needed

from transformers import BertTokenizer, BertForSequenceClassification
# 5. Load the BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=2)





Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [52]:
text = "Don't Like it !"

# inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=512)
tokens = tokenizer.tokenize(text)
print(tokens)

['don', "'", 't', 'like', 'it', '!']


In [53]:
bert_inputs = tokenizer(
    text,
    return_tensors='pt',
    padding=True,
    truncation=True,
    max_length=20
)

print(bert_inputs)

{'input_ids': tensor([[ 101, 2123, 1005, 1056, 2066, 2009,  999,  102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1]])}


In [54]:
tokens = tokenizer.decode(bert_inputs['input_ids'][0])#, skip_special_tokens=True)
print(tokens)

[CLS] don't like it! [SEP]


In [55]:
from keras.preprocessing.sequence import pad_sequences
# 6. Pad the sequences to the maximum length

input_ids = pad_sequences(bert_inputs['input_ids'], maxlen=20, padding='post', truncating='post')
token_type_ids = pad_sequences(bert_inputs['token_type_ids'], maxlen=20, padding='post', truncating='post')
attention_mask = pad_sequences(bert_inputs['attention_mask'], maxlen=20, padding='post', truncating='post')

In [56]:
print("Input IDs:", input_ids)
print("Token Type IDs:", token_type_ids)
print("Attention Mask:", attention_mask)

Input IDs: [[ 101 2123 1005 1056 2066 2009  999  102    0    0    0    0    0    0
     0    0    0    0    0    0]]
Token Type IDs: [[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]]
Attention Mask: [[1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0]]


In [63]:
max_length = 64  # Define the maximum length for padding
input_ids = [] 
attention_mask = []

for sent in X_train.iloc[:,0]:
    bert_inputs = tokenizer(
        sent,
        return_tensors='pt',
        padding=True,
        truncation=True,
        max_length=max_length
    )
    
    input_ids.append(bert_inputs['input_ids'])
    attention_mask.append(bert_inputs['attention_mask'])

In [64]:
len(input_ids), len(attention_mask)


(7720, 7720)

In [65]:
i = 0 
for input_id, att_mask in zip( input_ids, attention_mask):
    input_ids[i] = pad_sequences(input_id, maxlen=max_length, padding='post', truncating='post')
    attention_mask[i] = pad_sequences(att_mask, maxlen=max_length, padding='post', truncating='post')
    # print("Input ID:", input_id)
    # print("Attention Mask:", att_mask)
    # print()  # Print a newline for better readability
    i += 1

In [67]:
X_train.iloc[11,0],input_ids[11], attention_mask[11]

('wish great day moji told offer alway speechless offer easili go great length behalf stun exam next friday keep touch sorri',
 array([[  101,  4299,  2307,  2154,  9587,  4478,  2409,  3749,  2632,
          4576, 25146,  3749, 19413, 27572,  2072,  2175,  2307,  3091,
          6852, 24646,  2078, 11360,  2279,  5958,  2562,  3543,  2061,
         18752,   102,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,
             0]]),
 array([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]))

In [68]:
# convert input_ids and attention_mask to numpy arrays
import numpy as np
arr_input_ids = np.array(input_ids)
arr_attention_mask = np.array(attention_mask)

In [69]:
import torch

# Convert to torch tensors and squeeze the extra dimension
input_ids_tensor = torch.tensor([ids.squeeze(0) for ids in input_ids])
attention_mask_tensor = torch.tensor([mask.squeeze(0) for mask in attention_mask])
labels_tensor = torch.tensor(y_train.values)

  input_ids_tensor = torch.tensor([ids.squeeze(0) for ids in input_ids])


In [None]:
from transformers import BertTokenizer, BertForSequenceClassification
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
import numpy as np
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
# Create PyTorch DataLoaders
batch_size = 32

train_dataloader = DataLoader(X_train, sampler=X_resampled, batch_size=batch_size)

test_dataset = TensorDataset(input_ids_test, attention_masks_test, labels_test)
test_sampler = SequentialSampler(test_dataset)
test_dataloader = DataLoader(test_dataset, sampler=test_sampler, batch_size=batch_size)

print("\nBERT inputs prepared.")
print(f"Shape of input_ids_train: {input_ids_train.shape}")
print(f"Shape of labels_train: {labels_train.shape}")
print(f"Shape of input_ids_test: {input_ids_test.shape}")
print(f"Shape of labels_test: {labels_test.shape}")

NameError: name 'input_ids_train' is not defined

In [71]:
# this using Tensorflow dcepatered 
# from transformers import TFBertForSequenceClassification

# model_name = 'bert-base-uncased'  # Example model name, adjust as needed
# model = TFBertForSequenceClassification.from_pretrained(model_name, num_labels=2)

# Convert to torch tensors
from transformers import BertForSequenceClassification
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [74]:
from torch import nn 

class BertClassifier(nn.Module):
    def __init__(self, model_name, num_labels):
        super(BertClassifier, self).__init__()
        self.bert = BertForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)
    
    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        return outputs.logits

In [77]:
loss_fn = nn.CrossEntropyLoss()  # Define the loss function
optimizer = torch.optim.Adam(model.parameters(), lr=2e-5)  # Define the optimizer

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to("cpu")

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [78]:
epochs = 15  # Number of training epochs
for epoch in range(epochs):
    model.train()  # Set the model to training mode
    optimizer.zero_grad()  # Zero the gradients
    
    # Forward pass
    outputs = model(input_ids=input_ids_tensor.to(device), attention_mask=attention_mask_tensor.to(device))
    
    # Compute loss
    loss = loss_fn(outputs, labels_tensor.to(device))
    
    # Backward pass and optimization
    optimizer.zero_grad()  # Zero the gradients before backward pass
    loss.backward()
    optimizer.step()

    accuracy = (outputs.argmax(dim=1) == labels_tensor.to(device)).float().mean().item()  # Calculate accuracy
    print(f"Epoch {epoch + 1}/{epochs}, Loss: {loss.item()}, Accuracy: {accuracy:.4f}")
    
    with torch.inference_mode():
        model.eval()
        outputs = model(input_ids=input_ids_tensor.to(device), attention_mask=attention_mask_tensor.to(device))
        loss = loss_fn(outputs, labels_tensor.to(device))
        val_accuracy = (outputs.argmax(dim=1) == labels_tensor.to(device)).float().mean().item()
    # Print the loss for each epoch
        if (epoch ) % 3 == 0:
            print(f"Epoch {epoch + 1}/{epochs}, Loss: {loss.item()}, Validation Accuracy: {val_accuracy:.4f}")  
    # print(f"Epoch {epoch + 1}/{epochs}, Loss: {loss.item()}")  # Print the loss for each epoch


RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cpu and cuda:0! (when checking argument for argument index in method wrapper_CUDA__index_select)