# Fine-tuning for classification

## Preparing the dataset

In [2]:
import requests
import zipfile
import os
from pathlib import Path

# The SMS Spam Collection archive is fetched from the UCI dataset archive.
url = "https://archive.ics.uci.edu/static/public/228/sms+spam+collection.zip"
zip_path = "sms_spam_collection.zip"  # local file the archive is downloaded to
extracted_path = "sms_spam_collection"  # directory the archive is unpacked into
# Final location of the data file, inside the extraction directory.
data_file_path = Path(extracted_path, "SMSSpamCollection.tsv")


def download_and_unzip_spam_data(url, zip_path, extracted_path, data_file_path):
    """Download the spam data archive, unzip it, and give the data file a .tsv name.

    Nothing is downloaded when ``data_file_path`` already exists, so the cell
    can be re-run without hitting the network again.

    Args:
        url: Address of the zip archive.
        zip_path: Local path the downloaded archive is written to.
        extracted_path: Directory the archive is extracted into.
        data_file_path: Final location of the data file (``str`` or ``Path``).

    Raises:
        requests.exceptions.RequestException: If the request fails or the
            server answers with an error status.
    """
    # Accept plain strings as well as Path objects.
    data_file_path = Path(data_file_path)
    if data_file_path.exists():
        print(f"{data_file_path} already exists. Skipping download and extraction.")
        return

    # Stream the archive to disk in chunks. With stream=True the connection is
    # only released once the response is closed, so use it as a context manager
    # to release it even when the status check or a write fails midway.
    with requests.get(url, stream=True, timeout=60) as response:
        response.raise_for_status()
        with open(zip_path, "wb") as out_file:
            for chunk in response.iter_content(chunk_size=8192):
                if chunk:  # skip empty chunks
                    out_file.write(chunk)

    # Unzipping the archive
    with zipfile.ZipFile(zip_path, "r") as zip_ref:
        zip_ref.extractall(extracted_path)

    # The extracted file is named without an extension; add .tsv
    # (tab-separated values) by renaming it to the requested path.
    original_file_path = Path(extracted_path) / "SMSSpamCollection"
    os.rename(original_file_path, data_file_path)
    print(f"Archive downloaded and saved as {data_file_path}")


# Try the primary location first; if the request fails or times out,
# retry once against the backup mirror.
try:
    download_and_unzip_spam_data(url, zip_path, extracted_path, data_file_path)
except (TimeoutError, requests.exceptions.RequestException) as e:
    print(f"Primary URL failed: {e}. Trying backup URL...")
    url = (
        "https://f001.backblazeb2.com/file/LLMs-from-scratch/sms%2Bspam%2Bcollection.zip"
    )
    download_and_unzip_spam_data(url, zip_path, extracted_path, data_file_path)



sms_spam_collection/SMSSpamCollection.tsv already exists. Skipping download and extraction.


In [3]:
import pandas as pd

In [5]:
# The file is tab-separated and has no header row, so the column names are supplied here.
df = pd.read_csv(
    data_file_path,
    sep="\t",
    header=None,
    names=["Label", "Text"],
)
df.head()


Unnamed: 0,Label,Text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [None]:
df.shape
# 5572 rows and 2 columns (Label, Text)
# NOTE(review): this comment previously said ~5574 rows; presumably read_csv's default
# quote handling merges a couple of lines. Confirm before relying on the exact row count.

(5572, 2)

In [None]:
# Number of messages per label; shows how imbalanced the two classes are.
df["Label"].value_counts()



Label
ham     4825
spam     747
Name: count, dtype: int64

In [9]:
# Per-label summary of the Text column (count, unique, top, freq).
df.groupby("Label").describe()

Unnamed: 0_level_0,Text,Text,Text,Text
Unnamed: 0_level_1,count,unique,top,freq
Label,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
ham,4825,4516,"Sorry, I'll call later",30
spam,747,653,Please call our customer service representativ...,4


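Balance the dataset by undersampling the majority class (ham) so that both classes contain the same number of messages; this keeps the example simple. To read more about model evaluation, see https://arxiv.org/pdf/1811.12808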

In [24]:
def create_balanced_dataset(df, random_state=123):
    """Return a class-balanced frame by undersampling the larger class.

    All rows of the smaller class are kept; the larger class is randomly
    sampled down to the same number of rows. The smaller class comes first in
    the result, followed by the sampled rows. Original index labels are kept.

    Args:
        df: DataFrame with a "Label" column containing "spam" and "ham".
        random_state: Seed for the sampling. The default of 123 keeps the
            results comparable to the book/youtube video.

    Returns:
        A new DataFrame with equally many "spam" and "ham" rows.
    """
    # Separate the spam and ham messages
    spam_messages = df[df["Label"] == "spam"]
    ham_messages = df[df["Label"] == "ham"]

    # Decide which class has to shrink. On a tie, ham is treated as the
    # majority, which only shuffles its rows.
    if len(spam_messages) > len(ham_messages):
        minority, majority = ham_messages, spam_messages
    else:
        minority, majority = spam_messages, ham_messages

    majority_sample = majority.sample(n=len(minority), random_state=random_state)
    return pd.concat([minority, majority_sample])

# Build the balanced frame and check that both labels now have the same count.
balanced_df = create_balanced_dataset(df)
balanced_df["Label"].value_counts()

Label
spam    747
ham     747
Name: count, dtype: int64

In [25]:
# Change the labels to 0 and 1
# ham -> 0
# spam -> 1
# Values that are not "ham"/"spam" (e.g. labels that were already encoded) pass
# through unchanged, so re-running this cell does not turn the column into NaN.
label_to_id = {"ham": 0, "spam": 1}
balanced_df["Label"] = balanced_df["Label"].map(lambda label: label_to_id.get(label, label))
print(balanced_df.head())
print(balanced_df.tail())


    Label                                               Text
2       1  Free entry in 2 a wkly comp to win FA Cup fina...
5       1  FreeMsg Hey there darling it's been 3 week's n...
8       1  WINNER!! As a valued network customer you have...
9       1  Had your mobile 11 months or more? U R entitle...
11      1  SIX chances to win CASH! From 100 to 20,000 po...
      Label                                               Text
4707      0  Wow so healthy. Old airport rd lor. Cant thk o...
3293      0             Dear good morning how you feeling dear
1278      0              Dont put your phone on silent mode ok
4079      0                Gam gone after outstanding innings.
4468      0  She said,'' do u mind if I go into the bedroom...


In [37]:
def random_split(df, train_frac=0.7, val_frac=0.1, random_state=123):
    """Shuffle ``df`` and split it into training, validation, and test sets.

    The first ``train_frac`` of the shuffled rows form the training set, the
    next ``val_frac`` the validation set, and the remaining rows the test set.
    Split sizes are truncated to whole rows, so rounding leftovers end up in
    the test set.

    Args:
        df: DataFrame to split.
        train_frac: Fraction of rows for the training set, in [0, 1].
        val_frac: Fraction of rows for the validation set, in [0, 1].
        random_state: Seed for the shuffle (the default 123 reproduces the
            previously hard-coded seed).

    Returns:
        Tuple ``(train_df, val_df, test_df)``; the rows carry a fresh 0..n-1
        index from the shuffled frame.

    Raises:
        ValueError: If a fraction lies outside [0, 1] or the two fractions
            add up to more than 1.
    """
    if not 0 <= train_frac <= 1 or not 0 <= val_frac <= 1:
        raise ValueError("train_frac and val_frac must each be between 0 and 1")
    # Small tolerance so float sums such as 0.7 + 0.3 are not rejected by rounding noise.
    if train_frac + val_frac > 1 + 1e-9:
        raise ValueError("train_frac + val_frac must not exceed 1")

    # frac=1 samples every row (i.e. shuffles); drop=True discards the old index.
    shuffled = df.sample(frac=1, random_state=random_state).reset_index(drop=True)

    # Calculate the split indices
    n_rows = len(shuffled)
    train_end = int(train_frac * n_rows)
    val_end = train_end + int(n_rows * val_frac)

    # Split the data into training, validation, and test sets
    train_df = shuffled.iloc[:train_end]
    val_df = shuffled.iloc[train_end:val_end]
    test_df = shuffled.iloc[val_end:]

    return train_df, val_df, test_df


# 70% training, 10% validation, the remainder is the test set.
train_df, val_df, test_df = random_split(balanced_df, train_frac=0.7, val_frac=0.1)

# Show the first and last rows of each split, one frame after the other.
print(
    train_df.head(),
    train_df.tail(),
    val_df.head(),
    val_df.tail(),
    test_df.head(),
    test_df.tail(),
    sep="\n",
)

   Label                                               Text
0      1  U have a secret admirer who is looking 2 make ...
1      1  We tried to contact you re your reply to our o...
2      0             From tomorrow onwards eve 6 to 3 work.
3      0  I don know account details..i will ask my mom ...
4      0                         Just sleeping..and surfing
      Label                                               Text
1040      0  Good morning pookie pie! Lol hope I didn't wak...
1041      0                              I'm meeting Darren...
1042      0  I don't think I can get away for a trek that l...
1043      0                Just haven't decided where yet eh ?
1044      1  BIG BROTHER ALERT! The computer has selected u...
      Label                                               Text
1045      0              Give me a sec to think think about it
1046      0  Yup but it's not giving me problems now so may...
1047      1  As a valued customer, I am pleased to advise y...
1048      

In [38]:
# Row/column counts of the three splits, one per line.
print(train_df.shape, val_df.shape, test_df.shape, sep="\n")





(1045, 2)
(149, 2)
(300, 2)


In [None]:
# Write each split to its own CSV file; index=False leaves the row index out of the files.
for split_frame, csv_name in (
    (train_df, "train_df.csv"),
    (val_df, "val_df.csv"),
    (test_df, "test_df.csv"),
):
    split_frame.to_csv(csv_name, index=False)

## Setting up the data loaders