1. **Imports & Dependencies:**

In [7]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

2. **Loading the Dataset:**

In [13]:
file_path = r"..\Assignment 1\SMSSpamCollection"
# Load the file as a tab-separated values (TSV) file
df = pd.read_csv(file_path, sep='\t', header=None, names=["Label", "Message"])

df.head()

Unnamed: 0,Label,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


**Encoding the Labels:**

In [14]:
print(df["Label"].value_counts())
df['Label'] = df['Label'].map({'ham': 0, 'spam': 1})
df.head()

Label
ham     4825
spam     747
Name: count, dtype: int64


Unnamed: 0,Label,Message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


- **Data Cleaning Steps:** If stopwords and punctuation removal are intended, a clear function should be defined for preprocessing text.

In [15]:
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    text = text.lower()  # Convert to lowercase
    text = text.translate(str.maketrans('', '', string.punctuation))  # Remove punctuation
    words = text.split()  # Tokenize
    words = [word for word in words if word not in stop_words and len(word) >1]  # Remove stopwords
    return " ".join(words)

In [16]:
df['Processed_Message'] = df['Message'].apply(preprocess_text)
df.head()

Unnamed: 0,Label,Message,Processed_Message
0,0,"Go until jurong point, crazy.. Available only ...",go jurong point crazy available bugis great wo...
1,0,Ok lar... Joking wif u oni...,ok lar joking wif oni
2,1,Free entry in 2 a wkly comp to win FA Cup fina...,free entry wkly comp win fa cup final tkts 21s...
3,0,U dun say so early hor... U c already then say...,dun say early hor already say
4,0,"Nah I don't think he goes to usf, he lives aro...",nah dont think goes usf lives around though


In [17]:
df_final = df[['Label', 'Processed_Message']].rename(columns={'Processed_Message': 'Message'})
df_final.drop_duplicates(inplace=True)
df_final.dropna().reset_index(drop=True)
df_final.head()

Unnamed: 0,Label,Message
0,0,go jurong point crazy available bugis great wo...
1,0,ok lar joking wif oni
2,1,free entry wkly comp win fa cup final tkts 21s...
3,0,dun say early hor already say
4,0,nah dont think goes usf lives around though


In [19]:
train, test = train_test_split(df_final, test_size=0.2, random_state=21)
train, val = train_test_split(train, test_size=0.25, random_state=21)
train.to_csv('train.csv', index=False)
val.to_csv('validation.csv', index=False)
test.to_csv('test.csv', index=False)

In [24]:
!git init

Initialized empty Git repository in E:/Sem 4/AML/AppliedMachineLearning/Assignment 2/.git/


In [25]:
!dvc init

Initialized DVC repository.

You can now commit the changes to git.

+---------------------------------------------------------------------+
|                                                                     |
|        DVC has enabled anonymous aggregate usage analytics.         |
|     Read the analytics documentation (and how to opt-out) here:     |
|             <https://dvc.org/doc/user-guide/analytics>              |
|                                                                     |
+---------------------------------------------------------------------+

What's next?
------------
- Check out the documentation: <https://dvc.org/doc>
- Get help and share ideas: <https://dvc.org/chat>
- Star us on GitHub: <https://github.com/iterative/dvc>


In [27]:
!dvc add train.csv
!dvc add validation.csv
!dvc add test.csv


To track the changes with git, run:

	git add train.csv.dvc

To enable auto staging, run:

	dvc config core.autostage true


⠋ Checking graph




To track the changes with git, run:

	git add validation.csv.dvc

To enable auto staging, run:

	dvc config core.autostage true


⠋ Checking graph




To track the changes with git, run:

	git add test.csv.dvc

To enable auto staging, run:

	dvc config core.autostage true


⠋ Checking graph



In [None]:
!git add .
!git commit -m "First data version with seed 21"
!dvc push  # If using remote storage

# Create a tag for easier reference
!git tag -a "data-v1" -m "First data version"
