# Assignment 1
## Preparation
### Muhammed Jassim
### MDS202220

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

### Loading data

In [2]:
def load_data(file_path):
    """Read a CSV file into a DataFrame.

    Parameters
    ----------
    file_path : str or file-like
        Location of the CSV file; handed straight to ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The parsed contents of the file.
    """
    return pd.read_csv(file_path)

# Read the raw e-mail dataset from disk and preview its first ten rows.
data = load_data('./data/emails.csv')
data.head(10)

Unnamed: 0,text,spam
0,Subject: naturally irresistible your corporate...,1
1,Subject: the stock trading gunslinger fanny i...,1
2,Subject: unbelievable new homes made easy im ...,1
3,Subject: 4 color printing special request add...,1
4,"Subject: do not have money , get software cds ...",1
5,"Subject: great nnews hello , welcome to medzo...",1
6,Subject: here ' s a hot play in motion homela...,1
7,Subject: save your money buy getting this thin...,1
8,Subject: undeliverable : home based business f...,1
9,Subject: save your money buy getting this thin...,1


In [3]:
data.shape

(5728, 2)

In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5728 entries, 0 to 5727
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    5728 non-null   object
 1   spam    5728 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 89.6+ KB


This dataset contains a collection of email text messages, each labeled as either spam or not spam. Every message has a binary label, where $1$ indicates that the email is spam and $0$ indicates that it is not spam. The dataframe `data` has two columns: *text* (`str`) and *spam* (`int`). There are $5728$ rows in the dataset.

In [5]:
data['spam'].value_counts()

0    4360
1    1368
Name: spam, dtype: int64

Our dataset contains $4360$ non-spam emails and $1368$ spam emails.

### Preprocessing

In [6]:
import string
from nltk.corpus import stopwords
from nltk.tokenize import  word_tokenize
from nltk.stem import PorterStemmer

def preprocess_data(data):
    """Normalise the ``text`` column of an e-mail DataFrame.

    Every message goes through the same steps, in this order:
    lowercasing, removal of ASCII punctuation, tokenisation with NLTK's
    ``word_tokenize``, removal of English stop words, Porter stemming, and
    finally re-joining the tokens into one space-separated string.

    The input frame is not modified: the work is done on a copy, so the raw
    text stays available to the caller and calling the function again on the
    same input gives the same result.

    Requires the NLTK tokenizer models and the ``stopwords`` corpus to be
    downloaded already.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a ``text`` column of strings. Other columns are carried
        over unchanged.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same index and columns, where ``text`` holds the
        preprocessed messages.
    """
    # Build the row-invariant helpers once instead of once per message.
    punctuation_table = str.maketrans('', '', string.punctuation)
    stop_words = set(stopwords.words('english'))
    stemmer = PorterStemmer()

    def clean(text):
        # Punctuation is stripped before tokenising, so tokens are split on
        # whitespace-separated words only.
        tokens = word_tokenize(text.lower().translate(punctuation_table))
        return ' '.join(stemmer.stem(token) for token in tokens if token not in stop_words)

    # Copy first: assigning into the caller's frame would overwrite the raw
    # text and raises SettingWithCopyWarning when `data` is a slice.
    processed = data.copy()
    processed['text'] = processed['text'].apply(clean)
    return processed

# NOTE: `data` is rebound to the preprocessed frame, so running this cell a second time
# would preprocess already-preprocessed text; restart the kernel and run all cells to reproduce.
data = preprocess_data(data)

### Train/validation/test splitting

In [7]:
def split_data(data, test_size=0.2, validation_size=0.2, random_state=42, stratify_on=None):
    """Split a DataFrame into train, validation and test partitions.

    The test partition is carved off first; the validation partition is then
    taken from the rows that remain. ``validation_size`` is therefore a
    fraction of the *remaining* rows, not of the whole frame: with the
    defaults the result is roughly 64% train, 16% validation and 20% test.

    Parameters
    ----------
    data : pandas.DataFrame
        Rows to partition.
    test_size : float or int, default 0.2
        ``test_size`` passed to ``train_test_split`` for the first split.
    validation_size : float or int, default 0.2
        ``test_size`` passed to ``train_test_split`` for the second split,
        applied to the rows left after removing the test partition.
    random_state : int or None, default 42
        Seed used for both shuffles so the split is reproducible.
    stratify_on : str or None, default None
        Name of a column (e.g. ``'spam'``) whose class proportions should be
        preserved in every partition. ``None`` gives a purely random split.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train, validation, test)``.
    """
    remainder, test = train_test_split(
        data,
        test_size=test_size,
        random_state=random_state,
        stratify=None if stratify_on is None else data[stratify_on],
    )
    train, validation = train_test_split(
        remainder,
        test_size=validation_size,
        random_state=random_state,
        stratify=None if stratify_on is None else remainder[stratify_on],
    )
    return train, validation, test

# Two successive 20% hold-outs: roughly 64% train, 16% validation, 20% test.
train, validation, test = split_data(data)

### Storing split data

In [8]:
def store_splits(train, validation, test, output_dir='./data'):
    """Write the three partitions to CSV files inside ``output_dir``.

    The files are named ``train.csv``, ``validation.csv`` and ``test.csv``.
    They are written without the DataFrame index, and existing files with the
    same names are overwritten.

    Parameters
    ----------
    train, validation, test : pandas.DataFrame
        The partitions to persist.
    output_dir : str or path-like, default './data'
        Destination folder. It is created (including parents) if missing.

    Returns
    -------
    None
    """
    from pathlib import Path  # local import keeps this cell self-contained

    target_dir = Path(output_dir)
    # Without this, to_csv raises OSError when the folder does not exist yet.
    target_dir.mkdir(parents=True, exist_ok=True)

    for name, frame in (('train', train), ('validation', validation), ('test', test)):
        frame.to_csv(target_dir / f'{name}.csv', index=False)

# Write the three partitions to train.csv, validation.csv and test.csv under ./data/.
store_splits(train, validation, test)

Now we have saved the train, validation and test data into separate `.csv` files in the `data/` folder.