# Assignment 2
## Preparation
### Muhammed Jassim
### MDS202220

In [1]:
import pandas as pd

import string
from nltk.corpus import stopwords
from nltk.tokenize import  word_tokenize
from nltk.stem import PorterStemmer

from sklearn.model_selection import train_test_split

from prettytable import PrettyTable

### Loading data

In [2]:
def load_data(file_path):
    """Read the CSV at ``file_path`` and return its contents as a pandas DataFrame."""
    return pd.read_csv(file_path)

# Read the raw e-mail dataset from ./data/emails.csv and preview its first 10 rows.
data = load_data('./data/emails.csv')
data.head(10)

Unnamed: 0,text,spam
0,Subject: naturally irresistible your corporate...,1
1,Subject: the stock trading gunslinger fanny i...,1
2,Subject: unbelievable new homes made easy im ...,1
3,Subject: 4 color printing special request add...,1
4,"Subject: do not have money , get software cds ...",1
5,"Subject: great nnews hello , welcome to medzo...",1
6,Subject: here ' s a hot play in motion homela...,1
7,Subject: save your money buy getting this thin...,1
8,Subject: undeliverable : home based business f...,1
9,Subject: save your money buy getting this thin...,1


### Preprocessing

In [3]:
def preprocess_data(data):
    """Clean the ``text`` column and return the result as a new DataFrame.

    Every text goes through the following steps, in order: lowercasing,
    removal of all characters in ``string.punctuation``, NLTK word
    tokenization, removal of NLTK's English stop words, Porter stemming,
    and re-joining of the remaining tokens with single spaces.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a ``text`` column of strings. It is left unmodified, so
        re-running the calling cell never sees half-processed input.

    Returns
    -------
    pandas.DataFrame
        A copy of ``data`` whose ``text`` column holds the cleaned text; all
        other columns are carried over unchanged.

    Notes
    -----
    ``word_tokenize`` and ``stopwords.words`` rely on NLTK data packages
    being available locally (presumably ``punkt`` and ``stopwords`` --
    confirm for the installed NLTK version).
    """
    # Row-independent helpers are built once instead of once per row.
    strip_punctuation = str.maketrans('', '', string.punctuation)
    stop_words = set(stopwords.words('english'))
    stemmer = PorterStemmer()

    def clean(text):
        # Punctuation is stripped before tokenizing, so tokens are plain words.
        tokens = word_tokenize(text.lower().translate(strip_punctuation))
        # Stop words are dropped first; only the surviving tokens are stemmed.
        return ' '.join(stemmer.stem(token) for token in tokens if token not in stop_words)

    # Work on a copy so the caller's frame keeps its original text.
    cleaned = data.copy()
    cleaned['text'] = cleaned['text'].apply(clean)
    return cleaned

# Rebinds `data` to the preprocessed frame: from here on data['text'] holds the
# stemmed, space-joined tokens rather than the raw e-mail text.
data = preprocess_data(data)

### Train/validation/test splitting

In [4]:
def split_data(data, rand_state, test_size=0.2, validation_size=0.2, stratify_col=None):
    """Split ``data`` into train, validation and test subsets.

    The test set is carved out of ``data`` first; the validation set is then
    carved out of what remains. ``validation_size`` is therefore a fraction
    of the non-test rows, not of the full dataset (with the defaults:
    0.2 * 0.8 = 16% of all rows).

    Parameters
    ----------
    data : pandas.DataFrame
        The full dataset to split.
    rand_state : int
        Seed passed as ``random_state`` to both splits, making them repeatable.
    test_size : float, default 0.2
        Share of ``data`` held out as the test set.
    validation_size : float, default 0.2
        Share of the remaining rows held out as the validation set.
    stratify_col : str or None, default None
        Name of a column whose class proportions should be preserved in every
        subset. ``None`` keeps the original purely random behaviour.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train, validation, test)``.
    """
    def _labels(frame):
        # train_test_split treats stratify=None as "do not stratify".
        return frame[stratify_col] if stratify_col is not None else None

    train, test = train_test_split(
        data, test_size=test_size, random_state=rand_state, stratify=_labels(data)
    )
    train, validation = train_test_split(
        train, test_size=validation_size, random_state=rand_state, stratify=_labels(train)
    )
    return train, validation, test

# First version of the splits, produced with random seed 42.
train, validation, test = split_data(data, 42)

### Storing split data

In [5]:
def store_splits(train, validation, test, output_dir='./data'):
    """Write the three splits to ``train.csv``, ``validation.csv`` and ``test.csv``.

    Parameters
    ----------
    train, validation, test : pandas.DataFrame
        The subsets to persist.
    output_dir : str or path-like, default './data'
        Directory that receives the files. It is created (including parents)
        when it does not exist; existing files with the same names are
        overwritten.

    Returns
    -------
    None
    """
    # Local import keeps this cell runnable without touching the import cell.
    from pathlib import Path

    target = Path(output_dir)
    target.mkdir(parents=True, exist_ok=True)

    for name, frame in (('train', train), ('validation', validation), ('test', test)):
        # index=False: the row index is not written as an extra column.
        frame.to_csv(target / f'{name}.csv', index=False)

# Writes ./data/train.csv, ./data/validation.csv and ./data/test.csv.
store_splits(train, validation, test)

Now we have saved the train, validation and test data into separate `.csv` files in the `data/` folder.

### Initializing DVC and Git

In [6]:
# Initialise DVC without tying it to a source-control repository (--no-scm).
# NOTE(review): --f looks like an abbreviation of --force, which would recreate
# an existing .dvc/ directory -- confirm against the DVC CLI documentation.
!dvc init --no-scm --f
# Create the Git repository that the data files are committed to in the cells below.
!git init

Initialized DVC repository.

+---------------------------------------------------------------------+
|                                                                     |
|        DVC has enabled anonymous aggregate usage analytics.         |
|     Read the analytics documentation (and how to opt-out) here:     |
|             <https://dvc.org/doc/user-guide/analytics>              |
|                                                                     |
+---------------------------------------------------------------------+

What's next?
------------
- Check out the documentation: <https://dvc.org/doc>
- Get help and share ideas: <https://dvc.org/chat>
- Star us on GitHub: <https://github.com/iterative/dvc>
Initialized empty Git repository in E:/Post Graduation/Semester 4/Applied Machine Learning/Assignments/Assignment 2/.git/


### Track data files with Git

In [7]:
# Stage the raw data and the three split files.
# Forward slashes are accepted by Git on every platform, so the paths need no escaping.
!git add data/emails.csv data/train.csv data/validation.csv data/test.csv

The file will have its original line endings in your working directory


In [8]:
# First commit: the raw data plus the splits produced with random seed 42.
!git commit -m "Add raw data, train, test and validation data after splitting"

[master (root-commit) 39c7f28] Add raw data, train, test and validation data after splitting
 4 files changed, 11460 insertions(+)
 create mode 100644 data/emails.csv
 create mode 100644 data/test.csv
 create mode 100644 data/train.csv
 create mode 100644 data/validation.csv


In [9]:
# Check that the data files are committed; only non-data files should remain untracked.
!git status

On branch master
Untracked files:
  (use "git add <file>..." to include in what will be committed)
	.dvc/
	.dvcignore
	prepare.ipynb
	train.ipynb

nothing added to commit but untracked files present (use "git add" to track)


### Splitting data with a different random seed

In [10]:
# Second version of the splits: same preprocessed data, random seed 21 instead of 42.
train, validation, test = split_data(data, 21)

In [11]:
# Overwrite the split CSVs on disk with the seed-21 version.
store_splits(train, validation, test)

In [12]:
# The three split files should now be reported as modified relative to the first commit.
!git status

On branch master
Changes not staged for commit:
  (use "git add <file>..." to update what will be committed)
  (use "git restore <file>..." to discard changes in working directory)
	modified:   data/test.csv
	modified:   data/train.csv
	modified:   data/validation.csv

Untracked files:
  (use "git add <file>..." to include in what will be committed)
	.dvc/
	.dvcignore
	prepare.ipynb
	train.ipynb

no changes added to commit (use "git add" and/or "git commit -a")


In [13]:
# Stage the re-generated split files (emails.csv is listed too but is unchanged).
# Forward slashes are accepted by Git on every platform, so the paths need no escaping.
!git add data/emails.csv data/train.csv data/validation.csv data/test.csv

In [14]:
# Second commit: the splits produced with random seed 21.
!git commit -m "Updated split dataset with random state 21"

[master aea2c1c] Updated split dataset with random state 21
 3 files changed, 5671 insertions(+), 5671 deletions(-)
 rewrite data/test.csv (70%)
 rewrite data/validation.csv (71%)


In [15]:
# List every commit, one per line; the short hashes identify the two versions of the splits.
!git log --oneline --all

aea2c1c Updated split dataset with random state 21
39c7f28 Add raw data, train, test and validation data after splitting


### Target distribution

In [22]:
# Step back one commit to the first version of the splits (random seed 42).
# A relative ref is used instead of a commit hash because hashes differ every
# time the repository is rebuilt, which would break a fresh run of the notebook.
# Assumes HEAD is at the latest commit when this cell runs.
!git checkout HEAD~1

Note: switching to '39c7f28'.

You are in 'detached HEAD' state. You can look around, make experimental
changes and commit them, and you can discard any commits you make in this
state without impacting any branches by switching back to a branch.

If you want to create a new branch to retain commits you create, you may
do so (now or later) by using -c with the switch command. Example:

  git switch -c <new-branch-name>

Or undo this operation with:

  git switch -

Turn off this advice by setting config variable advice.detachedHead to false

HEAD is now at 39c7f28 Add raw data, train, test and validation data after splitting


In [23]:
# Load the three split files as they currently exist on disk (i.e. at the checked-out commit).
train_data, validation_data, test_data = map(
    pd.read_csv,
    ('data/train.csv', 'data/validation.csv', 'data/test.csv'),
)

In [24]:
# Class balance (spam == 0 vs. spam == 1) of each split at the checked-out commit.
# The rows are kept under their own names so that `data`, the preprocessed
# DataFrame used by the splitting cells, is not overwritten by a list.
header = ["Dataset", "0s", "1s"]
splits = [
    ("Training data", train_data),
    ("Validation data", validation_data),
    ("Test data", test_data),
]

table = PrettyTable(header)

for label, frame in splits:
    spam = frame['spam']
    # Vectorized counts; int() turns the NumPy integers into plain Python ints.
    table.add_row([label, int((spam == 0).sum()), int((spam == 1).sum())])

# Left-align every column.
for field in header:
    table.align[field] = 'l'

print(table)

+-----------------+------+-----+
| Dataset         | 0s   | 1s  |
+-----------------+------+-----+
| Training data   | 2797 | 868 |
| Validation data | 707  | 210 |
| Test data       | 856  | 290 |
+-----------------+------+-----+


In [25]:
# Return to whatever was checked out before the previous checkout, i.e. the
# latest commit holding the random-seed-21 splits (assuming the earlier cell
# was run from the branch tip). `-` avoids a hard-coded commit hash, which
# differs whenever the repository is rebuilt, and puts HEAD back on the branch
# instead of leaving it detached.
!git checkout -

Previous HEAD position was 39c7f28 Add raw data, train, test and validation data after splitting
HEAD is now at aea2c1c Updated split dataset with random state 21


In [26]:
# Reload the three split files; after the checkout above they hold the other version of the splits.
train_data, validation_data, test_data = map(
    pd.read_csv,
    ('data/train.csv', 'data/validation.csv', 'data/test.csv'),
)

In [27]:
# Class balance (spam == 0 vs. spam == 1) of each split at the checked-out commit.
# The rows are kept under their own names so that `data`, the preprocessed
# DataFrame used by the splitting cells, is not overwritten by a list.
header = ["Dataset", "0s", "1s"]
splits = [
    ("Training data", train_data),
    ("Validation data", validation_data),
    ("Test data", test_data),
]

table = PrettyTable(header)

for label, frame in splits:
    spam = frame['spam']
    # Vectorized counts; int() turns the NumPy integers into plain Python ints.
    table.add_row([label, int((spam == 0).sum()), int((spam == 1).sum())])

# Left-align every column.
for field in header:
    table.align[field] = 'l'

print(table)

+-----------------+------+-----+
| Dataset         | 0s   | 1s  |
+-----------------+------+-----+
| Training data   | 2785 | 880 |
| Validation data | 706  | 211 |
| Test data       | 869  | 277 |
+-----------------+------+-----+
