# Nyrah Balabanian
## Assignment 1

In [31]:
#import packages for data preprocessing
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer


#import packages for tokenization
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

In [59]:
data = []

# Open the file in read mode
with open('SMSSpamCollection', 'r') as file:
    # Read the contents of the file
    for line in file:
        # Parse the line (by tab)
        parsed_line = line.strip().split('	')
        
        # Add the parsed data to the list
        data.append(parsed_line)

# Create a DataFrame from the list
df_raw = pd.DataFrame(data, columns=['label', 'text']) 



In [60]:
df = df_raw.copy()

In [4]:
df.head(10)

Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
5,spam,FreeMsg Hey there darling it's been 3 week's n...
6,ham,Even my brother is not like to speak with me. ...
7,ham,As per your request 'Melle Melle (Oru Minnamin...
8,spam,WINNER!! As a valued network customer you have...
9,spam,Had your mobile 11 months or more? U R entitle...


In [None]:
unique_values = df_raw['label'].unique()
unique_values

In [61]:
# Map the unique values to 0 and 1
mapping = {'ham': 0, 'spam': 1}
df['label'] = df['label'].map(mapping)

In [62]:
# Remove URLs
df['text'] = df['text'].str.replace(r'http[s]?://\S+', '', regex=True)

# Remove punctuation and numbers
df['text'] = df['text'].str.replace(r"[^a-zA-Z' ]", '', regex=True)

# Convert text to lower case
df['text'] = df['text'].str.lower()

In [36]:
# Download necessary NLTK data
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\scoop\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\scoop\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [63]:
# Tokenization function
def tokenize_text(text):
    tokens = word_tokenize(text)
    stemmer = PorterStemmer()
    stemmed_tokens = [stemmer.stem(token) for token in tokens]
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [word for word in stemmed_tokens if word not in stop_words]
    return filtered_tokens


In [64]:
y=df['label']
X=df['text']

In [65]:
# Create the TfidfVectorizer
vectorizer = TfidfVectorizer(tokenizer=tokenize_text, preprocessor=lambda x: x, lowercase=False)

# Fit the vectorizer to the training data
X_tfidf = vectorizer.fit_transform(X)



In [66]:
X_tfidf.shape

(5574, 7141)

In [None]:
X_tfidf.shape

In [41]:
X_tfidf.todense()[:5]

matrix([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]])

In [67]:
#tokenize data first before splitting the data

# Splitting the dataset into the Training set and Test set
X_temp, X_test, y_temp, y_test = train_test_split(X_tfidf, y, test_size = 0.2, random_state = 42)

# Splitting the Training set further into Training and Validation sets
X_train, X_val, y_train, y_val = train_test_split(X_temp, y_temp, test_size = 0.2, random_state = 42) 

In [68]:
#data distribution in a table
# Calculate the sizes of each split
train_size = X_train.shape[0]
val_size = X_val.shape[0]
test_size = X_test.shape[0]

# Create a DataFrame
data = {
    'Dataset': ['Training', 'Validation', 'Test'],
    'Number of Samples': [train_size, val_size, test_size]
}
data_distribution = pd.DataFrame(data)

# Display the DataFrame
data_distribution

Unnamed: 0,Dataset,Number of Samples
0,Training,3567
1,Validation,892
2,Test,1115


In [None]:
df.head(10)

In [None]:
df_raw.head(10)

In [None]:
X_train.shape

In [15]:
test = X_tfidf.todense()
print(test[:1].max())

0.3336893793070944


In [45]:
np.random.seed(42)
d = X_tfidf.shape[1]  # Number of features in X
w = np.random.randn(d, 1)  # Initialize weights
b = 0  # Initialize bias

In [46]:
def sigmoid(z):    
    #sigmoid outputs a number between 0 and 1
    return 1 / (1 + np.exp(-z))

In [47]:
from scipy import sparse

def compute_predictions(X, w, b):
    # Ensure that w is a dense array for matrix multiplication
    w_dense = w if isinstance(w, np.ndarray) else w.toarray()

    # Perform sparse matrix multiplication and add bias
    z = X.dot(w_dense) + b

    # Apply the sigmoid function
    return 1 / (1 + np.exp(-z))


In [48]:
def compute_loss(y, y_hat, w, lambda_reg):
    # Convert y to a numpy array if it's a Pandas Series
    if isinstance(y, pd.Series):
        y = y.values
    
    n = len(y)
    binary_cross_entropy_loss = -np.mean(y * np.log(y_hat) + (1 - y) * np.log(1 - y_hat))
    L2 = binary_cross_entropy_loss + lambda_reg * np.sum(w ** 2) / (2 * n)
    return L2


In [104]:
def compute_gradients(X_train, y_train, y_hat, w, lambda_reg):
    y_train_np = y_train.to_numpy() if isinstance(y_train, (pd.Series, pd.DataFrame)) else y_train
    y_hat = y_hat.flatten() if y_hat.ndim > 1 else y_hat
    n = len(y_train_np)

    # For sparse matrix multiplication, use the dot method of the sparse matrix
    dw = -(X_train.T.dot(y_train_np - y_hat)) / n 

    # Flatten w for the addition operation
    w_flattened = w.flatten() if w.ndim > 1 else w

    # Adding the regularization term
    dw += lambda_reg * w_flattened / n

    db = -np.mean(y_train_np - y_hat)

    return dw, db

In [110]:
def calculate_metrics(y_true, y_pred):
    true_positives = sum((y_pred == 1) & (y_true == 1))
    true_negatives = sum((y_pred == 0) & (y_true == 0))
    false_positives = sum((y_pred == 1) & (y_true == 0))
    false_negatives = sum((y_pred == 0) & (y_true == 1))

    accuracy = (true_positives + true_negatives) / len(y_true)
    precision = true_positives / (true_positives + false_positives) if true_positives + false_positives > 0 else 0
    recall = true_positives / (true_positives + false_negatives) if true_positives + false_negatives > 0 else 0
    f1 = 2 * (precision * recall) / (precision + recall) if precision + recall > 0 else 0

    return accuracy, precision, recall, f1

In [118]:
#Mini batch gradient descent
def mini_batch(X_train, y_train,lambda_reg):
    eta = 0.1  # Learning rate
    n_epochs = 1000
    m = X_train.shape[0]  # Number of instances
    #lambda_reg = 1  # Regularization parameter
    batch_size = 32  # Mini-batch size

    # Randomly initialize model parameters
    np.random.seed(42)
    w = np.random.randn(X_train.shape[1], 1)  # Initialize weights (excluding bias)
    b = np.random.randn()  # Initialize bias

    for epoch in range(n_epochs):
        shuffled_indices = np.random.permutation(m)
        X_train_shuffled = X_train[shuffled_indices]
        y_train_shuffled = y_train.iloc[shuffled_indices]


        for i in range(0, m, batch_size):
            xi = X_train_shuffled[i:i+batch_size]
            yi = y_train_shuffled[i:i+batch_size]

            # Matrix multiplication (ensure xi and w are compatible)
            # If w is dense and xi is sparse, you may need to adjust the operation
            preds = sigmoid(xi.dot(w) + b)  # Use dot for sparse-dense multiplication

            dw, db = compute_gradients(xi, yi, preds, w, lambda_reg)

            # Update weights and bias
            w -= eta * dw.reshape(w.shape)
            b -= eta * db

            val_preds = sigmoid(X_val.dot(w) + b).flatten()
            val_preds_binary = (val_preds >= 0.5).astype(int)  # Convert probabilities to binary predictions

            # Calculate metrics
            accuracy, precision, recall, f1 = calculate_metrics(y_val, val_preds_binary)

            # Compute loss on validation set
            val_loss = compute_loss(y_val, sigmoid(X_val.dot(w) + b), w, lambda_reg)

            test_preds = sigmoid(X_test.dot(w) + b).flatten()
            test_preds_binary = (test_preds >= 0.5).astype(int)  # Convert probabilities to binary predictions

            # Calculate metrics
            accuracy1, precision1, recall1, f1_1 = calculate_metrics(y_test, test_preds_binary)

            # Compute loss on validation set
            test_loss = compute_loss(y_test, sigmoid(X_test.dot(w) + b), w, lambda_reg)
        if epoch % 100 == 0:
            print(f"Epoch {epoch}, Validation Loss: {val_loss}, Accuracy: {accuracy}, Precision: {precision}, Recall: {recall}, F1 Score: {f1}")

        if epoch % 100 == 0:
            print(f"Epoch {epoch}, Test Loss: {test_loss}, Accuracy: {accuracy1}, Precision: {precision1}, Recall: {recall1}, F1 Score: {f1_1}")
            
    return accuracy, precision, recall, f1

        

In [107]:
# Stochastic gradient descent
n_epochs = 1000  # Number of epochs
m = X_train.shape[0]  # Number of instances
lambda_reg = 1  # Regularization parameter

# Randomly initialize model parameters
np.random.seed(42)
w = np.random.randn(X_train.shape[1], 1)  # Initialize weights
b = np.random.randn()  # Initialize bias

for epoch in range(n_epochs):
    shuffled_indices = np.random.permutation(m)
    X_train_shuffled = X_train[shuffled_indices]
    y_train_shuffled = y_train.iloc[shuffled_indices]

    for i in range(m):
        xi = X_train_shuffled[i:i+1]  # Selecting one instance
        yi = y_train_shuffled[i:i+1]

        # Compute predictions
        preds = sigmoid(xi.dot(w) + b)

        # Compute gradients
        dw, db = compute_gradients(xi, yi, preds, w, lambda_reg)

        # Update weights and bias
        w -= eta * dw.reshape(w.shape)
        b -= eta * db

    if epoch % 20 == 0:
        print(f"Epoch {epoch}, Loss: {compute_loss(y_train_shuffled, sigmoid(X_train_shuffled.dot(w) + b), w, lambda_reg)}")


Epoch 0, Loss: 0.39292883560295316
Epoch 20, Loss: 0.3927423981298147
Epoch 40, Loss: 0.3966798034430748
Epoch 60, Loss: 0.39081547925175697
Epoch 80, Loss: 0.39159573829601896


In [None]:
# Define a range of lambda values to test
lambda_values = [0.01, 0.1, 1, 10, 100]

# Number of folds for cross-validation
n_folds = 5

# Calculate the size of each fold
fold_size = X_train.shape[0] // n_folds

# Split the training data indices into k-folds
folds = [range(i * fold_size, (i + 1) * fold_size) for i in range(n_folds)]

# Initialize dictionary to store the average performance for each lambda
performance_dict = {lmbd: [] for lmbd in lambda_values}

# Perform k-fold cross-validation
for lambda_reg in lambda_values:
    for fold in range(n_folds):
        # Determine the indices for this fold
        val_indices = folds[fold]
        train_indices = [i for i in range(X_train.shape[0]) if i not in val_indices]

        # Extract training and validation sets from the indices
        X_train_fold = X_train[train_indices]
        y_train_fold = y_train.iloc[train_indices]
        X_val_fold = X_train[val_indices]
        y_val_fold = y_train.iloc[val_indices]

        # Train your model here with the given lambda_reg
        model = mini_batch(X_train_fold, y_train_fold, lambda_reg)

    
        # Store the performance metric
        performance_dict[lambda_reg].append(model)

# Calculate the average performance for each lambda
average_performance = {lmbd: sum(perfs) / len(perfs) for lmbd, perfs in performance_dict.items()}

# Select the best lambda
best_lambda = min(average_performance, key=average_performance.get)

# Retrain model on the entire training set using the best lambda
final_model = mini_batch(X_train, y_train, best_lambda)

# Evaluate the final model on the test set
final_model


Epoch 0, Validation Loss: 0.5009536514732899, Accuracy: 0.8161434977578476, Precision: 0.06896551724137931, Recall: 0.03508771929824561, F1 Score: 0.046511627906976744
Epoch 0, Test Loss: 0.5096430340899001, Accuracy: 0.8260089686098655, Precision: 0.17647058823529413, Recall: 0.055900621118012424, F1 Score: 0.08490566037735849
Epoch 100, Validation Loss: 0.4631354336943751, Accuracy: 0.8968609865470852, Precision: 0.9230769230769231, Recall: 0.21052631578947367, F1 Score: 0.34285714285714286
Epoch 100, Test Loss: 0.514611551776682, Accuracy: 0.9022421524663677, Precision: 0.9333333333333333, Recall: 0.34782608695652173, F1 Score: 0.5067873303167421
Epoch 200, Validation Loss: 0.47798501820272005, Accuracy: 0.9282511210762332, Precision: 0.9807692307692307, Recall: 0.4473684210526316, F1 Score: 0.6144578313253012
Epoch 200, Test Loss: 0.5419545920071235, Accuracy: 0.9318385650224216, Precision: 0.956989247311828, Recall: 0.5527950310559007, F1 Score: 0.7007874015748031
Epoch 300, Valid

Epoch 600, Validation Loss: 0.4914201408740213, Accuracy: 0.9405829596412556, Precision: 0.9841269841269841, Recall: 0.543859649122807, F1 Score: 0.7005649717514124
Epoch 600, Test Loss: 0.556135085643123, Accuracy: 0.9533632286995516, Precision: 0.9739130434782609, Recall: 0.6956521739130435, F1 Score: 0.8115942028985508
Epoch 700, Validation Loss: 0.491582544338867, Accuracy: 0.9405829596412556, Precision: 0.9841269841269841, Recall: 0.543859649122807, F1 Score: 0.7005649717514124
Epoch 700, Test Loss: 0.5558880692351652, Accuracy: 0.9515695067264573, Precision: 0.9734513274336283, Recall: 0.6832298136645962, F1 Score: 0.8029197080291971
Epoch 800, Validation Loss: 0.4916401485332831, Accuracy: 0.9405829596412556, Precision: 0.9841269841269841, Recall: 0.543859649122807, F1 Score: 0.7005649717514124
Epoch 800, Test Loss: 0.5556311695408721, Accuracy: 0.9497757847533632, Precision: 0.972972972972973, Recall: 0.6708074534161491, F1 Score: 0.7941176470588235
Epoch 900, Validation Loss: 

Epoch 200, Validation Loss: 0.39111041790355633, Accuracy: 0.8721973094170403, Precision: 0, Recall: 0.0, F1 Score: 0
Epoch 200, Test Loss: 0.4247450366545694, Accuracy: 0.8556053811659193, Precision: 0, Recall: 0.0, F1 Score: 0
Epoch 300, Validation Loss: 0.39095934181335307, Accuracy: 0.8721973094170403, Precision: 0, Recall: 0.0, F1 Score: 0
Epoch 300, Test Loss: 0.42501906213118035, Accuracy: 0.8556053811659193, Precision: 0, Recall: 0.0, F1 Score: 0
Epoch 400, Validation Loss: 0.3909869036193078, Accuracy: 0.8721973094170403, Precision: 0, Recall: 0.0, F1 Score: 0
Epoch 400, Test Loss: 0.42498819593226217, Accuracy: 0.8556053811659193, Precision: 0, Recall: 0.0, F1 Score: 0
Epoch 500, Validation Loss: 0.39102941035992084, Accuracy: 0.8721973094170403, Precision: 0, Recall: 0.0, F1 Score: 0
Epoch 500, Test Loss: 0.42484602823163103, Accuracy: 0.8556053811659193, Precision: 0, Recall: 0.0, F1 Score: 0
Epoch 600, Validation Loss: 0.39099117672704825, Accuracy: 0.8721973094170403, Pre

### Evaluation

#### Introduction
This section of the report evaluates the performance of a text classification model, specifically designed to distinguish between different categories of text data (such as 'spam' vs. 'ham' in emails). The model's efficacy was assessed based on various metrics, including accuracy, precision, recall, F1 score, and loss. A key aspect of this evaluation involved tuning the regularization parameter, \(\lambda\), to optimize model performance.

#### Methodology
The model can be trained using a stochastic gradient descent algorithm or a mini batch gradient descent algorithm. To combat potential overfitting and improve generalization, a regularization term parameterized by \(\lambda\) was introduced. The optimal value for \(\lambda\) was determined through a manual implementation of k-fold cross-validation, given the constraint of not using external libraries like scikit-learn for this purpose.

#### Results
Initial results showed a significant decrease in both validation and test loss after the first 20 epochs, stabilizing thereafter. However, while accuracy remained high, the precision, recall, and F1 scores were low, particularly in identifying the minority class in our dataset. This suggests a model bias towards the majority class.

Post tuning, the model with the optimal \(\lambda\) showed a slight improvement in balancing the recall and precision, indicating a better handling of the class imbalance. However, the F1 scores remained lower than desired, suggesting room for further model refinement.

#### Challenges
Key challenges in this process included:
- Handling class imbalance, which skewed initial model performance metrics.
- Implementing manual cross-validation, which is more prone to errors and requires careful handling of data splits.
- The limitation of not using advanced libraries like scikit-learn, which meant more complex tasks had to be implemented manually.

### Conclusion

#### Summary
The exercise demonstrated the importance and impact of regularization in text classification models. While the model achieved high accuracy, it initially struggled with precision and recall, likely due to class imbalance. The manual tuning of the regularization parameter \(\lambda\) through cross-validation helped improve the model's ability to generalize, though not to the desired extent.

#### Learnings
This task highlighted the critical nature of data preprocessing, the need for robust evaluation metrics in the presence of class imbalances, and the complexities involved in manual model tuning.

#### Resources
handbook:
Hands-On Machine Learning with Scikit-Learn, keras and TensorFlow by Aurelien Geron 
