In [1]:
#import packages for data preprocessing
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer


#import packages for tokenization
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

In [4]:
data = []

# Open the file in read mode
with open('books.txt', 'r',  encoding='utf-8') as file:
    # Read the contents of the file
    for line in file:
        # Parse the line (by tab)
        parsed_line = line.strip().split('\t')
        
        # Add the parsed data to the list
        data.append(parsed_line)

# Create a DataFrame from the list
df_raw = pd.DataFrame(data, columns=['label', 'text']) 


In [5]:
df = df_raw.copy()

In [7]:
df.head(30)

Unnamed: 0,label,text
0,Jane Austen,﻿PERSUASION
1,Jane Austen,by Jane Austen
2,Jane Austen,(1818)
3,Jane Austen,Chapter 1
4,Jane Austen,"Sir Walter Elliot, of Kellynch Hall, in Somers..."
5,Jane Austen,"""ELLIOT OF KELLYNCH HALL."
6,Jane Austen,"""Walter Elliot, born March 1, 1760, married, J..."
7,Jane Austen,Precisely such had the paragraph originally st...
8,Jane Austen,Then followed the history and rise of the anci...
9,Jane Austen,"""Heir presumptive, William Walter Elliot, Esq...."


In [8]:
unique_values = df_raw['label'].unique()
unique_values

array(['Jane Austen', 'Arthur Conan Doyle', 'Fyodor Dostoyevsky'],
      dtype=object)

In [9]:
# Remove punctuation and numbers
df['text'] = df['text'].str.replace(r"[^a-zA-Z' ]", '', regex=True)

# Convert text to lower case
df['text'] = df['text'].str.lower()

In [10]:
# Download necessary NLTK data
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\scoop\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\scoop\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [11]:
# Tokenization function
def tokenize_text(text):
    tokens = word_tokenize(text)
    stemmer = PorterStemmer()
    stemmed_tokens = [stemmer.stem(token) for token in tokens]
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [word for word in stemmed_tokens if word not in stop_words]
    return filtered_tokens

In [19]:
df = pd.get_dummies(df, columns = ['label']) 
print(df)

                                                    text  \
0                                             persuasion   
1                                         by jane austen   
3                                               chapter    
4      sir walter elliot of kellynch hall in somerset...   
5                                elliot of kellynch hall   
...                                                  ...   
19531  certainly we shall all rise again certainly we...   
19532        ah how splendid it will be broke from kolya   
19533  well now we will finish talking and go to his ...   
19534  and always so all our lives hand in hand hurra...   
19535                                            the end   

       label_Arthur Conan Doyle  label_Fyodor Dostoyevsky  label_Jane Austen  
0                             0                         0                  1  
1                             0                         0                  1  
3                             0           

In [20]:
df.head(10)

Unnamed: 0,text,label_Arthur Conan Doyle,label_Fyodor Dostoyevsky,label_Jane Austen
0,persuasion,0,0,1
1,by jane austen,0,0,1
3,chapter,0,0,1
4,sir walter elliot of kellynch hall in somerset...,0,0,1
5,elliot of kellynch hall,0,0,1
6,walter elliot born march married july eliz...,0,0,1
7,precisely such had the paragraph originally st...,0,0,1
8,then followed the history and rise of the anci...,0,0,1
9,heir presumptive william walter elliot esq gre...,0,0,1
10,vanity was the beginning and the end of sir wa...,0,0,1


In [21]:
# Check if any row has a null value
rows_with_null = df.isnull().any(axis=1)

# Display rows with null values
print(df[rows_with_null])

Empty DataFrame
Columns: [text, label_Arthur Conan Doyle, label_Fyodor Dostoyevsky, label_Jane Austen]
Index: []


In [17]:
df.shape

(19536, 2)

In [22]:
# Check if entries in the column are either empty strings, null, or whitespace
no_text = df['text'].apply(lambda x: pd.isna(x) or str(x).strip() == '')

# Display rows where the column does not have text
print(df[no_text])

Empty DataFrame
Columns: [text, label_Arthur Conan Doyle, label_Fyodor Dostoyevsky, label_Jane Austen]
Index: []


In [18]:
# Drop these rows from the DataFrame
df= df.drop(df[no_text].index)


(19510, 2)

In [24]:
df.head()

Unnamed: 0,text,label_Arthur Conan Doyle,label_Fyodor Dostoyevsky,label_Jane Austen
0,persuasion,0,0,1
1,by jane austen,0,0,1
3,chapter,0,0,1
4,sir walter elliot of kellynch hall in somerset...,0,0,1
5,elliot of kellynch hall,0,0,1


In [31]:
y = df[['label_Arthur Conan Doyle', 'label_Fyodor Dostoyevsky', 'label_Jane Austen']]
X = df['text']
X

0                                               persuasion
1                                           by jane austen
3                                                 chapter 
4        sir walter elliot of kellynch hall in somerset...
5                                  elliot of kellynch hall
                               ...                        
19531    certainly we shall all rise again certainly we...
19532          ah how splendid it will be broke from kolya
19533    well now we will finish talking and go to his ...
19534    and always so all our lives hand in hand hurra...
19535                                              the end
Name: text, Length: 19510, dtype: object

In [32]:
# Create the TfidfVectorizer
vectorizer = TfidfVectorizer(tokenizer=tokenize_text, preprocessor=lambda x: x, lowercase=False)

# Fit the vectorizer to the training data
X_tfidf = vectorizer.fit_transform(X)



In [33]:
X_tfidf.shape

(19510, 19279)

In [34]:
# Splitting the dataset into the Training set and Test set
X_temp, X_test, y_temp, y_test = train_test_split(X_tfidf, y, test_size = 0.2, random_state = 42)

# Splitting the Training set further into Training and Validation sets
X_train, X_val, y_train, y_val = train_test_split(X_temp, y_temp, test_size = 0.2, random_state = 42) 

In [35]:
np.random.seed(42)

# Assuming X_tfidf is your feature matrix
d = X_tfidf.shape[1]  # Number of features
n_classes = 3  # Number of classes (adjust as needed)

In [36]:
# Initialize weights and bias
w = np.random.randn(d, n_classes)  
b = np.random.randn(n_classes)

In [37]:
def softmax(z):
    exp_z = np.exp(z - np.max(z, axis=1, keepdims=True))
    return exp_z / np.sum(exp_z, axis=1, keepdims=True)


In [38]:
def compute_loss(y, y_hat):
    # Assuming y is one-hot encoded
    return -np.mean(np.sum(y * np.log(y_hat + 1e-15), axis=1))

In [45]:
def compute_gradients(X, y, y_hat, w, lambda_reg):
    n = X.shape[0]
    dw = X.T.dot(y_hat - y) / n + lambda_reg * w / n  # Add the regularization term
    db = np.mean(y_hat - y, axis=0)
    return dw, db

In [40]:
def predict(X, w, b):
    z = X.dot(w) + b
    return softmax(z)


In [53]:
eta = 0.1  # Learning rate
n_epochs = 100
m = X_train.shape[0]  # Number of instances
batch_size = 12  # Mini-batch size
lambda_reg = 1   # Regularization parameter

# Randomly initialize model parameters
np.random.seed(42)
w = np.random.randn(d, n_classes)  # where n_classes is the number of classes
b = np.random.randn(n_classes)


for epoch in range(n_epochs):
    shuffled_indices = np.random.permutation(m)
    X_train_shuffled = X_train[shuffled_indices]
    y_train_shuffled = y_train.iloc[shuffled_indices]


    for i in range(0, m, batch_size):
        xi = X_train_shuffled[i:i+batch_size]
        yi = y_train_shuffled[i:i+batch_size]

        # Matrix multiplication (ensure xi and w are compatible)
        # If w is dense and xi is sparse, you may need to adjust the operation
        preds = softmax(xi.dot(w) + b)  # Use dot for sparse-dense multiplication
  

        dw, db = compute_gradients(xi, yi, preds, w, lambda_reg)

        # Update weights and bias
        w -= eta * dw.reshape(w.shape)
        b -= eta * db
        
        
        # Convert X_val and y_val to numpy arrays if they are pandas objects
        X_val_np = X_val.to_numpy() if isinstance(X_val, (pd.DataFrame, pd.Series)) else X_val
        y_val_np = y_val.to_numpy() if isinstance(y_val, (pd.DataFrame, pd.Series)) else y_val
        
        # Convert X_val and y_val to numpy arrays if they are pandas objects
        X_test_np = X_test.to_numpy() if isinstance(X_test, (pd.DataFrame, pd.Series)) else X_test
        y_test_np = y_test.to_numpy() if isinstance(y_test, (pd.DataFrame, pd.Series)) else y_test

        # Perform the matrix multiplication and addition
        z = np.dot(X_val_np, w) + b
        val_preds = softmax(z)

        # val_preds = softmax(X_val.dot(w) + b)
        val_preds_ = np.argmax(val_preds, axis=1)  

        # Compute loss on validation set
        val_loss = compute_loss(y_val, softmax(X_val.dot(w) + b))
        
        # Perform the matrix multiplication and addition
        z1 = np.dot(X_test_np, w) + b
        test_preds = softmax(z1)

        # test prediction
        test_preds_ = np.argmax(test_preds, axis=1)  

        # Compute loss on validation set
        test_loss = compute_loss(y_test, softmax(X_test.dot(w) + b))
        
    if epoch % 20 == 0:
        print(f"Epoch {epoch}, Validation Loss: {val_loss}")

    if epoch % 20 == 0:
        print(f"Epoch {epoch}, Test Loss: {test_loss}")

KeyboardInterrupt: 

In [48]:
X_val.shape

(3122, 19279)

In [49]:
w.shape

(19279, 3)