EECS 4412 Project - Phase II
Maryam Salarian
Analysis of the dataset + preprocessing + ....

In [4]:
# 1. load the dataset

import pandas as pd
import numpy as np
from google.colab import drive
drive.mount('/content/drive')
reviews = pd.read_csv("/content/drive/MyDrive/EECS4412/data/project/Books_rating.csv")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# 2. display the dataset to get an idea

reviews.head(10)

Unnamed: 0,Id,Title,Price,User_id,profileName,review/helpfulness,review/score,review/time,review/summary,review/text
0,1882931173,Its Only Art If Its Well Hung!,,AVCGYZL8FQQTD,"Jim of Oz ""jim-of-oz""",7/7,4.0,940636800,Nice collection of Julie Strain images,This is only for Julie Strain fans. It's a col...
1,826414346,Dr. Seuss: American Icon,,A30TK6U7DNS82R,Kevin Killian,10/10,5.0,1095724800,Really Enjoyed It,I don't care much for Dr. Seuss but after read...
2,826414346,Dr. Seuss: American Icon,,A3UH4UZ4RSVO82,John Granger,10/11,5.0,1078790400,Essential for every personal and Public Library,"If people become the books they read and if ""t..."
3,826414346,Dr. Seuss: American Icon,,A2MVUWT453QH61,"Roy E. Perry ""amateur philosopher""",7/7,4.0,1090713600,Phlip Nel gives silly Seuss a serious treatment,"Theodore Seuss Geisel (1904-1991), aka &quot;D..."
4,826414346,Dr. Seuss: American Icon,,A22X4XUPKF66MR,"D. H. Richards ""ninthwavestore""",3/3,4.0,1107993600,Good academic overview,Philip Nel - Dr. Seuss: American IconThis is b...
5,826414346,Dr. Seuss: American Icon,,A2F6NONFUDB6UK,Malvin,2/2,4.0,1127174400,One of America's greatest creative talents,"""Dr. Seuss: American Icon"" by Philip Nel is a ..."
6,826414346,Dr. Seuss: American Icon,,A14OJS0VWMOSWO,Midwest Book Review,3/4,5.0,1100131200,A memorably excellent survey of Dr. Seuss' man...,Theodor Seuss Giesel was best known as 'Dr. Se...
7,826414346,Dr. Seuss: American Icon,,A2RSSXTDZDUSH4,J. Squire,0/0,5.0,1231200000,Academia At It's Best,When I recieved this book as a gift for Christ...
8,826414346,Dr. Seuss: American Icon,,A25MD5I2GUIW6W,"J. P. HIGBED ""big fellow""",0/0,5.0,1209859200,And to think that I read it on the tram!,Trams (or any public transport) are not usuall...
9,826414346,Dr. Seuss: American Icon,,A3VA4XFS5WNJO3,Donald Burnside,3/5,4.0,1076371200,Fascinating account of a genius at work,"As far as I am aware, this is the first book-l..."


In [None]:
# 3. as displayed, some of the books have multiple reviews and ratings
# handling approach: treat each review as a separate sample, and once sentiment assigned, take majority vote
# alternative to majority vote: take weighted sentiment, using review/helpfulness as weight, and assigning small weight to those with review/helpfulness = 0.
# size of the unaltered dataset
print("size of unaltered reviews dataset:", reviews.shape)

# checking data types, as non-numeric data type for rating will affect preprocessing
print ("\ndata type of attributes:\n", reviews.dtypes)

# rename relevant attrs for ease of reference: Title to title, review/score to rating, review/helpfulness to helpfulness, review/text to review
reviews.rename(columns={'Title':'title', 'review/score': 'rating', 'review/helpfulness': 'helpfulness', 'review/text':'review'}, inplace=True)

# drop all other cols
columns_to_keep = ["title", "rating", "helpfulness", "review"]
reviews = reviews[columns_to_keep]
print("size of dataset after dropping 6 out of 10 attributes:", reviews.shape)

# check the max and min value for rating (review/score) in the dataset
print("\nmin rating:", reviews["rating"].min())
print("max rating:", reviews["rating"].max())


size of unaltered reviews dataset: (3000000, 10)

data type of attributes:
 Id                     object
Title                  object
Price                 float64
User_id                object
profileName            object
review/helpfulness     object
review/score          float64
review/time             int64
review/summary         object
review/text            object
dtype: object
size of dataset after dropping 6 out of 10 attributes: (3000000, 4)

min rating: 1.0
max rating: 5.0


In [None]:
# 4. add new col, sentiment, based on review rating
# will be added to the entire dataset as target attribute
# rating >= 4 is positive: +1
# rating =< 2 is negative: -1
# rating =3 is netural: 0

# perform rating-specific preprocessing:

# a) convert rating to numreic -> from above data types we conclude that rating is already in numeric format: float64
# b) remove rows with missing or NaN rating
reviews = reviews.dropna(subset=["rating"])
# c) ensure all ratings are within range -> from above rating range, we confirm that all ratings are within [1,5]

# generate the new sentiment column
def assign_sentiment(x):
    if x >= 4:
        return 1
    elif x == 3:
        return 0
    else:
        return -1

# apply the above function to every value in the rating col, and store result in new col
reviews["sentiment"] = reviews["rating"].apply(assign_sentiment)

# convert helpfulness from object to numeric values, fill NaN with 0
def fraction_to_float(x):
    if isinstance(x, str) and "/" in x:
        try:
            num, denom = x.split("/")
            return float(num) / float(denom)
        except:
            return np.nan  # invalid fraction
    else:
        # try to convert directly to float
        return pd.to_numeric(x, errors='coerce')
reviews["helpfulness"] = reviews["helpfulness"].apply(fraction_to_float)
reviews["helpfulness"] = reviews["helpfulness"].fillna(0)

print("size of dataset after removing NaN ratings and adding new col:", reviews.shape)
reviews.head(5)


size of dataset after removing NaN ratings and adding new col: (3000000, 5)


Unnamed: 0,title,rating,helpfulness,review,sentiment
0,Its Only Art If Its Well Hung!,4.0,1.0,This is only for Julie Strain fans. It's a col...,1
1,Dr. Seuss: American Icon,5.0,1.0,I don't care much for Dr. Seuss but after read...,1
2,Dr. Seuss: American Icon,5.0,0.909091,"If people become the books they read and if ""t...",1
3,Dr. Seuss: American Icon,4.0,1.0,"Theodore Seuss Geisel (1904-1991), aka &quot;D...",1
4,Dr. Seuss: American Icon,4.0,1.0,Philip Nel - Dr. Seuss: American IconThis is b...,1


In [None]:
# 5. perform further preprocessing on the dataset
import re

# a) remove rows with missig reviews: NaN, '', ""
# drop rows where review is NaN
reviews = reviews.dropna(subset=["review"])
# remove empty strings or whitespace-only reviews
df = reviews[reviews["review"].str.strip().astype(bool)]
print("size of dataset after removing NaN or empty reviews:", reviews.shape)

# b) remove identical rows with duplicate review per book
reviews = reviews.drop_duplicates(subset=["title", "review"])
print("size of dataset after dropping duplicates:", reviews.shape)

# c) replace english contractions with full form: won't -> will not
contraction_map = {
    "can't": "can not",
    "won't": "will not",
    "n't": " not",
    "'re": " are",
    "'s": " is",
    "'d": " would",
    "'ll": " will",
    "'t": " not",
    "'ve": " have",
    "'m": " am"
}
# pattern = re.compile("({})".format("|".join(map(re.escape, contraction_map.keys()))))
# def expand_contractions_fast(text: str) -> str:
#     if not isinstance(text, str):
#         return text
#     return pattern.sub(lambda m: contraction_map[m.group(0)], text)
# chunksize = 50000
# for start in range(0, len(reviews), chunksize):
#     end = start + chunksize
#     reviews.loc[start:end, "review"] = (
#         reviews.loc[start:end, "review"].apply(expand_contractions_fast)
#     )

def expand_contractions_vectorized(series):
    for k, v in contraction_map.items():
        series = series.str.replace(k, v, regex=True)
    return series

reviews["review"] = expand_contractions_vectorized(reviews["review"])
print("contraction expansion completed.")


size of dataset after removing NaN or empty reviews: (2999992, 5)
size of dataset after dropping duplicates: (2616740, 5)
contraction expansion completed.


In [None]:
import re

# d) to prevent overfitting and reduce noise, remove reviews with < 2 words
reviews = reviews[reviews["review"].str.len() >= 2]
print("reviews with less than 2  words dropped.")

# e) lower case all characters in review
reviews["review"] = reviews["review"].str.lower()
print("all chars lowercased.")

# # f) remove leading and trailing white spaces + normalize space between chars
# reviews["review"] = reviews["review"].str.replace(r"\s+", " ", regex=True).str.strip()
# print("white spaces normalized.")

# # g) remove non-text/num chars, keep the spaces
# reviews["review"] = reviews["review"].str.replace(r"[^a-z0-9\s']", " ", regex=True)
# print("punctuation removal completed.")

clean_pattern = re.compile(r"[^a-z0-9\s']+")

def fast_clean(text: str) -> str:
    if not isinstance(text, str):
        return text
    # 1. Remove unwanted characters (punctuation, symbols)
    text = clean_pattern.sub(" ", text)
    # 2. Normalize whitespace (no need for regex here)
    text = " ".join(text.split())
    return text

chunksize = 50000
for start in range(0, len(reviews), chunksize):
    end = start + chunksize
    reviews.loc[start:end, "review"] = (
        reviews.loc[start:end, "review"].apply(fast_clean)
    )

reviews with less than 2  words dropped.
all chars lowercased.


In [None]:
import nltk
nltk.download('stopwords')
nltk.download('punkt_tab')
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import pickle
import tqdm

In [None]:
# 6. perfrom text - order them appropriately
# updated logic to use PorterStemmer > deletef after laptop auto reboot
# downloaded the result to /content/drive/MyDrive/EECS4412/data/project/reviews_cleaned.csv

# tokenize
def tokenize(text):
    return token_pattern.findall(text.lower())

# remove stopwords
stop_words = set(stopwords.words('english'))
negation_words = {"not", "no", "never", "none"}
stop_words -= negation_words
def remove_stopwords(tokens):
    return [t for t in tokens if t not in stop_words]

# define stemmer and find word stems
stemmer = PorterStemmer()
def stem_tokens(tokens):
    return [stemmer.stem(t) for t in tokens]

# put all steps together
def preprocess_text(text):
    tokens = tokenize(text)
    tokens = remove_stopwords(tokens)
    tokens = stem_tokens(tokens)
    # return string for TF-IDF
    return " ".join(tokens)


# put steps together:
reviews.to_csv("/content/drive/MyDrive/EECS4412/data/project/reviews_cleaned.csv", index=False)

with open("/content/reviews_cleaned.pkl", "wb") as f:
    pickle.dump(reviews, f)

reviews.head(5)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...


KeyboardInterrupt: 

In [1]:
import pandas as pd
import numpy as np
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
reviews_cleaned = pd.read_csv("/content/drive/MyDrive/EECS4412/data/project/reviews_cleaned.csv")
reviews_cleaned.head(5)

Unnamed: 0,title,rating,helpfulness,review,sentiment
0,Its Only Art If Its Well Hung!,4.0,1.0,juli strain fan collect photo page worth nice ...,1
1,Dr. Seuss: American Icon,5.0,1.0,not care much dr seuss read philip nel book ch...,1
2,Dr. Seuss: American Icon,5.0,0.909091,peopl becom book read child father man dr seus...,1
3,Dr. Seuss: American Icon,4.0,1.0,theodor seuss geisel aka quot dr seuss quot on...,1
4,Dr. Seuss: American Icon,4.0,1.0,philip nel dr seuss american iconthi basic aca...,1


In [34]:
reviews_cleaned = reviews_cleaned.dropna(subset=["review"]).reset_index(drop=True)
# reviews_cleaned.reset_index(drop=True, inplace=True)

reviews_cleaned.shape

(2616702, 5)

In [35]:
# remove NaNs and rewrite to the same file
reviews_cleaned.to_csv(
    "/content/drive/MyDrive/EECS4412/data/project/reviews_cleaned.csv",
    index=False
)
print("reviews_cleaned.csv overwritten with NaN-free version.")

reviews_cleaned.csv overwritten with NaN-free version.


In [29]:
# to check the balance of dataset > very imbalanced. ~80% of the data has positive sentiment

reviews_cleaned['sentiment'].value_counts(normalize=True)

Unnamed: 0_level_0,proportion
sentiment,Unnamed: 1_level_1
1,0.79535
-1,0.119504
0,0.085146


In [7]:
print(reviews_cleaned.shape)
print(reviews_cleaned["review"].isna().sum())

(2616702, 5)
0


In [31]:
# upscaling the minority class to overcome imbalance

from sklearn.utils import resample

df_majority = reviews_cleaned[reviews_cleaned.sentiment == 1]
df_minority_neg = reviews_cleaned[reviews_cleaned.sentiment == -1]
df_minority_neu = reviews_cleaned[reviews_cleaned.sentiment == 0]

df_minority_neg_upsampled = resample(
    df_minority_neg, replace=True, n_samples=len(df_majority), random_state=42
)
df_minority_neu_upsampled = resample(
    df_minority_neu, replace=True, n_samples=len(df_majority), random_state=42
)

balanced_df = pd.concat([df_majority, df_minority_neg_upsampled, df_minority_neu_upsampled])
balanced_df = balanced_df.sample(frac=1, random_state=42).reset_index(drop=True)

balanced_df = balanced_df.dropna(subset=["review"]).reset_index(drop=True)

print("balanced_df shape:", balanced_df.shape)
print(balanced_df["sentiment"].value_counts())


balanced_df shape: (6243579, 5)
sentiment
 0    2081193
 1    2081193
-1    2081193
Name: count, dtype: int64


In [32]:
balanced_df.to_csv("/content/drive/MyDrive/EECS4412/data/project/reviews_balanced.csv", index=False)
print("reviews_balanced.csv created!")

reviews_balanced.csv created!


In [33]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from tqdm import tqdm
import joblib

DTYPE = np.float32
PROJECT_PATH = "/content/drive/MyDrive/EECS4412/data/project/"
NGRAM_RANGE = (1,2)
MAX_FEATURES = 150_000
SAMPLE_SIZE = 500_000

# sample 150k reviews to fit the vectorizer
# balanced_df = pd.read_csv("/content/drive/MyDrive/EECS4412/data/project/reviews_balanced.csv")
sample = balanced_df.sample(SAMPLE_SIZE, random_state=42)["review"].astype(str).tolist()

vectorizer = CountVectorizer(
    ngram_range=NGRAM_RANGE,
    binary=True,
    dtype=DTYPE,
    max_features=MAX_FEATURES
)

vectorizer.fit(sample)
vocab = vectorizer.get_feature_names_out()
print(f"Learned vocab size: {len(vocab)}")
joblib.dump(vectorizer, PROJECT_PATH + "vectorizer.pkl")

reviews_list = balanced_df["review"].astype(str).tolist()
total_reviews = len(reviews_list)
print("total reviews:", total_reviews)


Learned vocab size: 150000
total reviews: 6243579


In [35]:
# stream DF counting in chunks
import numpy as np
import gc
from scipy.sparse import csr_matrix
from tqdm import tqdm

CHUNK_SIZE = 100_000

total_reviews = len(balanced_df)
df_counts = np.zeros(len(vocab), dtype=DTYPE)

for start in tqdm(range(0, total_reviews, CHUNK_SIZE), desc="Counting DF"):
    end = min(start + CHUNK_SIZE, total_reviews)
    batch = balanced_df.iloc[start:end]["review"].astype(str).tolist()

    # transform batch into sparse matrix
    X_chunk = vectorizer.transform(batch)  # sparse CSR matrix
    # sum over rows (axis=0) -> number of docs each token appears in
    df_chunk = X_chunk.sign().sum(axis=0).A1 # memory efficient
    df_counts += df_chunk

    # free memory
    del X_chunk, df_chunk, batch
    gc.collect()

np.save(PROJECT_PATH + "df_counts.npy", df_counts)
print("Document frequency counting complete!")

Counting DF: 100%|██████████| 63/63 [22:56<00:00, 21.85s/it]

Document frequency counting complete!





In [36]:
# apply DF thresholds and reduce vocab

N = len(balanced_df)
MIN_DF_PROP = 1e-4
MAX_DF_PROP = 0.95
MAX_KEEP = 50_000

min_df_cutoff = int(N * MIN_DF_PROP)
max_df_cutoff = int(N * MAX_DF_PROP)

# boolean array for vocabs
mask = (df_counts >= min_df_cutoff) & (df_counts <= max_df_cutoff)
indices_kept = np.where(mask)[0]

reduced_vocab = vocab[indices_kept]

# keep top MAX_KEEP by DF if necessary
if len(reduced_vocab) > MAX_KEEP:
    sorted_idx = np.argsort(df_counts[indices_kept])[::-1]
    top_idx = sorted_idx[:MAX_KEEP]
    indices_kept = indices_kept[top_idx]
    reduced_vocab = vocab[indices_kept]

np.save(PROJECT_PATH + "reduced_vocab.npy", reduced_vocab)
print("Reduced vocab size:", len(reduced_vocab))

Reduced vocab size: 50000


In [37]:
# build TF-IDF vectorizer with reduced vocab
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from tqdm import tqdm
import joblib

PROJECT_PATH = "/content/drive/MyDrive/EECS4412/data/project/"
NGRAM_RANGE = (1,2)
SAMPLE_SIZE = 300_000

# load reduced vocab
reduced_vocab = np.load(PROJECT_PATH + "reduced_vocab.npy", allow_pickle=True)

tfidf = TfidfVectorizer(
    ngram_range=NGRAM_RANGE,
    vocabulary={t: i for i, t in enumerate(reduced_vocab)},
    # reduce impact of very frequent words, used instead of raw frequency
    sublinear_tf=True,
    # L2 norm, prevents longer docs from dominating
    norm='l2',
    token_pattern=r"[a-zA-Z]+(?:'[a-z]+)?"
)

sample_texts = balanced_df.sample(SAMPLE_SIZE, random_state=42)['review'].astype(str).tolist()
print("Sample text creted!")

tfidf.fit(sample_texts)
joblib.dump(tfidf, PROJECT_PATH + "tfidf_reduced_vocab.pkl")
print("TF-IDF computed on reduced vocab!")

Sample text creted!
TF-IDF computed on reduced vocab!


In [39]:
# apply TF-IDF to the dataset and save in chunks

from scipy.sparse import save_npz, vstack
from joblib import load

N = len(balanced_df)
CHUNK_SIZE = 100_000

# load tfidf
tfidf = load(PROJECT_PATH + "tfidf_reduced_vocab.pkl")

tfidf_chunk_files = []

for start in tqdm(range(0, N, CHUNK_SIZE), desc="TF-IDF transform"):
    end = min(start + CHUNK_SIZE, N)
    batch = balanced_df.iloc[start:end]['review'].astype(str).tolist()
    X_chunk = tfidf.transform(batch)
    fname = PROJECT_PATH + f"tfidf_chunk_{start}_{end}.npz"
    save_npz(fname, X_chunk.astype(np.float32))
    tfidf_chunk_files.append(fname)

print("TF-IDF transformation complete!")

TF-IDF transform: 100%|██████████| 63/63 [24:41<00:00, 23.52s/it]

TF-IDF transformation complete!





In [7]:
# stratified sampling for chi sqaure - supervised feature selection

from sklearn.model_selection import train_test_split
from joblib import load

PROJECT_PATH = "/content/drive/MyDrive/EECS4412/data/project/"
balanced_df = pd.read_csv(PROJECT_PATH + "reviews_balanced.csv")
tfidf = load(PROJECT_PATH + "tfidf_reduced_vocab.pkl")

# sample only 5% of the dataset
X_sample, _, y_sample, _ = train_test_split(
    balanced_df['review'],
    balanced_df['sentiment'],
    stratify=balanced_df['sentiment'],
    test_size=0.95,
    random_state=42
)

X_sample_tfidf = tfidf.transform(X_sample)
print("TF-IDF transformation on sample completed!")


TF-IDF transformation on sample completed!


In [41]:
  # apply chi square

from sklearn.feature_selection import SelectKBest, chi2

FINAL_K = 20_000

selector = SelectKBest(chi2, k=FINAL_K)
selector.fit(X_sample_tfidf, y_sample)
selected_indices = selector.get_support(indices=True)
final_features = reduced_vocab[selected_indices]

np.save(PROJECT_PATH + "final_vocab.npy", final_features)
joblib.dump(selector, PROJECT_PATH + "chi2_selector.pkl")
print("Chi square feature selection completed. Final features:", len(final_features))

Chi square feature selection completed. Final features: 20000


In [42]:
# reduce full TF-IDF chunks to chi-square-selected features

from scipy import sparse
from scipy.sparse import save_npz, vstack, load_npz
from tqdm import tqdm

final_chunk_files = []

for fname in tqdm(tfidf_chunk_files, desc="Applying final features to all chunks"):
    X = sparse.load_npz(fname)
    X_selected = X[:, selected_indices]
    out_fname = fname.replace("tfidf_chunk_", "final_chunk_")
    save_npz(out_fname, X_selected)
    final_chunk_files.append(out_fname)

Applying final features to all chunks: 100%|██████████| 63/63 [06:55<00:00,  6.60s/it]


In [None]:
# after last step, each chunk file contains (100k samples) x (20k features) sparse TF-IDF matrix
# now ready for model training

In [None]:
# Logistic Regression for 3-Class Problems
# 2 options:
# 1. Softmax Regression
# 2. One-vs-Rest: less accurate but simpler

In [8]:
# keep all the classes in a separate file - same order as the full dataset
import pandas as pd
import numpy as np
balanced_df = pd.read_csv(PROJECT_PATH + "reviews_balanced.csv")
y_full = balanced_df["sentiment"].astype(int).to_numpy()
np.save(PROJECT_PATH + "y_full.npy", y_full)

print("y_full saved. Shape:", y_full.shape)

y_full saved. Shape: (6243579,)


In [1]:
import numpy as np
from scipy.sparse import load_npz, csr_matrix
import os, glob, math, time
from tqdm import tqdm

# configs
PROJECT_PATH = "/content/drive/MyDrive/EECS4412/data/project/"
FINAL_CHUNKS_GLOB = PROJECT_PATH + "final_chunk_*.npz"
Y_FILE = PROJECT_PATH + "y_full.npy"
FINAL_VOCAB_FILE = PROJECT_PATH + "final_vocab.npy"

# load final vocab
final_vocab = np.load(FINAL_VOCAB_FILE, allow_pickle=True)
D = len(final_vocab)
print("D (feature dimension) =", D)

y_full = np.load(Y_FILE)   # shape (N,)
N = y_full.shape[0]
print("Total training samples:", N)

# precompute mapping of chunk files in sorted order
chunk_files = sorted(glob.glob(FINAL_CHUNKS_GLOB))
if len(chunk_files) == 0:
    raise RuntimeError("No chunk files found: " + FINAL_CHUNKS_GLOB)

chunk_row_counts = []
for fname in chunk_files:
    tfidfMtrix_temp = load_npz(fname)
    chunk_row_counts.append(tfidfMtrix_temp.shape[0])
    del tfidfMtrix_temp

# cumulative row indexing
chum_rows = np.cumsum([0] + chunk_row_counts)
print("Found", len(chunk_files), "chunks.")

D (feature dimension) = 20000
Total training samples: 6243579
Found 63 chunks.


In [2]:
# hyper-params
import re

NUM_CLASSES = 3
EPOCHS = 5
BATCH_SIZE = 2048        # minibatch rows (tune to RAM)
LR = 0.5                 # initial learning rate; tune (e.g. 0.1 - 1.0)
L2 = 1e-4                # regularization strength
MOMENTUM = 0.9           # 0 for none
SEED = 42
DTYPE = np.float32

use_adam = False

# utils
def softmax_rows(z):
    z_max = np.max(z, axis=1, keepdims=True)
    e = np.exp(z - z_max) # subtract max for stability
    s = np.sum(e, axis=1, keepdims=True)
    return e / s

# def labels_for_chunk(i):
#     start = chum_rows[i]
#     end = chum_rows[i+1]
#     return y_full[start:end]
def get_labels_from_filename(fname, y_full):
    m = re.search(r"(\d+)_(\d+)\.npz$", fname)
    if m is None:
        raise ValueError(f"Chunk filename format incorrect: {fname}")
    start = int(m.group(1))
    end = int(m.group(2))
    return y_full[start:end]

def one_hot(y, k):
    n = y.shape[0]
    oh = np.zeros((n, k), dtype=DTYPE)
    oh[np.arange(n), y] = 1.0
    return oh

In [3]:
# initialize Logisitc Regression model

rng = np.random.default_rng(SEED)

W = rng.normal(0, 0.01, size=(D, NUM_CLASSES)).astype(DTYPE)
b = np.zeros(NUM_CLASSES, dtype=DTYPE)

velocity_W = np.zeros_like(W)
velocity_b = np.zeros_like(b)


In [4]:
# train
import gc

for epoch in range(1, EPOCHS+1):
    t0 = time.time()
    epoch_loss = 0
    seen = 0

    # shuffle chunk order each epoch
    rng.shuffle(chunk_files)

    # iterate chunks in order
    for i, fname in enumerate(tqdm(chunk_files, desc=f"Epoch {epoch}")):
        X_chunk = load_npz(fname).tocsr()
        # y_chunk = labels_for_chunk(i)
        y_chunk = get_labels_from_filename(fname, y_full)
        print(i, X_chunk.shape[0], y_chunk.shape[0])

        m = X_chunk.shape[0]
        seen += m

        idx = np.arange(m)
        rng.shuffle(idx)
        X_chunk = X_chunk[idx]
        y_chunk = y_chunk[idx]

        # minibatch within chunk
        for start in range(0, m, BATCH_SIZE):
            end = min(start + BATCH_SIZE, m)
            Xb = X_chunk[start:end] # sparse CSR (nb, D)
            yb = y_chunk[start:end] # (nb,)
            nb = yb.shape[0]

            # compute scores z = Xb.dot(W) + b -> (nb, K)
            z = Xb.dot(W) + b # dense (nb, K)

            # probabilities
            p = softmax_rows(z) # (nb, K)

            # cross-entropy, loss
            loss = -np.log(p[np.arange(nb), yb] + 1e-12).sum()
            epoch_loss += loss

            # gradients
            # gradient of loss w.r.t. z: G = p; G[range(nb), y] -= 1
            G = p
            G[np.arange(nb), yb] -= 1
            G /= nb #  average over batch

            # gradient w.r.t. W: dW = Xb.T @ G + L2 * W
            # Xb.T @ G  -> (D, K) dense
            dW = Xb.T.dot(G) + L2 * W
            db = G.sum(axis=0) # (K,)

            # SGD update using momentum
            velocity_W = MOMENTUM * velocity_W - LR * dW
            W += velocity_W

            velocity_b = MOMENTUM * velocity_b - LR * db
            b += velocity_b

            del Xb, yb, z, p, G, dW

        # delete chunk after all minibatches processed to free up memory
        del X_chunk, y_chunk
        gc.collect()

    print(f"Epoch {epoch}: avg loss = {epoch_loss/seen:.6f}, time = {time.time()-t0:.1f}s")

    # save checkpoint
    np.save(PROJECT_PATH + f"W_epoch{epoch}.npy", W)
    np.save(PROJECT_PATH + f"b_epoch{epoch}.npy", b)
    print("Checkpoint saved.")

Epoch 1:   0%|          | 0/63 [00:00<?, ?it/s]

0 100000 100000


Epoch 1:   2%|▏         | 1/63 [00:01<01:19,  1.29s/it]

1 100000 100000


Epoch 1:   3%|▎         | 2/63 [00:02<00:59,  1.02it/s]

2 100000 100000


Epoch 1:   5%|▍         | 3/63 [00:02<00:52,  1.14it/s]

3 100000 100000


Epoch 1:   8%|▊         | 5/63 [00:03<00:36,  1.57it/s]

4 43579 43579
5 100000 100000


Epoch 1:  10%|▉         | 6/63 [00:04<00:37,  1.53it/s]

6 100000 100000


Epoch 1:  11%|█         | 7/63 [00:05<00:37,  1.50it/s]

7 100000 100000


Epoch 1:  13%|█▎        | 8/63 [00:06<00:40,  1.35it/s]

8 100000 100000


Epoch 1:  14%|█▍        | 9/63 [00:07<00:43,  1.24it/s]

9 100000 100000


Epoch 1:  16%|█▌        | 10/63 [00:08<00:44,  1.18it/s]

10 100000 100000


Epoch 1:  17%|█▋        | 11/63 [00:08<00:45,  1.13it/s]

11 100000 100000


Epoch 1:  19%|█▉        | 12/63 [00:09<00:42,  1.19it/s]

12 100000 100000


Epoch 1:  21%|██        | 13/63 [00:10<00:39,  1.26it/s]

13 100000 100000


Epoch 1:  22%|██▏       | 14/63 [00:11<00:37,  1.32it/s]

14 100000 100000


Epoch 1:  24%|██▍       | 15/63 [00:11<00:35,  1.37it/s]

15 100000 100000


Epoch 1:  25%|██▌       | 16/63 [00:12<00:33,  1.39it/s]

16 100000 100000


Epoch 1:  27%|██▋       | 17/63 [00:13<00:33,  1.39it/s]

17 100000 100000


Epoch 1:  29%|██▊       | 18/63 [00:13<00:32,  1.37it/s]

18 100000 100000


Epoch 1:  30%|███       | 19/63 [00:14<00:31,  1.40it/s]

19 100000 100000


Epoch 1:  32%|███▏      | 20/63 [00:15<00:30,  1.43it/s]

20 100000 100000


Epoch 1:  33%|███▎      | 21/63 [00:15<00:29,  1.44it/s]

21 100000 100000


Epoch 1:  35%|███▍      | 22/63 [00:16<00:28,  1.45it/s]

22 100000 100000


Epoch 1:  37%|███▋      | 23/63 [00:17<00:27,  1.47it/s]

23 100000 100000


Epoch 1:  38%|███▊      | 24/63 [00:17<00:26,  1.48it/s]

24 100000 100000


Epoch 1:  40%|███▉      | 25/63 [00:18<00:25,  1.46it/s]

25 100000 100000


Epoch 1:  41%|████▏     | 26/63 [00:19<00:26,  1.38it/s]

26 100000 100000


Epoch 1:  43%|████▎     | 27/63 [00:20<00:28,  1.28it/s]

27 100000 100000


Epoch 1:  44%|████▍     | 28/63 [00:21<00:28,  1.21it/s]

28 100000 100000


Epoch 1:  46%|████▌     | 29/63 [00:22<00:28,  1.17it/s]

29 100000 100000


Epoch 1:  48%|████▊     | 30/63 [00:23<00:28,  1.15it/s]

30 100000 100000


Epoch 1:  49%|████▉     | 31/63 [00:23<00:26,  1.23it/s]

31 100000 100000


Epoch 1:  51%|█████     | 32/63 [00:24<00:24,  1.29it/s]

32 100000 100000


Epoch 1:  52%|█████▏    | 33/63 [00:25<00:22,  1.33it/s]

33 100000 100000


Epoch 1:  54%|█████▍    | 34/63 [00:25<00:21,  1.35it/s]

34 100000 100000


Epoch 1:  56%|█████▌    | 35/63 [00:26<00:20,  1.39it/s]

35 100000 100000


Epoch 1:  57%|█████▋    | 36/63 [00:27<00:19,  1.42it/s]

36 100000 100000


Epoch 1:  59%|█████▊    | 37/63 [00:27<00:17,  1.45it/s]

37 100000 100000


Epoch 1:  60%|██████    | 38/63 [00:28<00:17,  1.46it/s]

38 100000 100000


Epoch 1:  62%|██████▏   | 39/63 [00:29<00:16,  1.46it/s]

39 100000 100000


Epoch 1:  63%|██████▎   | 40/63 [00:29<00:15,  1.47it/s]

40 100000 100000


Epoch 1:  65%|██████▌   | 41/63 [00:30<00:15,  1.44it/s]

41 100000 100000


Epoch 1:  67%|██████▋   | 42/63 [00:31<00:14,  1.44it/s]

42 100000 100000


Epoch 1:  68%|██████▊   | 43/63 [00:32<00:13,  1.45it/s]

43 100000 100000


Epoch 1:  70%|██████▉   | 44/63 [00:32<00:13,  1.45it/s]

44 100000 100000


Epoch 1:  71%|███████▏  | 45/63 [00:33<00:13,  1.36it/s]

45 100000 100000


Epoch 1:  73%|███████▎  | 46/63 [00:34<00:13,  1.27it/s]

46 100000 100000


Epoch 1:  75%|███████▍  | 47/63 [00:35<00:13,  1.20it/s]

47 100000 100000


Epoch 1:  76%|███████▌  | 48/63 [00:36<00:12,  1.16it/s]

48 100000 100000


Epoch 1:  78%|███████▊  | 49/63 [00:37<00:12,  1.13it/s]

49 100000 100000


Epoch 1:  79%|███████▉  | 50/63 [00:37<00:10,  1.20it/s]

50 100000 100000


Epoch 1:  81%|████████  | 51/63 [00:38<00:09,  1.26it/s]

51 100000 100000


Epoch 1:  83%|████████▎ | 52/63 [00:39<00:08,  1.32it/s]

52 100000 100000


Epoch 1:  84%|████████▍ | 53/63 [00:40<00:07,  1.36it/s]

53 100000 100000


Epoch 1:  86%|████████▌ | 54/63 [00:40<00:06,  1.38it/s]

54 100000 100000


Epoch 1:  87%|████████▋ | 55/63 [00:41<00:05,  1.40it/s]

55 100000 100000


Epoch 1:  89%|████████▉ | 56/63 [00:42<00:04,  1.40it/s]

56 100000 100000


Epoch 1:  90%|█████████ | 57/63 [00:42<00:04,  1.41it/s]

57 100000 100000


Epoch 1:  92%|█████████▏| 58/63 [00:43<00:03,  1.42it/s]

58 100000 100000


Epoch 1:  94%|█████████▎| 59/63 [00:44<00:02,  1.43it/s]

59 100000 100000


Epoch 1:  95%|█████████▌| 60/63 [00:44<00:02,  1.43it/s]

60 100000 100000


Epoch 1:  97%|█████████▋| 61/63 [00:45<00:01,  1.42it/s]

61 100000 100000


Epoch 1:  98%|█████████▊| 62/63 [00:46<00:00,  1.40it/s]

62 100000 100000


Epoch 1: 100%|██████████| 63/63 [00:47<00:00,  1.34it/s]


Epoch 1: avg loss = 0.816025, time = 47.1s
Checkpoint saved.


Epoch 2:   0%|          | 0/63 [00:00<?, ?it/s]

0 100000 100000


Epoch 2:   2%|▏         | 1/63 [00:01<01:07,  1.08s/it]

1 100000 100000


Epoch 2:   3%|▎         | 2/63 [00:02<01:02,  1.03s/it]

2 100000 100000


Epoch 2:   5%|▍         | 3/63 [00:02<00:58,  1.02it/s]

3 100000 100000


Epoch 2:   6%|▋         | 4/63 [00:03<00:56,  1.05it/s]

4 100000 100000


Epoch 2:   8%|▊         | 5/63 [00:04<00:51,  1.14it/s]

5 100000 100000


Epoch 2:  10%|▉         | 6/63 [00:05<00:46,  1.23it/s]

6 100000 100000


Epoch 2:  11%|█         | 7/63 [00:06<00:43,  1.30it/s]

7 100000 100000


Epoch 2:  13%|█▎        | 8/63 [00:06<00:41,  1.33it/s]

8 100000 100000


Epoch 2:  14%|█▍        | 9/63 [00:07<00:39,  1.37it/s]

9 100000 100000


Epoch 2:  16%|█▌        | 10/63 [00:08<00:37,  1.40it/s]

10 100000 100000


Epoch 2:  17%|█▋        | 11/63 [00:08<00:37,  1.41it/s]

11 100000 100000


Epoch 2:  19%|█▉        | 12/63 [00:09<00:35,  1.42it/s]

12 100000 100000


Epoch 2:  21%|██        | 13/63 [00:10<00:34,  1.43it/s]

13 100000 100000


Epoch 2:  22%|██▏       | 14/63 [00:10<00:33,  1.45it/s]

14 100000 100000


Epoch 2:  24%|██▍       | 15/63 [00:11<00:33,  1.45it/s]

15 100000 100000


Epoch 2:  25%|██▌       | 16/63 [00:12<00:32,  1.47it/s]

16 100000 100000


Epoch 2:  27%|██▋       | 17/63 [00:12<00:31,  1.48it/s]

17 100000 100000


Epoch 2:  29%|██▊       | 18/63 [00:13<00:30,  1.46it/s]

18 100000 100000


Epoch 2:  30%|███       | 19/63 [00:14<00:30,  1.47it/s]

19 100000 100000


Epoch 2:  32%|███▏      | 20/63 [00:15<00:32,  1.34it/s]

20 100000 100000


Epoch 2:  33%|███▎      | 21/63 [00:16<00:33,  1.25it/s]

21 100000 100000


Epoch 2:  35%|███▍      | 22/63 [00:16<00:34,  1.20it/s]

22 43579 43579


Epoch 2:  37%|███▋      | 23/63 [00:17<00:28,  1.39it/s]

23 100000 100000


Epoch 2:  38%|███▊      | 24/63 [00:18<00:30,  1.28it/s]

24 100000 100000


Epoch 2:  40%|███▉      | 25/63 [00:19<00:28,  1.34it/s]

25 100000 100000


Epoch 2:  41%|████▏     | 26/63 [00:19<00:26,  1.39it/s]

26 100000 100000


Epoch 2:  43%|████▎     | 27/63 [00:20<00:25,  1.42it/s]

27 100000 100000


Epoch 2:  44%|████▍     | 28/63 [00:20<00:24,  1.45it/s]

28 100000 100000


Epoch 2:  46%|████▌     | 29/63 [00:21<00:23,  1.47it/s]

29 100000 100000


Epoch 2:  48%|████▊     | 30/63 [00:22<00:22,  1.48it/s]

30 100000 100000


Epoch 2:  49%|████▉     | 31/63 [00:23<00:21,  1.48it/s]

31 100000 100000


Epoch 2:  51%|█████     | 32/63 [00:23<00:21,  1.47it/s]

32 100000 100000


Epoch 2:  52%|█████▏    | 33/63 [00:24<00:20,  1.44it/s]

33 100000 100000


Epoch 2:  54%|█████▍    | 34/63 [00:25<00:20,  1.43it/s]

34 100000 100000


Epoch 2:  56%|█████▌    | 35/63 [00:25<00:19,  1.44it/s]

35 100000 100000


Epoch 2:  57%|█████▋    | 36/63 [00:26<00:18,  1.46it/s]

36 100000 100000


Epoch 2:  59%|█████▊    | 37/63 [00:27<00:17,  1.48it/s]

37 100000 100000


Epoch 2:  60%|██████    | 38/63 [00:27<00:16,  1.49it/s]

38 100000 100000


Epoch 2:  62%|██████▏   | 39/63 [00:28<00:16,  1.43it/s]

39 100000 100000


Epoch 2:  63%|██████▎   | 40/63 [00:29<00:17,  1.33it/s]

40 100000 100000


Epoch 2:  65%|██████▌   | 41/63 [00:30<00:17,  1.25it/s]

41 100000 100000


Epoch 2:  67%|██████▋   | 42/63 [00:31<00:17,  1.21it/s]

42 100000 100000


Epoch 2:  68%|██████▊   | 43/63 [00:32<00:17,  1.18it/s]

43 100000 100000


Epoch 2:  70%|██████▉   | 44/63 [00:32<00:15,  1.26it/s]

44 100000 100000


Epoch 2:  71%|███████▏  | 45/63 [00:33<00:13,  1.33it/s]

45 100000 100000


Epoch 2:  73%|███████▎  | 46/63 [00:34<00:12,  1.38it/s]

46 100000 100000


Epoch 2:  75%|███████▍  | 47/63 [00:34<00:11,  1.41it/s]

47 100000 100000


Epoch 2:  76%|███████▌  | 48/63 [00:35<00:10,  1.44it/s]

48 100000 100000


Epoch 2:  78%|███████▊  | 49/63 [00:36<00:09,  1.45it/s]

49 100000 100000


Epoch 2:  79%|███████▉  | 50/63 [00:36<00:08,  1.46it/s]

50 100000 100000


Epoch 2:  81%|████████  | 51/63 [00:37<00:08,  1.48it/s]

51 100000 100000


Epoch 2:  83%|████████▎ | 52/63 [00:38<00:07,  1.49it/s]

52 100000 100000


Epoch 2:  84%|████████▍ | 53/63 [00:38<00:06,  1.50it/s]

53 100000 100000


Epoch 2:  86%|████████▌ | 54/63 [00:39<00:05,  1.50it/s]

54 100000 100000


Epoch 2:  87%|████████▋ | 55/63 [00:40<00:05,  1.50it/s]

55 100000 100000


Epoch 2:  89%|████████▉ | 56/63 [00:40<00:04,  1.51it/s]

56 100000 100000


Epoch 2:  90%|█████████ | 57/63 [00:41<00:03,  1.51it/s]

57 100000 100000


Epoch 2:  92%|█████████▏| 58/63 [00:42<00:03,  1.51it/s]

58 100000 100000


Epoch 2:  94%|█████████▎| 59/63 [00:42<00:02,  1.38it/s]

59 100000 100000


Epoch 2:  95%|█████████▌| 60/63 [00:43<00:02,  1.30it/s]

60 100000 100000


Epoch 2:  97%|█████████▋| 61/63 [00:44<00:01,  1.25it/s]

61 100000 100000


Epoch 2:  98%|█████████▊| 62/63 [00:45<00:00,  1.20it/s]

62 100000 100000


Epoch 2: 100%|██████████| 63/63 [00:46<00:00,  1.35it/s]


Epoch 2: avg loss = 0.764701, time = 46.5s
Checkpoint saved.


Epoch 3:   0%|          | 0/63 [00:00<?, ?it/s]

0 100000 100000


Epoch 3:   2%|▏         | 1/63 [00:00<00:43,  1.44it/s]

1 100000 100000


Epoch 3:   3%|▎         | 2/63 [00:01<00:42,  1.44it/s]

2 100000 100000


Epoch 3:   5%|▍         | 3/63 [00:02<00:41,  1.46it/s]

3 100000 100000


Epoch 3:   6%|▋         | 4/63 [00:02<00:40,  1.47it/s]

4 100000 100000


Epoch 3:   8%|▊         | 5/63 [00:03<00:38,  1.49it/s]

5 100000 100000


Epoch 3:  10%|▉         | 6/63 [00:04<00:38,  1.49it/s]

6 100000 100000


Epoch 3:  11%|█         | 7/63 [00:04<00:37,  1.51it/s]

7 100000 100000


Epoch 3:  13%|█▎        | 8/63 [00:05<00:36,  1.51it/s]

8 100000 100000


Epoch 3:  14%|█▍        | 9/63 [00:06<00:35,  1.52it/s]

9 100000 100000


Epoch 3:  16%|█▌        | 10/63 [00:06<00:35,  1.51it/s]

10 100000 100000


Epoch 3:  17%|█▋        | 11/63 [00:07<00:34,  1.52it/s]

11 100000 100000


Epoch 3:  19%|█▉        | 12/63 [00:07<00:33,  1.53it/s]

12 100000 100000


Epoch 3:  21%|██        | 13/63 [00:08<00:32,  1.53it/s]

13 100000 100000


Epoch 3:  22%|██▏       | 14/63 [00:09<00:32,  1.53it/s]

14 100000 100000


Epoch 3:  24%|██▍       | 15/63 [00:10<00:32,  1.47it/s]

15 100000 100000


Epoch 3:  25%|██▌       | 16/63 [00:10<00:34,  1.35it/s]

16 100000 100000


Epoch 3:  27%|██▋       | 17/63 [00:11<00:36,  1.27it/s]

17 100000 100000


Epoch 3:  29%|██▊       | 18/63 [00:12<00:36,  1.22it/s]

18 100000 100000


Epoch 3:  30%|███       | 19/63 [00:13<00:37,  1.19it/s]

19 100000 100000


Epoch 3:  32%|███▏      | 20/63 [00:14<00:34,  1.25it/s]

20 100000 100000


Epoch 3:  33%|███▎      | 21/63 [00:14<00:31,  1.32it/s]

21 100000 100000


Epoch 3:  35%|███▍      | 22/63 [00:15<00:30,  1.36it/s]

22 100000 100000


Epoch 3:  38%|███▊      | 24/63 [00:16<00:23,  1.67it/s]

23 43579 43579
24 100000 100000


Epoch 3:  40%|███▉      | 25/63 [00:17<00:23,  1.63it/s]

25 100000 100000


Epoch 3:  41%|████▏     | 26/63 [00:17<00:23,  1.59it/s]

26 100000 100000


Epoch 3:  43%|████▎     | 27/63 [00:18<00:23,  1.56it/s]

27 100000 100000


Epoch 3:  44%|████▍     | 28/63 [00:19<00:22,  1.55it/s]

28 100000 100000


Epoch 3:  46%|████▌     | 29/63 [00:19<00:22,  1.53it/s]

29 100000 100000


Epoch 3:  48%|████▊     | 30/63 [00:20<00:21,  1.52it/s]

30 100000 100000


Epoch 3:  49%|████▉     | 31/63 [00:21<00:21,  1.52it/s]

31 100000 100000


Epoch 3:  51%|█████     | 32/63 [00:21<00:20,  1.50it/s]

32 100000 100000


Epoch 3:  52%|█████▏    | 33/63 [00:22<00:19,  1.51it/s]

33 100000 100000


Epoch 3:  54%|█████▍    | 34/63 [00:23<00:19,  1.51it/s]

34 100000 100000


Epoch 3:  56%|█████▌    | 35/63 [00:23<00:19,  1.47it/s]

35 100000 100000


Epoch 3:  57%|█████▋    | 36/63 [00:24<00:20,  1.33it/s]

36 100000 100000


Epoch 3:  59%|█████▊    | 37/63 [00:25<00:21,  1.22it/s]

37 100000 100000


Epoch 3:  60%|██████    | 38/63 [00:26<00:21,  1.17it/s]

38 100000 100000


Epoch 3:  62%|██████▏   | 39/63 [00:27<00:21,  1.14it/s]

39 100000 100000


Epoch 3:  63%|██████▎   | 40/63 [00:28<00:19,  1.18it/s]

40 100000 100000


Epoch 3:  65%|██████▌   | 41/63 [00:29<00:17,  1.26it/s]

41 100000 100000


Epoch 3:  67%|██████▋   | 42/63 [00:29<00:15,  1.32it/s]

42 100000 100000


Epoch 3:  68%|██████▊   | 43/63 [00:30<00:14,  1.37it/s]

43 100000 100000


Epoch 3:  70%|██████▉   | 44/63 [00:31<00:13,  1.41it/s]

44 100000 100000


Epoch 3:  71%|███████▏  | 45/63 [00:31<00:12,  1.45it/s]

45 100000 100000


Epoch 3:  73%|███████▎  | 46/63 [00:32<00:11,  1.48it/s]

46 100000 100000


Epoch 3:  75%|███████▍  | 47/63 [00:33<00:10,  1.49it/s]

47 100000 100000


Epoch 3:  76%|███████▌  | 48/63 [00:33<00:10,  1.49it/s]

48 100000 100000


Epoch 3:  78%|███████▊  | 49/63 [00:34<00:09,  1.51it/s]

49 100000 100000


Epoch 3:  79%|███████▉  | 50/63 [00:35<00:08,  1.51it/s]

50 100000 100000


Epoch 3:  81%|████████  | 51/63 [00:35<00:07,  1.51it/s]

51 100000 100000


Epoch 3:  83%|████████▎ | 52/63 [00:36<00:07,  1.51it/s]

52 100000 100000


Epoch 3:  84%|████████▍ | 53/63 [00:37<00:06,  1.51it/s]

53 100000 100000


Epoch 3:  86%|████████▌ | 54/63 [00:37<00:05,  1.51it/s]

54 100000 100000


Epoch 3:  87%|████████▋ | 55/63 [00:38<00:05,  1.44it/s]

55 100000 100000


Epoch 3:  89%|████████▉ | 56/63 [00:39<00:05,  1.31it/s]

56 100000 100000


Epoch 3:  90%|█████████ | 57/63 [00:40<00:04,  1.25it/s]

57 100000 100000


Epoch 3:  92%|█████████▏| 58/63 [00:41<00:04,  1.20it/s]

58 100000 100000


Epoch 3:  94%|█████████▎| 59/63 [00:42<00:03,  1.18it/s]

59 100000 100000


Epoch 3:  95%|█████████▌| 60/63 [00:42<00:02,  1.27it/s]

60 100000 100000


Epoch 3:  97%|█████████▋| 61/63 [00:43<00:01,  1.34it/s]

61 100000 100000


Epoch 3:  98%|█████████▊| 62/63 [00:44<00:00,  1.39it/s]

62 100000 100000


Epoch 3: 100%|██████████| 63/63 [00:44<00:00,  1.41it/s]


Epoch 3: avg loss = 0.762648, time = 44.8s
Checkpoint saved.


Epoch 4:   0%|          | 0/63 [00:00<?, ?it/s]

0 100000 100000


Epoch 4:   2%|▏         | 1/63 [00:00<00:41,  1.50it/s]

1 100000 100000


Epoch 4:   3%|▎         | 2/63 [00:01<00:40,  1.52it/s]

2 100000 100000


Epoch 4:   5%|▍         | 3/63 [00:01<00:39,  1.52it/s]

3 100000 100000


Epoch 4:   6%|▋         | 4/63 [00:02<00:39,  1.49it/s]

4 100000 100000


Epoch 4:   8%|▊         | 5/63 [00:03<00:39,  1.49it/s]

5 100000 100000


Epoch 4:  10%|▉         | 6/63 [00:04<00:39,  1.45it/s]

6 100000 100000


Epoch 4:  11%|█         | 7/63 [00:04<00:38,  1.45it/s]

7 100000 100000


Epoch 4:  13%|█▎        | 8/63 [00:05<00:37,  1.46it/s]

8 100000 100000


Epoch 4:  14%|█▍        | 9/63 [00:06<00:37,  1.45it/s]

9 100000 100000


Epoch 4:  16%|█▌        | 10/63 [00:06<00:36,  1.46it/s]

10 100000 100000


Epoch 4:  17%|█▋        | 11/63 [00:07<00:36,  1.44it/s]

11 100000 100000


Epoch 4:  19%|█▉        | 12/63 [00:08<00:38,  1.34it/s]

12 100000 100000


Epoch 4:  21%|██        | 13/63 [00:09<00:39,  1.27it/s]

13 100000 100000


Epoch 4:  22%|██▏       | 14/63 [00:10<00:40,  1.21it/s]

14 100000 100000


Epoch 4:  24%|██▍       | 15/63 [00:11<00:40,  1.18it/s]

15 100000 100000


Epoch 4:  25%|██▌       | 16/63 [00:11<00:37,  1.25it/s]

16 100000 100000


Epoch 4:  27%|██▋       | 17/63 [00:12<00:34,  1.32it/s]

17 100000 100000


Epoch 4:  29%|██▊       | 18/63 [00:13<00:32,  1.39it/s]

18 100000 100000


Epoch 4:  30%|███       | 19/63 [00:13<00:30,  1.42it/s]

19 100000 100000


Epoch 4:  32%|███▏      | 20/63 [00:14<00:29,  1.45it/s]

20 100000 100000


Epoch 4:  33%|███▎      | 21/63 [00:15<00:28,  1.48it/s]

21 100000 100000


Epoch 4:  35%|███▍      | 22/63 [00:15<00:27,  1.47it/s]

22 100000 100000


Epoch 4:  37%|███▋      | 23/63 [00:16<00:26,  1.48it/s]

23 100000 100000


Epoch 4:  38%|███▊      | 24/63 [00:17<00:25,  1.50it/s]

24 100000 100000


Epoch 4:  40%|███▉      | 25/63 [00:17<00:25,  1.51it/s]

25 100000 100000


Epoch 4:  41%|████▏     | 26/63 [00:18<00:24,  1.52it/s]

26 100000 100000


Epoch 4:  43%|████▎     | 27/63 [00:19<00:24,  1.48it/s]

27 100000 100000


Epoch 4:  44%|████▍     | 28/63 [00:19<00:23,  1.49it/s]

28 100000 100000


Epoch 4:  46%|████▌     | 29/63 [00:20<00:22,  1.51it/s]

29 100000 100000


Epoch 4:  48%|████▊     | 30/63 [00:21<00:21,  1.51it/s]

30 100000 100000


Epoch 4:  49%|████▉     | 31/63 [00:21<00:22,  1.41it/s]

31 100000 100000


Epoch 4:  51%|█████     | 32/63 [00:22<00:23,  1.31it/s]

32 100000 100000


Epoch 4:  52%|█████▏    | 33/63 [00:23<00:23,  1.26it/s]

33 100000 100000


Epoch 4:  54%|█████▍    | 34/63 [00:24<00:24,  1.20it/s]

34 100000 100000


Epoch 4:  56%|█████▌    | 35/63 [00:25<00:23,  1.20it/s]

35 100000 100000


Epoch 4:  57%|█████▋    | 36/63 [00:25<00:21,  1.28it/s]

36 100000 100000


Epoch 4:  59%|█████▊    | 37/63 [00:26<00:19,  1.35it/s]

37 100000 100000


Epoch 4:  60%|██████    | 38/63 [00:27<00:17,  1.39it/s]

38 100000 100000


Epoch 4:  62%|██████▏   | 39/63 [00:27<00:16,  1.41it/s]

39 100000 100000


Epoch 4:  63%|██████▎   | 40/63 [00:28<00:15,  1.46it/s]

40 100000 100000


Epoch 4:  65%|██████▌   | 41/63 [00:29<00:15,  1.46it/s]

41 100000 100000


Epoch 4:  67%|██████▋   | 42/63 [00:29<00:14,  1.47it/s]

42 100000 100000


Epoch 4:  68%|██████▊   | 43/63 [00:30<00:13,  1.49it/s]

43 100000 100000


Epoch 4:  70%|██████▉   | 44/63 [00:31<00:12,  1.50it/s]

44 100000 100000


Epoch 4:  71%|███████▏  | 45/63 [00:31<00:11,  1.51it/s]

45 100000 100000


Epoch 4:  73%|███████▎  | 46/63 [00:32<00:11,  1.52it/s]

46 100000 100000


Epoch 4:  75%|███████▍  | 47/63 [00:33<00:10,  1.53it/s]

47 100000 100000


Epoch 4:  76%|███████▌  | 48/63 [00:33<00:09,  1.54it/s]

48 100000 100000


Epoch 4:  78%|███████▊  | 49/63 [00:34<00:09,  1.55it/s]

49 100000 100000


Epoch 4:  79%|███████▉  | 50/63 [00:35<00:08,  1.53it/s]

50 100000 100000


Epoch 4:  81%|████████  | 51/63 [00:36<00:08,  1.37it/s]

51 100000 100000


Epoch 4:  83%|████████▎ | 52/63 [00:36<00:08,  1.28it/s]

52 100000 100000


Epoch 4:  84%|████████▍ | 53/63 [00:37<00:08,  1.22it/s]

53 100000 100000


Epoch 4:  86%|████████▌ | 54/63 [00:38<00:07,  1.18it/s]

54 100000 100000


Epoch 4:  87%|████████▋ | 55/63 [00:39<00:06,  1.24it/s]

55 100000 100000


Epoch 4:  89%|████████▉ | 56/63 [00:40<00:05,  1.32it/s]

56 100000 100000


Epoch 4:  90%|█████████ | 57/63 [00:40<00:04,  1.37it/s]

57 100000 100000


Epoch 4:  92%|█████████▏| 58/63 [00:41<00:03,  1.41it/s]

58 100000 100000


Epoch 4:  94%|█████████▎| 59/63 [00:42<00:02,  1.44it/s]

59 100000 100000


Epoch 4:  95%|█████████▌| 60/63 [00:42<00:02,  1.46it/s]

60 100000 100000


Epoch 4:  98%|█████████▊| 62/63 [00:43<00:00,  1.73it/s]

61 43579 43579
62 100000 100000


Epoch 4: 100%|██████████| 63/63 [00:44<00:00,  1.42it/s]


Epoch 4: avg loss = 0.762428, time = 44.5s
Checkpoint saved.


Epoch 5:   0%|          | 0/63 [00:00<?, ?it/s]

0 100000 100000


Epoch 5:   2%|▏         | 1/63 [00:00<00:41,  1.49it/s]

1 100000 100000


Epoch 5:   3%|▎         | 2/63 [00:01<00:41,  1.47it/s]

2 100000 100000


Epoch 5:   5%|▍         | 3/63 [00:02<00:40,  1.49it/s]

3 100000 100000


Epoch 5:   6%|▋         | 4/63 [00:02<00:39,  1.50it/s]

4 100000 100000


Epoch 5:   8%|▊         | 5/63 [00:03<00:38,  1.51it/s]

5 100000 100000


Epoch 5:  10%|▉         | 6/63 [00:03<00:37,  1.51it/s]

6 100000 100000


Epoch 5:  11%|█         | 7/63 [00:04<00:38,  1.47it/s]

7 100000 100000


Epoch 5:  13%|█▎        | 8/63 [00:05<00:41,  1.34it/s]

8 100000 100000


Epoch 5:  14%|█▍        | 9/63 [00:06<00:42,  1.26it/s]

9 100000 100000


Epoch 5:  16%|█▌        | 10/63 [00:07<00:43,  1.22it/s]

10 100000 100000


Epoch 5:  17%|█▋        | 11/63 [00:08<00:43,  1.19it/s]

11 100000 100000


Epoch 5:  19%|█▉        | 12/63 [00:08<00:40,  1.25it/s]

12 100000 100000


Epoch 5:  21%|██        | 13/63 [00:09<00:38,  1.31it/s]

13 100000 100000


Epoch 5:  22%|██▏       | 14/63 [00:10<00:36,  1.36it/s]

14 100000 100000


Epoch 5:  24%|██▍       | 15/63 [00:10<00:34,  1.39it/s]

15 100000 100000


Epoch 5:  25%|██▌       | 16/63 [00:11<00:33,  1.42it/s]

16 100000 100000


Epoch 5:  27%|██▋       | 17/63 [00:12<00:31,  1.44it/s]

17 100000 100000


Epoch 5:  29%|██▊       | 18/63 [00:13<00:30,  1.46it/s]

18 100000 100000


Epoch 5:  30%|███       | 19/63 [00:13<00:29,  1.47it/s]

19 100000 100000


Epoch 5:  32%|███▏      | 20/63 [00:14<00:29,  1.48it/s]

20 100000 100000


Epoch 5:  33%|███▎      | 21/63 [00:15<00:28,  1.47it/s]

21 100000 100000


Epoch 5:  35%|███▍      | 22/63 [00:15<00:27,  1.48it/s]

22 100000 100000


Epoch 5:  38%|███▊      | 24/63 [00:16<00:22,  1.74it/s]

23 43579 43579
24 100000 100000


Epoch 5:  40%|███▉      | 25/63 [00:17<00:23,  1.65it/s]

25 100000 100000


Epoch 5:  41%|████▏     | 26/63 [00:18<00:23,  1.59it/s]

26 100000 100000


Epoch 5:  43%|████▎     | 27/63 [00:18<00:24,  1.49it/s]

27 100000 100000


Epoch 5:  44%|████▍     | 28/63 [00:19<00:25,  1.35it/s]

28 100000 100000


Epoch 5:  46%|████▌     | 29/63 [00:20<00:26,  1.27it/s]

29 100000 100000


Epoch 5:  48%|████▊     | 30/63 [00:21<00:26,  1.23it/s]

30 100000 100000


Epoch 5:  49%|████▉     | 31/63 [00:22<00:26,  1.19it/s]

31 100000 100000


Epoch 5:  51%|█████     | 32/63 [00:23<00:24,  1.28it/s]

32 100000 100000


Epoch 5:  52%|█████▏    | 33/63 [00:23<00:22,  1.34it/s]

33 100000 100000


Epoch 5:  54%|█████▍    | 34/63 [00:24<00:20,  1.38it/s]

34 100000 100000


Epoch 5:  56%|█████▌    | 35/63 [00:25<00:19,  1.43it/s]

35 100000 100000


Epoch 5:  57%|█████▋    | 36/63 [00:25<00:18,  1.44it/s]

36 100000 100000


Epoch 5:  59%|█████▊    | 37/63 [00:26<00:17,  1.46it/s]

37 100000 100000


Epoch 5:  60%|██████    | 38/63 [00:27<00:17,  1.46it/s]

38 100000 100000


Epoch 5:  62%|██████▏   | 39/63 [00:27<00:16,  1.47it/s]

39 100000 100000


Epoch 5:  63%|██████▎   | 40/63 [00:28<00:15,  1.47it/s]

40 100000 100000


Epoch 5:  65%|██████▌   | 41/63 [00:29<00:14,  1.47it/s]

41 100000 100000


Epoch 5:  67%|██████▋   | 42/63 [00:29<00:14,  1.49it/s]

42 100000 100000


Epoch 5:  68%|██████▊   | 43/63 [00:30<00:13,  1.49it/s]

43 100000 100000


Epoch 5:  70%|██████▉   | 44/63 [00:31<00:12,  1.49it/s]

44 100000 100000


Epoch 5:  71%|███████▏  | 45/63 [00:31<00:12,  1.48it/s]

45 100000 100000


Epoch 5:  73%|███████▎  | 46/63 [00:32<00:11,  1.50it/s]

46 100000 100000


Epoch 5:  75%|███████▍  | 47/63 [00:33<00:11,  1.35it/s]

47 100000 100000


Epoch 5:  76%|███████▌  | 48/63 [00:34<00:11,  1.27it/s]

48 100000 100000


Epoch 5:  78%|███████▊  | 49/63 [00:35<00:11,  1.21it/s]

49 100000 100000


Epoch 5:  79%|███████▉  | 50/63 [00:36<00:11,  1.17it/s]

50 100000 100000


Epoch 5:  81%|████████  | 51/63 [00:36<00:09,  1.22it/s]

51 100000 100000


Epoch 5:  83%|████████▎ | 52/63 [00:37<00:08,  1.29it/s]

52 100000 100000


Epoch 5:  84%|████████▍ | 53/63 [00:38<00:07,  1.34it/s]

53 100000 100000


Epoch 5:  86%|████████▌ | 54/63 [00:38<00:06,  1.38it/s]

54 100000 100000


Epoch 5:  87%|████████▋ | 55/63 [00:39<00:05,  1.40it/s]

55 100000 100000


Epoch 5:  89%|████████▉ | 56/63 [00:40<00:04,  1.40it/s]

56 100000 100000


Epoch 5:  90%|█████████ | 57/63 [00:40<00:04,  1.42it/s]

57 100000 100000


Epoch 5:  92%|█████████▏| 58/63 [00:41<00:03,  1.43it/s]

58 100000 100000


Epoch 5:  94%|█████████▎| 59/63 [00:42<00:02,  1.44it/s]

59 100000 100000


Epoch 5:  95%|█████████▌| 60/63 [00:42<00:02,  1.45it/s]

60 100000 100000


Epoch 5:  97%|█████████▋| 61/63 [00:43<00:01,  1.47it/s]

61 100000 100000


Epoch 5:  98%|█████████▊| 62/63 [00:44<00:00,  1.48it/s]

62 100000 100000


Epoch 5: 100%|██████████| 63/63 [00:44<00:00,  1.40it/s]

Epoch 5: avg loss = 0.762370, time = 45.0s
Checkpoint saved.





In [None]:
# use a small representative sample for hyperparameter tuning (e.g., 5–10% of the dataset).
# train the model from scratch on that sample for a few epochs.
# keep a small separate test set to evaluate model after tuning.
# monitor metrics: validation accuracy, cross-entropy loss.
# select best hyperparameters and then scale up to full dataset.
# solution: possibly use a tuning loop with different hyperparams

In [7]:
# stratified sample
import pandas as pd

from sklearn.model_selection import train_test_split

PROJECT_PATH = "/content/drive/MyDrive/EECS4412/data/project/"
balanced_df = pd.read_csv(PROJECT_PATH + "reviews_balanced.csv")
y_full = balanced_df["sentiment"].astype(int).to_numpy() # .astype(np.int32)

SAMPLE_RATIO = 0.05
MAX_SAMPLE = 150_000
N = len(balanced_df)

sample_size = min(int(N * SAMPLE_RATIO), MAX_SAMPLE)
print("Target sample size =", sample_size)
sample_fraction = sample_size / N
print("Target sample fraction =", sample_fraction)

# create stratified sample
_, sample_idx = train_test_split(
    np.arange(N), # , dtype=np.int32
    test_size=sample_fraction,
    stratify=y_full,
    random_state=42
)

sample_idx = np.sort(sample_idx)
print("Sampled rows =", len(sample_idx))

Target sample size = 150000
Target sample fraction = 0.024024681997296744
Sampled rows = 150000


In [8]:
# extract TF-IDF rows from chunk files - load only the required rows

from scipy.sparse import vstack, csr_matrix
import gc

sample_matrices = []
current_start = 0

itr = 0

for fname in tqdm(chunk_files, desc="Extracting sample rows"):
    X_chunk = load_npz(fname).tocsr()
    m = X_chunk.shape[0]

    # rows of sample_idx that belong to this chunk
    mask = (sample_idx >= current_start) & (sample_idx < current_start + m)
    local_rows = sample_idx[mask] - current_start

    if len(local_rows) > 0:
        sample_matrices.append(X_chunk[local_rows])

    del X_chunk
    gc.collect()

    current_start += m

# combine
X_sample = vstack(sample_matrices).tocsr()
y_sample = y_full[sample_idx]

print("X_sample shape =", X_sample.shape)
print("y_sample shape =", y_sample.shape)

# save sample for reuse
from scipy.sparse import save_npz
save_npz(PROJECT_PATH + "X_sample.npz", X_sample)
np.save(PROJECT_PATH + "y_sample.npy", y_sample)

print("Sample saved!")

Extracting sample rows: 100%|██████████| 63/63 [00:34<00:00,  1.82it/s]


X_sample shape = (150000, 20000)
y_sample shape = (150000,)
Sample saved!


In [9]:
# split sample into train/validation
from scipy.sparse import save_npz, vstack, load_npz

X_sample = load_npz(PROJECT_PATH + "X_sample.npz")
y_sample = np.load(PROJECT_PATH + "y_sample.npy")

X_train, X_val, y_train, y_val = train_test_split(
    X_sample, y_sample,
    test_size=0.2,
    stratify=y_sample,
    random_state=42
)

print("Train =", X_train.shape, "Val =", X_val.shape)

Train = (120000, 20000) Val = (30000, 20000)


In [10]:
# initialize params for tuning

def init_params(D, K, seed=42):
    rng = np.random.default_rng(seed)
    W = rng.normal(0, 0.01, size=(D, K))
    b = np.zeros(K)
    return W, b

In [11]:
# operates only on X_train / y_train using SGD + momentum

import time
from sklearn.metrics import f1_score, accuracy_score

# accuracy is not the best measure, given that our dataset is imbalanced.
# 80% accuracy if 80% of the dataset belongs to one class, is not a good classification

# cfg: dict containing hyperparameters
def train_one_config(cfg, X_train, y_train, X_val, y_val, verbose=False):
    LR = cfg['lr']
    BATCH = cfg['batch']
    L2 = cfg['l2']
    OPT = cfg['opt']         # 'adam' or 'sgd'
    MOM = cfg['momentum']
    EPOCHS = cfg['epochs']

    K = 3  # 3 sentiment classes
    W, b = init_params(D, K, seed=cfg['seed'])

    # initialize velocities for SGD + momentum
    velocity_W = np.zeros_like(W)
    velocity_b = np.zeros_like(b)

    # Adam parameters
    # mW = np.zeros_like(W)
    # vW = np.zeros_like(W)
    # mb = np.zeros_like(b)
    # vb = np.zeros_like(b)
    # beta1 = 0.9
    # beta2 = 0.999
    # eps = 1e-8
    # t = 0

    history = []
    best_val_loss = float("inf")
    patience = cfg.get("patience", 1)
    patience_counter = 0

    for ep in range(EPOCHS):
        t0 = time.time()

        idx = np.arange(len(y_train))
        np.random.shuffle(idx)

        Xtr = X_train[idx]
        ytr = y_train[idx]

        losses = []
        # minibatches
        for start in range(0, len(ytr), BATCH):
            end = min(start + BATCH, len(ytr))
            Xb = Xtr[start:end]
            yb = ytr[start:end]
            nb = len(yb)

            # forward
            z = Xb.dot(W) + b
            p = softmax_rows(z)
            loss = -np.log(p[np.arange(nb), yb] + 1e-12).mean()
            losses.append(loss)

            # gradients
            G = p
            G[np.arange(nb), yb] -= 1
            G /= nb

            dW = Xb.T.dot(G) + L2 * W
            db = G.sum(axis=0)

            # update
            if OPT == "sgd":
                velocity_W = MOM * velocity_W - LR * dW
                W += velocity_W

                velocity_b = MOM * velocity_b - LR * db
                b += velocity_b

            # else:  # ADAM
            #     t += 1
            #     mW = beta1*mW + (1-beta1)*dW
            #     vW = beta2*vW + (1-beta2)*(dW*dW)
            #     mb = beta1*mb + (1-beta1)*db
            #     vb = beta2*vb + (1-beta2)*(db*db)

            #     mW_hat = mW / (1 - beta1**t)
            #     vW_hat = vW / (1 - beta2**t)
            #     mb_hat = mb / (1 - beta1**t)
            #     vb_hat = vb / (1 - beta2**t)

            #     W -= LR * mW_hat / (np.sqrt(vW_hat) + eps)
            #     b -= LR * mb_hat / (np.sqrt(vb_hat) + eps)

        # compute val metrics
        z_val = X_val.dot(W) + b
        p_val = softmax_rows(z_val)
        val_loss = -np.log(p_val[np.arange(len(y_val)), y_val] + 1e-12).mean()
        y_pred = p_val.argmax(axis=1)
        val_f1 = f1_score(y_val, y_pred, average='macro')
        # val_acc = accuracy_score(y_val, y_pred)
        history.append((val_loss, val_f1))

        if verbose:
            print(f"Epoch {ep+1}: ValLoss={val_loss:.4f}, ValF1={val_f1:.4f}, time={time.time()-t0:.1f}s")

        # early stopping check
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            patience_counter = 0
        else:
            patience_counter += 1
            if patience_counter > patience:
                if verbose:
                    print("Early stopping triggered.")
                break

    return {
        'val_loss': best_val_loss,
        'val_f1': val_f1,
        # "val_acc": val_acc,
        'W': W,
        'b': b,
        'history': history
    }

In [12]:
# define search space and random search loop

import random

def sample_hyperparams():
    return {
        "opt": "sgd",
        "lr": random.choice([1e-4, 3e-4, 1e-3, 3e-3, 1e-2]),
        "l2": random.choice([1e-5, 1e-4, 1e-3, 1e-2]),
        "batch": random.choice([1024, 2048, 4096]),
        "momentum": random.choice([0.0, 0.8, 0.9]),
        "epochs": 3,
        "seed": random.randint(1, 10_000),
        "patience": 1
        # highlight: initialize velocities only when training starts, not here
        # "velocity_W": np.zeros((D, 3), dtype=np.float32),
        # "velocity_b": np.zeros(3, dtype=np.float32)
    }

In [13]:
# run random search
import joblib

NUM_TRIALS = 30
results = []

for t in range(NUM_TRIALS):
    print(f"\n Trial {t+1}/{NUM_TRIALS}")
    cfg = sample_hyperparams()

    # velocities are runtime satate, no need to save
    save_cfg = {
        k: v for k, v in cfg.items()
        if k not in ["velocity_W", "velocity_b"]
    }

    res = train_one_config(
        cfg,
        X_train, y_train,
        X_val, y_val,
        verbose=False
    )

    entry = {
        "trial": t,
        "cfg": cfg,
        "val_f1": res['val_f1'],
        # "val_acc": res['val_acc'],
        "val_loss": res['val_loss']
    }
    results.append(entry)
    print(res)

# save results
joblib.dump(results, PROJECT_PATH + "tuning_results.pkl")

print("Tuning completed!")


 Trial 1/30
{'val_loss': np.float64(1.0985858143320413), 'val_f1': 0.17106173268447, 'W': array([[ 0.00234303,  0.00945599, -0.00745146],
       [ 0.00210359,  0.00063222,  0.01418409],
       [ 0.00119537,  0.00131693, -0.01143707],
       ...,
       [ 0.00935162, -0.00265063,  0.01120491],
       [ 0.00438365,  0.00699142, -0.00409319],
       [-0.00302643, -0.0121959 ,  0.00018348]]), 'b': array([-4.10066006e-05, -3.96754117e-06,  4.49741417e-05]), 'history': [(np.float64(1.0985858143320413), 0.170973725105786), (np.float64(1.0985858236859016), 0.17116395697420062), (np.float64(1.0985858250073846), 0.17106173268447)]}

 Trial 2/30
{'val_loss': np.float64(1.0985704204292834), 'val_f1': 0.1574901981439022, 'W': array([[-0.01494402,  0.01074554,  0.00661988],
       [ 0.00019993,  0.00688581,  0.00668878],
       [-0.00780265, -0.00329703, -0.00181891],
       ...,
       [-0.00423651, -0.00508081, -0.02043588],
       [-0.00648777,  0.0116396 ,  0.01100413],
       [ 0.00490046,  0.

In [14]:
# best hyperparams
import joblib

results = joblib.load(PROJECT_PATH + "tuning_results.pkl")

best = max(results, key=lambda x: x['val_f1'])

print("Best trial configuration:")
print(best)

best_cfg = best["cfg"]

Best trial configuration:
{'trial': 23, 'cfg': {'opt': 'sgd', 'lr': 0.01, 'l2': 0.001, 'batch': 4096, 'momentum': 0.9, 'epochs': 3, 'seed': 9394, 'patience': 1}, 'val_f1': 0.17721143587636395, 'val_loss': np.float64(1.0987000013463426)}
