EECS 4412 Project - Phase II
Maryam Salarian
Analysis of the dataset + preprocessing + ....

In [None]:
# 1. load the dataset

import pandas as pd
import numpy as np
from google.colab import drive
drive.mount('/content/drive')
reviews = pd.read_csv("/content/drive/MyDrive/EECS4412/data/project/Books_rating.csv")

Mounted at /content/drive


In [None]:
# 2. display the dataset to get an idea

reviews.head(10)

Unnamed: 0,Id,Title,Price,User_id,profileName,review/helpfulness,review/score,review/time,review/summary,review/text
0,1882931173,Its Only Art If Its Well Hung!,,AVCGYZL8FQQTD,"Jim of Oz ""jim-of-oz""",7/7,4.0,940636800,Nice collection of Julie Strain images,This is only for Julie Strain fans. It's a col...
1,826414346,Dr. Seuss: American Icon,,A30TK6U7DNS82R,Kevin Killian,10/10,5.0,1095724800,Really Enjoyed It,I don't care much for Dr. Seuss but after read...
2,826414346,Dr. Seuss: American Icon,,A3UH4UZ4RSVO82,John Granger,10/11,5.0,1078790400,Essential for every personal and Public Library,"If people become the books they read and if ""t..."
3,826414346,Dr. Seuss: American Icon,,A2MVUWT453QH61,"Roy E. Perry ""amateur philosopher""",7/7,4.0,1090713600,Phlip Nel gives silly Seuss a serious treatment,"Theodore Seuss Geisel (1904-1991), aka &quot;D..."
4,826414346,Dr. Seuss: American Icon,,A22X4XUPKF66MR,"D. H. Richards ""ninthwavestore""",3/3,4.0,1107993600,Good academic overview,Philip Nel - Dr. Seuss: American IconThis is b...
5,826414346,Dr. Seuss: American Icon,,A2F6NONFUDB6UK,Malvin,2/2,4.0,1127174400,One of America's greatest creative talents,"""Dr. Seuss: American Icon"" by Philip Nel is a ..."
6,826414346,Dr. Seuss: American Icon,,A14OJS0VWMOSWO,Midwest Book Review,3/4,5.0,1100131200,A memorably excellent survey of Dr. Seuss' man...,Theodor Seuss Giesel was best known as 'Dr. Se...
7,826414346,Dr. Seuss: American Icon,,A2RSSXTDZDUSH4,J. Squire,0/0,5.0,1231200000,Academia At It's Best,When I recieved this book as a gift for Christ...
8,826414346,Dr. Seuss: American Icon,,A25MD5I2GUIW6W,"J. P. HIGBED ""big fellow""",0/0,5.0,1209859200,And to think that I read it on the tram!,Trams (or any public transport) are not usuall...
9,826414346,Dr. Seuss: American Icon,,A3VA4XFS5WNJO3,Donald Burnside,3/5,4.0,1076371200,Fascinating account of a genius at work,"As far as I am aware, this is the first book-l..."


In [None]:
# 3. as displayed, some of the books have multiple reviews and ratings
# handling approach: treat each review as a separate sample, and once sentiment assigned, take majority vote
# alternative to majority vote: take weighted sentiment, using review/helpfulness as weight, and assigning small weight to those with review/helpfulness = 0.
# size of the unaltered dataset
print("size of unaltered reviews dataset:", reviews.shape)

# checking data types, as non-numeric data type for rating will affect preprocessing
print ("\ndata type of attributes:\n", reviews.dtypes)

# rename relevant attrs for ease of reference: Title to title, review/score to rating, review/helpfulness to helpfulness, review/text to review
reviews.rename(columns={'Title':'title', 'review/score': 'rating', 'review/helpfulness': 'helpfulness', 'review/text':'review'}, inplace=True)

# drop all other cols
columns_to_keep = ["title", "rating", "helpfulness", "review"]
reviews = reviews[columns_to_keep]
print("size of dataset after dropping 6 out of 10 attributes:", reviews.shape)

# check the max and min value for rating (review/score) in the dataset
print("\nmin rating:", reviews["rating"].min())
print("max rating:", reviews["rating"].max())


size of unaltered reviews dataset: (3000000, 10)

data type of attributes:
 Id                     object
Title                  object
Price                 float64
User_id                object
profileName            object
review/helpfulness     object
review/score          float64
review/time             int64
review/summary         object
review/text            object
dtype: object
size of dataset after dropping 6 out of 10 attributes: (3000000, 4)

min rating: 1.0
max rating: 5.0


In [None]:
# 4. add new col, sentiment, based on review rating
# will be added to the entire dataset as target attribute
# rating >= 4 is positive: +1
# rating =< 2 is negative: -1
# rating =3 is netural: 0

# perform rating-specific preprocessing:

# a) convert rating to numreic -> from above data types we conclude that rating is already in numeric format: float64
# b) remove rows with missing or NaN rating
reviews = reviews.dropna(subset=["rating"])
# c) ensure all ratings are within range -> from above rating range, we confirm that all ratings are within [1,5]

# generate the new sentiment column
def assign_sentiment(x):
    if x >= 4:
        return 1
    elif x == 3:
        return 0
    else:
        return -1

# apply the above function to every value in the rating col, and store result in new col
reviews["sentiment"] = reviews["rating"].apply(assign_sentiment)

# convert helpfulness from object to numeric values, fill NaN with 0
def fraction_to_float(x):
    if isinstance(x, str) and "/" in x:
        try:
            num, denom = x.split("/")
            return float(num) / float(denom)
        except:
            return np.nan  # invalid fraction
    else:
        # try to convert directly to float
        return pd.to_numeric(x, errors='coerce')
reviews["helpfulness"] = reviews["helpfulness"].apply(fraction_to_float)
reviews["helpfulness"] = reviews["helpfulness"].fillna(0)

print("size of dataset after removing NaN ratings and adding new col:", reviews.shape)
reviews.head(5)


size of dataset after removing NaN ratings and adding new col: (3000000, 5)


Unnamed: 0,title,rating,helpfulness,review,sentiment
0,Its Only Art If Its Well Hung!,4.0,1.0,This is only for Julie Strain fans. It's a col...,1
1,Dr. Seuss: American Icon,5.0,1.0,I don't care much for Dr. Seuss but after read...,1
2,Dr. Seuss: American Icon,5.0,0.909091,"If people become the books they read and if ""t...",1
3,Dr. Seuss: American Icon,4.0,1.0,"Theodore Seuss Geisel (1904-1991), aka &quot;D...",1
4,Dr. Seuss: American Icon,4.0,1.0,Philip Nel - Dr. Seuss: American IconThis is b...,1


In [None]:
pip install contractions



In [None]:
# 5. perform further preprocessing on the dataset
import re

# a) remove rows with missig reviews: NaN, '', ""
# drop rows where review is NaN
reviews = reviews.dropna(subset=["review"])
# remove empty strings or whitespace-only reviews
df = reviews[reviews["review"].str.strip().astype(bool)]
print("size of dataset after removing NaN or empty reviews:", reviews.shape)

# b) remove identical rows with duplicate review per book
reviews = reviews.drop_duplicates(subset=["title", "review"])
print("size of dataset after dropping duplicates:", reviews.shape)

# c) replace english contractions with full form: won't -> will not
contraction_map = {
    "can't": "can not",
    "won't": "will not",
    "n't": " not",
    "'re": " are",
    "'s": " is",
    "'d": " would",
    "'ll": " will",
    "'t": " not",
    "'ve": " have",
    "'m": " am"
}
# pattern = re.compile("({})".format("|".join(map(re.escape, contraction_map.keys()))))
# def expand_contractions_fast(text: str) -> str:
#     if not isinstance(text, str):
#         return text
#     return pattern.sub(lambda m: contraction_map[m.group(0)], text)
# chunksize = 50000
# for start in range(0, len(reviews), chunksize):
#     end = start + chunksize
#     reviews.loc[start:end, "review"] = (
#         reviews.loc[start:end, "review"].apply(expand_contractions_fast)
#     )

def expand_contractions_vectorized(series):
    for k, v in contraction_map.items():
        series = series.str.replace(k, v, regex=True)
    return series

reviews["review"] = expand_contractions_vectorized(reviews["review"])
print("contraction expansion completed.")


size of dataset after removing NaN or empty reviews: (2999992, 5)
size of dataset after dropping duplicates: (2616740, 5)
contraction expansion completed.


In [None]:
import re

# d) to prevent overfitting and reduce noise, remove reviews with < 2 words
reviews = reviews[reviews["review"].str.len() >= 2]
print("reviews with less than 2  words dropped.")

# e) lower case all characters in review
reviews["review"] = reviews["review"].str.lower()
print("all chars lowercased.")

# # f) remove leading and trailing white spaces + normalize space between chars
# reviews["review"] = reviews["review"].str.replace(r"\s+", " ", regex=True).str.strip()
# print("white spaces normalized.")

# # g) remove non-text/num chars, keep the spaces
# reviews["review"] = reviews["review"].str.replace(r"[^a-z0-9\s']", " ", regex=True)
# print("punctuation removal completed.")

clean_pattern = re.compile(r"[^a-z0-9\s']+")

def fast_clean(text: str) -> str:
    if not isinstance(text, str):
        return text
    # 1. Remove unwanted characters (punctuation, symbols)
    text = clean_pattern.sub(" ", text)
    # 2. Normalize whitespace (no need for regex here)
    text = " ".join(text.split())
    return text

chunksize = 50000
for start in range(0, len(reviews), chunksize):
    end = start + chunksize
    reviews.loc[start:end, "review"] = (
        reviews.loc[start:end, "review"].apply(fast_clean)
    )

reviews with less than 2  words dropped.
all chars lowercased.


In [None]:
import nltk
nltk.download('stopwords')
nltk.download('punkt_tab')
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import pickle
import tqdm

In [None]:
# 6. perfrom text - order them appropriately
# updated logic to use PorterStemmer > deletef after laptop auto reboot
# downloaded the result to /content/drive/MyDrive/EECS4412/data/project/reviews_cleaned.csv

# tokenize
def tokenize(text):
    return token_pattern.findall(text.lower())

# remove stopwords
stop_words = set(stopwords.words('english'))
negation_words = {"not", "no", "never", "none"}
stop_words -= negation_words
def remove_stopwords(tokens):
    return [t for t in tokens if t not in stop_words]

# define stemmer and find word stems
stemmer = PorterStemmer()
def stem_tokens(tokens):
    return [stemmer.stem(t) for t in tokens]

# put all steps together
def preprocess_text(text):
    tokens = tokenize(text)
    tokens = remove_stopwords(tokens)
    tokens = stem_tokens(tokens)
    # return string for TF-IDF
    return " ".join(tokens)


# put steps together:
reviews.to_csv("/content/drive/MyDrive/EECS4412/data/project/reviews_cleaned.csv", index=False)

with open("/content/reviews_cleaned.pkl", "wb") as f:
    pickle.dump(reviews, f)

reviews.head(5)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...


KeyboardInterrupt: 

In [2]:
import pandas as pd
import numpy as np
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
reviews_cleaned = pd.read_csv("/content/drive/MyDrive/EECS4412/data/project/reviews_cleaned.csv")
reviews_cleaned.head(5)

Unnamed: 0,title,rating,helpfulness,review,sentiment
0,Its Only Art If Its Well Hung!,4.0,1.0,juli strain fan collect photo page worth nice ...,1
1,Dr. Seuss: American Icon,5.0,1.0,not care much dr seuss read philip nel book ch...,1
2,Dr. Seuss: American Icon,5.0,0.909091,peopl becom book read child father man dr seus...,1
3,Dr. Seuss: American Icon,4.0,1.0,theodor seuss geisel aka quot dr seuss quot on...,1
4,Dr. Seuss: American Icon,4.0,1.0,philip nel dr seuss american iconthi basic aca...,1


In [5]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from tqdm import tqdm
import joblib

DTYPE = np.float32
PROJECT_PATH = "/content/drive/MyDrive/EECS4412/data/project/"
NGRAM_RANGE = (1,2)
MAX_FEATURES = 150_000
SAMPLE_SIZE = 150_000

# sample 150k reviews to fit the vectorizer
sample = reviews_cleaned.sample(SAMPLE_SIZE, random_state=42)["review"].astype(str).tolist()

vectorizer = CountVectorizer(
    ngram_range=NGRAM_RANGE,
    binary=True,
    dtype=DTYPE,
    max_features=MAX_FEATURES
)

vectorizer.fit(sample)
vocab = vectorizer.get_feature_names_out()
print(f"Learned vocab size: {len(vocab)}")
joblib.dump(vectorizer, PROJECT_PATH + "vectorizer.pkl")

reviews_list = reviews_cleaned["review"].astype(str).tolist()
total_reviews = len(reviews_list)
print("total reviews:", total_reviews)


Learned vocab size: 150000
total reviews: 2616728


In [6]:
# stream DF counting in chunks
from scipy.sparse import csr_matrix

CHUNK_SIZE = 100_000

total_reviews = len(reviews_cleaned)
df_counts = np.zeros(len(vocab), dtype=DTYPE)

for start in tqdm(range(0, total_reviews, CHUNK_SIZE), desc="Counting DF"):
    end = min(start + CHUNK_SIZE, total_reviews)
    batch = reviews_cleaned.iloc[start:end]["review"].astype(str).tolist()

    X_chunk = vectorizer.transform(batch)  # sparse CSR matrix
    # sum over rows (axis=0) -> number of docs each token appears in
    df_chunk = np.array((X_chunk > 0).sum(axis=0)).ravel()
    df_counts += df_chunk

np.save(PROJECT_PATH + "df_counts.npy", df_counts)
print("Document frequency counting complete!")

Counting DF: 100%|██████████| 27/27 [07:39<00:00, 17.03s/it]

Document frequency counting complete!





In [7]:
# apply DF thresholds and reduce vocab

N = len(reviews_cleaned)
MIN_DF_PROP = 1e-4
MAX_DF_PROP = 0.95
MAX_KEEP = 50_000

min_df_cutoff = int(N * MIN_DF_PROP)
max_df_cutoff = int(N * MAX_DF_PROP)

# boolean array for vocabs
mask = (df_counts >= min_df_cutoff) & (df_counts <= max_df_cutoff)
indices_kept = np.where(mask)[0]

reduced_vocab = vocab[indices_kept]

# keep top MAX_KEEP by DF if necessary
if len(reduced_vocab) > MAX_KEEP:
    sorted_idx = np.argsort(df_counts[indices_kept])[::-1]
    top_idx = sorted_idx[:MAX_KEEP]
    indices_kept = indices_kept[top_idx]
    reduced_vocab = vocab[indices_kept]

np.save(PROJECT_PATH + "reduced_vocab.npy", reduced_vocab)
print("Reduced vocab size:", len(reduced_vocab))

Reduced vocab size: 50000


In [6]:
# build TF-IDF vectorizer with reduced vocab
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from tqdm import tqdm
import joblib

PROJECT_PATH = "/content/drive/MyDrive/EECS4412/data/project/"
NGRAM_RANGE = (1,2)
SAMPLE_SIZE = 150_000

# load reduced vocab
reduced_vocab = np.load(PROJECT_PATH + "reduced_vocab.npy", allow_pickle=True)

tfidf = TfidfVectorizer(
    ngram_range=NGRAM_RANGE,
    vocabulary={t: i for i, t in enumerate(reduced_vocab)},
    # reduce impact of very frequent words, used instead of raw frequency
    sublinear_tf=True,
    # L2 norm, prevents longer docs from dominating
    norm='l2',
    token_pattern=r"[a-zA-Z]+(?:'[a-z]+)?"
)

sample_texts = reviews_cleaned.sample(SAMPLE_SIZE, random_state=42)['review'].astype(str).tolist()
print("Sample text creted!")

tfidf.fit(sample_texts)
joblib.dump(tfidf, PROJECT_PATH + "tfidf_reduced_vocab.pkl")
print("TF-IDF computed on reduced vocab!")

Sample text creted!
TF-IDF computed on reduced vocab!


In [8]:
# apply TF-IDF to the dataset and save in chunks

from scipy.sparse import save_npz, vstack
from joblib import load

N = len(reviews_cleaned)
CHUNK_SIZE = 100_000

# load tfidf
tfidf = load(PROJECT_PATH + "tfidf_reduced_vocab.pkl")

tfidf_chunk_files = []

for start in tqdm(range(0, N, CHUNK_SIZE), desc="TF-IDF transform"):
    end = min(start + CHUNK_SIZE, N)
    batch = reviews_cleaned.iloc[start:end]['review'].astype(str).tolist()
    X_chunk = tfidf.transform(batch)
    fname = PROJECT_PATH + f"tfidf_chunk_{start}_{end}.npz"
    save_npz(fname, X_chunk.astype(np.float32))
    tfidf_chunk_files.append(fname)

print("TF-IDF transformation complete!")

TF-IDF transform: 100%|██████████| 27/27 [09:25<00:00, 20.94s/it]

TF-IDF transformation complete!





In [11]:
# stratified sampling for chi sqaure - supervised feature selection

from sklearn.model_selection import train_test_split

clean_df = reviews_cleaned.dropna(subset=['review'])
print("Size of dataset after dropping NaN reviews:", clean_df.shape)

# sample only 5% of the dataset - 5% of data ~ 130k
X_sample, _, y_sample, _ = train_test_split(
    clean_df['review'],
    clean_df['sentiment'],
    stratify=clean_df['sentiment'],
    test_size=0.95,
    random_state=42
)

X_sample_tfidf = tfidf.transform(X_sample)
print("TF-IDF transformation on sample completed!")


Size of dataset after dropping NaN reviews: (2616702, 5)
TF-IDF transformation on sample completed!


In [12]:
# apply chi square

from sklearn.feature_selection import SelectKBest, chi2

FINAL_K = 20_000

selector = SelectKBest(chi2, k=FINAL_K)
selector.fit(X_sample_tfidf, y_sample)
selected_indices = selector.get_support(indices=True)
final_features = reduced_vocab[selected_indices]

np.save(PROJECT_PATH + "final_vocab.npy", final_features)
joblib.dump(selector, PROJECT_PATH + "chi2_selector.pkl")
print("Chi square feature selection completed. Final features:", len(final_features))

Chi square feature selection completed. Final features: 20000


In [None]:
# reduce full TF-IDF chunks to chi-square-selected features
from scipy.sparse import save_npz, vstack, load_npz

final_chunk_files = []

for fname in tfidf_chunk_files:
    X = sparse.load_npz(fname)
    X_selected = X[:, selected_indices]
    out_fname = fname.replace("tfidf_chunk_", "final_chunk_")
    save_npz(out_fname, X_selected)
    final_chunk_files.append(out_fname)