In [50]:
# 0R Baseline Model for Book Rating

import random
import numpy as np
import pandas as pd
import math
from sklearn.cluster import KMeans
from pprint import pprint
import pickle
import scipy
# OVERALL APPROACH:
# 1. Read the CSV file into a pandas DataFrame
# 2. Group by class label
# 3. Calculate prior probabilities and likelihood pdfs
# 4. Create a predict() function that uses these pdfs to predict class labels
# 5. evaluate() performance by comparing model outputs to ground-truth labels

In [51]:
def preprocess(filename):
    """Load a CSV file into a pandas DataFrame.

    Parameters
    ----------
    filename : str or file-like
        Anything ``pd.read_csv`` accepts as its first argument.

    Returns
    -------
    pandas.DataFrame
        The parsed contents of the file; no columns are dropped or modified.
    """
    return pd.read_csv(filename)

In [52]:
# Probability of all the labels
def calc_prior(data):
    """Compute the relative frequency of every class label.

    The class label is taken from the last column of ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose last column holds the class labels.

    Returns
    -------
    dict
        Maps each distinct label to its share of the rows, rounded to two
        decimal places. Labels appear in the sorted order produced by
        ``np.unique``.
    """
    class_column = data.values[:, -1]
    total = len(class_column)
    classes, frequencies = np.unique(class_column, return_counts=True)

    # Share of rows per class, rounded to two decimals.
    return {
        cls: (freq / total).round(2)
        for cls, freq in zip(classes, frequencies)
    }

In [53]:
# 'Training' for 0R baseline: finds the class with the highest frequency
def zero_r_training(df):
    """'Train' the 0R baseline by finding the most frequent class label.

    The class label is read from the last column of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Training data whose last column holds the class labels.

    Returns
    -------
    tuple
        ``(priors, maxlabel)`` where ``priors`` is the dict returned by
        ``calc_prior(df)`` and ``maxlabel`` is the label with the highest
        count. When several labels share the highest count, the smallest
        one (in sorted order) is returned. Missing labels (NaN) are never
        counted.

    Raises
    ------
    ValueError
        If ``df`` contains no non-missing labels, so no majority class exists.
    """
    priors = calc_prior(df)

    # Select the label column by position with .iloc: integer keys passed to
    # Series.__getitem__ are not reliably positional, so row[-1][-1] is fragile.
    # Counting once here also avoids re-scanning every row for every label.
    label_counts = df.iloc[:, -1].value_counts(sort=False).sort_index()
    if label_counts.empty:
        raise ValueError("zero_r_training requires at least one labelled row")

    # idxmax returns the first maximum; with the index sorted ascending, ties
    # resolve to the smallest label.
    maxlabel = label_counts.idxmax()
    return priors, maxlabel

In [54]:
# Zero R prediction for test df
def zero_r_predict(zero_r_val, test_df):
    """Predict the same label for every row of the test data.

    Parameters
    ----------
    zero_r_val : object
        The label to predict for every row.
    test_df : sized collection
        Test data; only its length is used.

    Returns
    -------
    pandas.Series
        ``len(test_df)`` copies of ``zero_r_val`` with a default integer index.
    """
    constant_predictions = [zero_r_val for _ in range(len(test_df))]
    return pd.Series(constant_predictions)

In [55]:
# First, preprocess train and test data
train_df = preprocess("project_data_files/book_rating_train.csv")
test_df = preprocess("project_data_files/book_rating_test.csv")

# Train 0R model
priors, label = zero_r_training(train_df)

# Predict 0R model on test df
predictions = zero_r_predict(label, test_df)
predictions

0       4.0
1       4.0
2       4.0
3       4.0
4       4.0
       ... 
5761    4.0
5762    4.0
5763    4.0
5764    4.0
5765    4.0
Length: 5766, dtype: float64

In [56]:
# NOTE: pickle.load can execute arbitrary code while unpickling; only load these
# files if they come from a trusted source.
# Each file is opened in a `with` block so its handle is closed right after loading.

# Presumably the fitted count vectorizer for the authors field (per the file name).
with open("project_data_files/book_text_features_countvec/train_authors_countvectorizer.pkl", "rb") as authors_file:
    vocab = pickle.load(authors_file)

# The `vocabulary_` attribute of the unpickled object.
vocab_dict = vocab.vocabulary_

# Presumably the fitted count vectorizer for the description field (per the file name).
with open("project_data_files/book_text_features_countvec/train_desc_countvectorizer.pkl", "rb") as desc_file:
    thing = pickle.load(desc_file)


In [58]:
# Load the sparse matrix stored in the .npz file. The result is not assigned to a
# name, so it is only shown as this cell's output.
scipy.sparse.load_npz('project_data_files/book_text_features_countvec/train_name_vec.npz')

<23063x20766 sparse matrix of type '<class 'numpy.int64'>'
	with 99477 stored elements in Compressed Sparse Row format>

In [59]:
# Read the CSV with header=None, so the first line is parsed as data and the columns get
# default integer names; index_col=False stops pandas from using the first column as the
# index. The frame is not assigned to a name, so it is only shown as this cell's output.
pd.read_csv(r"project_data_files/book_text_features_doc2vec/train_name_doc2vec100.csv", index_col = False, delimiter = ',', header=None)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,0.052262,-0.263308,0.026872,0.128574,-0.161565,-0.127520,0.249588,0.037621,-0.074043,0.072854,...,-0.172811,0.098389,-0.062941,0.118057,-0.065377,0.227973,0.218879,-0.151266,-0.048105,0.300822
1,-0.129112,0.021312,0.159166,-0.072448,0.036028,-0.093721,0.129199,0.069736,-0.253263,-0.066424,...,0.245650,-0.049657,0.072740,-0.055925,-0.000046,0.140500,0.067133,-0.238091,0.109774,-0.156772
2,-0.170058,0.052351,-0.013406,0.099001,0.083173,-0.161439,0.048635,0.089419,-0.072266,-0.063164,...,-0.033781,0.093943,0.132654,0.030295,0.102714,0.154334,0.129325,-0.231493,0.007541,-0.098540
3,0.250849,0.021555,0.091047,-0.041589,-0.040949,0.240260,0.415056,0.027029,-0.172413,-0.135485,...,0.020762,-0.149720,0.150557,0.294355,0.001157,0.285179,0.049340,-0.037548,0.042920,0.176173
4,-0.041681,0.038051,-0.051164,-0.076813,0.096855,-0.215943,0.152729,0.267636,-0.079954,-0.065560,...,0.191644,0.044182,0.054631,-0.025782,0.049917,0.122052,-0.084216,-0.096424,-0.068681,-0.005293
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23058,0.007497,0.000220,0.019723,-0.003321,0.021097,-0.129420,0.130302,-0.037361,-0.004281,-0.255112,...,-0.000418,-0.062899,0.048064,0.029612,0.191065,0.096081,-0.100516,-0.190299,0.224559,0.086601
23059,-0.024484,0.000467,-0.015977,0.086630,0.082127,-0.174537,0.011694,0.111608,-0.106961,-0.147956,...,0.150964,-0.029046,0.171029,-0.072123,-0.004459,0.247430,0.111973,0.019573,0.070569,-0.112066
23060,-0.099309,-0.046230,-0.033294,0.242591,-0.055477,-0.033886,0.026869,0.038410,-0.126636,0.127742,...,0.193755,-0.118570,0.006740,-0.108623,-0.036143,0.168113,0.136478,0.087885,0.113180,0.000569
23061,-0.038388,0.065679,-0.159324,-0.048682,0.054175,0.317751,0.065931,-0.126021,-0.105057,-0.147185,...,0.009007,0.154127,0.219128,-0.305824,-0.017904,-0.059886,0.108616,0.041879,-0.138893,-0.044187
