In [35]:
# 0R Baseline Model for Book Rating

import random
import numpy as np
import pandas as pd
import math
from sklearn.cluster import KMeans
from pprint import pprint
import pickle
# OVERALL APPROACH:
# 1. Read csv file into pandas df
# 2. Group by class label
# 3. Calculate prior probabilities and likelihood pdfs
# 4. Create predict() function for using these pdfs to predict class labels
# 5. evaluate() performance by comparing model outputs to ground truth labels

In [36]:
def preprocess(filename):
    """Load a CSV file into a pandas DataFrame.

    Parameters
    ----------
    filename : path or file-like accepted by ``pd.read_csv``
        Location of the CSV file to read.

    Returns
    -------
    pandas.DataFrame
        The parsed file, with every column kept exactly as ``pd.read_csv``
        produces it (no columns are dropped or renamed).
    """
    return pd.read_csv(filename)

In [37]:
# Probability of all the labels
def calc_prior(data):
    """Compute the relative frequency of each class label.

    The last column of ``data`` is treated as the class label.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose final column holds the labels.

    Returns
    -------
    dict
        Maps each distinct label to its share of the rows, rounded to two
        decimal places. Because of the rounding the values need not sum to
        exactly 1. An input with no rows yields an empty dict.
    """
    label_column = data.values[:, -1]
    total = len(label_column)
    classes, frequencies = np.unique(label_column, return_counts=True)

    # Pair every distinct label with its rounded share of the rows.
    return {
        cls: (freq / total).round(2)
        for cls, freq in zip(classes, frequencies)
    }

In [38]:
# 'Training' for 0R baseline: finds the class with the highest frequency
def zero_r_training(df):
    """Fit the 0R baseline: find the most frequent class label in ``df``.

    The last column of ``df`` is treated as the class label.

    Parameters
    ----------
    df : pandas.DataFrame
        Training data whose final column holds the labels.

    Returns
    -------
    tuple
        ``(priors, maxlabel)`` where ``priors`` is the dict returned by
        ``calc_prior(df)`` and ``maxlabel`` is the label that occurs most
        often. When several labels share the highest count, the one that
        comes first in ``priors`` wins.

    Raises
    ------
    ValueError
        If ``df`` contains no rows, so no majority label exists.
    """
    priors = calc_prior(df)
    if not priors:
        raise ValueError("cannot train 0R baseline: no class labels found in df")

    # Count every label in one vectorised pass instead of re-scanning the
    # frame with iterrows() once per label. iloc selects the label column
    # positionally, avoiding integer-key lookups on a Series.
    # value_counts() ignores NaN, so a NaN label gets a count of 0 here.
    label_counts = df.iloc[:, -1].value_counts(sort=False).to_dict()

    maxlabel = next(iter(priors))
    maxval = 0
    for label in priors:
        label_count = label_counts.get(label, 0)
        # Strict '>' keeps the earliest label in priors on ties.
        if label_count > maxval:
            maxval = label_count
            maxlabel = label
    return priors, maxlabel

In [39]:
# Zero R prediction for test df
def zero_r_predict(zero_r_val, test_df):
    """Predict the same label for every row of ``test_df``.

    Parameters
    ----------
    zero_r_val : object
        The label to predict for every row.
    test_df : pandas.DataFrame
        Data to predict on; only its number of rows is used.

    Returns
    -------
    pandas.Series
        A Series with one entry per row of ``test_df``, every entry equal to
        ``zero_r_val``. It carries a fresh default index, not ``test_df``'s.
    """
    repeated = [zero_r_val for _ in range(len(test_df))]
    return pd.Series(repeated)

In [40]:
# Load the train and test CSV files into DataFrames.
# The paths are relative, so they resolve against the kernel's working directory.
train_df = preprocess("project_data_files/book_rating_train.csv")
test_df = preprocess("project_data_files/book_rating_test.csv")

# Train the 0R model: `priors` holds the per-label frequencies and `label`
# is the most frequent label in the last column of train_df.
priors, label = zero_r_training(train_df)

# Predict with the 0R model: one copy of `label` per row of test_df.
predictions = zero_r_predict(label, test_df)
# Bare expression so the notebook displays the resulting Series.
predictions

0       4.0
1       4.0
2       4.0
3       4.0
4       4.0
       ... 
5761    4.0
5762    4.0
5763    4.0
5764    4.0
5765    4.0
Length: 5766, dtype: float64

In [43]:
# NOTE(security): pickle.load can execute arbitrary code while deserialising,
# so only load these files if they come from a trusted source.
# The `with` blocks make sure each file handle is closed after loading.

# Pickled object for the book-name text features
# (presumably a fitted CountVectorizer, like the one below; confirm).
with open("project_data_files/book_text_features_countvec/train_name_countvectorizer.pkl", "rb") as name_file:
    vocab = pickle.load(name_file)
vocab

# Pickled object for the book-description text features; the notebook
# displays it as CountVectorizer(stop_words='english').
with open("project_data_files/book_text_features_countvec/train_desc_countvectorizer.pkl", "rb") as desc_file:
    thing = pickle.load(desc_file)
thing

CountVectorizer(stop_words='english')