In [38]:
from sklearn.naive_bayes import CategoricalNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import SelectKBest, mutual_info_classif
from sklearn.preprocessing import KBinsDiscretizer

In [7]:
def learnLanguage(file, doc2vec):
    
    
    data = pd.read_csv(file)
    X = pd.read_csv(doc2vec, index_col = False, delimiter = ',', header = None)
    y = data["Language"]
    
    y = pd.Series(y)
    # Assuming you have a feature matrix `X` and a target variable `y`
    # X should contain other features like doc2vec and word frequency counts
    # y should contain the language labels (with missing values)

    # Split the dataset into instances with and without missing language values
    X_with_language = X[~y.isnull()]
    y_with_language = y[~y.isnull()]
    X_missing_language = X[y.isnull()]

    # Split the dataset with language into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X_with_language, y_with_language, test_size=0.2, random_state=42)

    # Train a Random Forest classifier on the instances with language
    rf = RandomForestClassifier()
    rf.fit(X_train, y_train)

    # Predict the missing language values using the trained model
    imputed_language = rf.predict(X_missing_language)

    # Merge the imputed language values with the original dataset
    y[y.isnull()] = imputed_language

    # Now you can proceed with your machine learning algorithm using the complete dataset
    return y

In [19]:
# Function that converts a categorical feature into nominal, while maintaing categorical properties
def numericise_categorical_data(df, feature):
    df[feature] = df[feature].astype(str)
    df[feature] = df[feature].astype('category')
    df_encoded = pd.get_dummies(df, columns=[feature])
    
    encoder = LabelEncoder()
    df[feature] = encoder.fit_transform(df[feature])

In [21]:
# Load the dataset
train_df = pd.read_csv('project_data_files/book_rating_train.csv')
test_df = pd.read_csv('project_data_files/book_rating_test.csv')

# Add the predicted languages to each entry
lang = learnLanguage('project_data_files/book_rating_train.csv', "project_data_files/book_text_features_doc2vec/train_desc_doc2vec100.csv")
train_df['Language'] = lang

In [22]:
def add_letter_to_cols(df, letter):
    for column in df.columns:
        new_name = letter + str(column)
        df.rename(columns={column: new_name}, inplace=True)

In [23]:
train_df = train_df.drop('Name', axis=1)
train_df = train_df.drop('Description', axis=1)
train_df = train_df.drop('Authors', axis=1)


# Remove the title, description and replace with doc2vec
book_name_features = pd.read_csv(r"project_data_files/book_text_features_doc2vec/train_name_doc2vec100.csv", index_col = False, delimiter = ',', header=None)
add_letter_to_cols(book_name_features, "n")

book_desc_features = pd.read_csv(r"project_data_files/book_text_features_doc2vec/train_desc_doc2vec100.csv", index_col = False, delimiter = ',', header=None)
add_letter_to_cols(book_desc_features, "d")

book_auth_features = pd.read_csv(r"project_data_files/book_text_features_doc2vec/train_authors_doc2vec20.csv", index_col = False, delimiter = ',', header=None)
add_letter_to_cols(book_auth_features, "a")

combined_df = pd.concat([train_df, book_name_features, book_desc_features, book_auth_features], axis=1)

In [26]:
# Perform label encoding for publishers, language
#combined_df['Publisher'] = label_encoder.fit_transform(combined_df['Publisher'])
#combined_df['Language'] = label_encoder.fit_transform(combined_df['Language'])
numericise_categorical_data(combined_df, 'Publisher')
numericise_categorical_data(combined_df, 'Language')

#combined_df = pd.concat([book_name_features, book_desc_features, train_df['rating_label']], axis=1)
# Separate the feature columns (X) and the target column (y)
X = combined_df.copy()
X = X.drop('rating_label', axis=1)
y = combined_df['rating_label']

In [27]:
X

Unnamed: 0,PublishYear,PublishMonth,PublishDay,Publisher,Language,pagesNumber,n0,n1,n2,n3,...,a10,a11,a12,a13,a14,a15,a16,a17,a18,a19
0,2005,6,1,3664,1,48,0.052262,-0.263308,0.026872,0.128574,...,0.329671,0.343979,0.018261,0.115687,-0.111172,0.068306,0.158065,0.053510,-0.136804,-0.084448
1,1991,10,1,1108,1,364,-0.129112,0.021312,0.159166,-0.072448,...,0.400349,0.065201,0.349188,0.020555,0.281087,0.231422,0.129853,-0.213233,-0.081253,-0.204687
2,2005,3,31,810,1,32,-0.170058,0.052351,-0.013406,0.099001,...,0.225617,-0.004355,0.173353,0.087015,0.106534,0.040950,0.209152,-0.215313,-0.177547,-0.178094
3,2004,9,1,480,1,293,0.250849,0.021555,0.091047,-0.041589,...,0.133304,-0.069995,0.206028,0.089625,0.157605,0.131767,0.244849,-0.321698,-0.198365,-0.208098
4,2005,7,7,2820,1,352,-0.041681,0.038051,-0.051164,-0.076813,...,0.224210,0.049880,0.003623,0.062291,-0.030742,0.130882,0.295086,-0.061550,-0.244197,-0.272161
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23058,1997,8,1,7,1,120,0.007497,0.000220,0.019723,-0.003321,...,0.352087,-0.003458,0.148963,0.063023,0.207720,0.070757,0.372283,-0.202811,-0.110761,-0.332954
23059,2005,6,1,1603,1,32,-0.024484,0.000467,-0.015977,0.086630,...,0.036516,-0.076491,0.343184,-0.041826,0.242226,0.140699,0.104105,-0.140699,-0.004226,-0.291587
23060,1989,2,15,3220,1,132,-0.099309,-0.046230,-0.033294,0.242591,...,0.359741,-0.077176,0.297625,0.172478,0.149067,-0.003060,0.270723,-0.324030,-0.264965,-0.269051
23061,1998,4,21,2550,1,136,-0.038388,0.065679,-0.159324,-0.048682,...,0.266749,-0.052155,0.195081,0.126226,0.097224,0.052020,0.191786,-0.234276,-0.214879,-0.224103


In [34]:
# MODEL 1: Discretise all data, then use Categorical Naive Bayes

# Assuming 'X' is your feature matrix and 'y' is your target variable
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Select categorical and continuous features
categorical_features = ['Publisher', 'Language']  # Add your categorical feature column names here
continuous_features = ['con_feature1', 'con_feature2']  # Add your continuous feature column names here

# Create and train the Categorical Naive Bayes classifier
nb_classifier = CategoricalNB()
nb_classifier.fit(X_train[categorical_features], y_train)

# Predict on the test set
y_pred = nb_classifier.predict(X_test[categorical_features])

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

combined = pd.concat([pd.Series(y_pred), y_test.reset_index()], axis=1)
combined

Accuracy: 0.7045306741816605


Unnamed: 0,0,index,rating_label
0,4.0,247,4.0
1,4.0,12038,3.0
2,4.0,1580,4.0
3,3.0,3128,4.0
4,4.0,20531,4.0
...,...,...,...
4608,4.0,12586,3.0
4609,4.0,7683,4.0
4610,4.0,12912,4.0
4611,4.0,9134,4.0


In [49]:
# Load your dataset into a pandas DataFrame

# Select the continuous features you want to discretize (everything except for language, publisher and rating_label features)
continuous_features = [col for col in X.columns if not col.startswith('Language') or not col.startswith('Publisher') or not col.startswith('rating_label')]

# Discretize the continuous features using equal-weight binning
discretizer = KBinsDiscretizer(n_bins=3, encode='ordinal', strategy='uniform')
X_train_continuous = X_train[continuous_features]
X_test_continuous = X_test[continuous_features]
X_train_continuous_discretized = discretizer.fit_transform(X_train_continuous)
X_test_continuous_discretized = discretizer.transform(X_test_continuous)

# Replace the original continuous features with the discretized values
X_train_discretized = X_train.copy()
X_test_discretized = X_test.copy()
X_train_discretized[continuous_features] = X_train_continuous_discretized
X_test_discretized[continuous_features] = X_test_continuous_discretized

# Concatenate the categorical and discretized continuous features
X_train_combined = pd.concat([X_train_discretized[categorical_features], X_train_discretized[continuous_features]], axis=1)
X_test_combined = pd.concat([X_test_discretized[categorical_features], X_test_discretized[continuous_features]], axis=1)

# Create and train the Categorical Naive Bayes classifier
nb_classifier = CategoricalNB()
nb_classifier.fit(X_train_combined, y_train)

# Predict on the test set
y_pred = nb_classifier.predict(X_test_combined)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

combined = pd.concat([pd.Series(y_pred), y_test.reset_index()], axis=1)
print(pd.Series(y_pred).value_counts())
print(pd.Series(y_test).value_counts())

Accuracy: 0.6516366789507912
4.0    3879
3.0     418
5.0     316
dtype: int64
4.0    3281
3.0    1136
5.0     196
Name: rating_label, dtype: int64


In [None]:
#TODO: CROSS VALIDATION, FEATURE SELECTION ON NB MODEL