In [16]:
import pandas as pd
import numpy as np
from sklearn.feature_selection import SelectKBest, mutual_info_classif
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [17]:
def preprocess_csv(file_path):
    # Read the CSV file into a pandas dataframe
    df = pd.read_csv(file_path)
    
    # Handling missing values
    # Replace any missing values (NaN) with appropriate strategies
    # For numerical columns, you can use methods like mean, median, or interpolation
    df.fillna(df.mean(), inplace=True)  # Example: Replace missing values with column means
    
    # For categorical columns, you can use methods like mode or a constant value
    df.fillna('Unknown', inplace=True)  # Example: Replace missing values with 'Unknown'
    
    # Process or remodel the description columns
    # Depending on your specific use case, you can apply techniques like text preprocessing, feature extraction, or encoding
    
    # Text preprocessing: Remove special characters, convert to lowercase, etc.
    #df['description'] = df['description'].str.replace('[^\w\s]', '').str.lower()
    
    # Feature extraction: Extract relevant information from the description
    # You can use techniques like TF-IDF, word embeddings, or topic modeling to extract features
    
    # Encoding: Convert categorical description columns into numeric representations
    # Techniques like one-hot encoding or word embeddings can be useful
    
    # Return the preprocessed dataframe
    return df

In [18]:
def split_dataset(df, test_size):
    # Separate the features and target variable
    X = df.drop('target', axis=1)
    y = df['target']
    
    # Split the dataset into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=42)
    
    # Return the split datasets
    return X_train, X_test, y_train, y_test


In [19]:
def get_accuracy(test_df, results_series):
    successful_count = 0
    for test, result in zip(test_df.iterrows(), results_series.iteritems()):
        if test[-1][-1] == result[1]:
            successful_count += 1
    if (len(test_df) != 0):
        return successful_count / (len(test_df))
    return 0

In [20]:
def get_precision(test_df, results_series, positive_label):
    tp_count = 0
    fp_count = 0
    for test, result in zip(test_df.iterrows(), results_series.iteritems()):
        if result[1] == positive_label:
            if test[-1][-1] == result[1]:
                tp_count += 1
            else:
                fp_count += 1
    if (tp_count + fp_count != 0):
        return tp_count / (tp_count + fp_count)
    return 0

In [21]:
def get_recall(test_df, results_series, positive_label):
    tp_count = 0
    fn_count = 0
    for test, result in zip(test_df.iterrows(), results_series.iteritems()):
        if test[-1][-1] == positive_label:
            if test[-1][-1] == result[1]:
                tp_count += 1
            else:
                fn_count += 1
    if (tp_count + fn_count != 0):
        return tp_count / (tp_count + fn_count)
    return 0

In [22]:
def add_letter_to_cols(df, letter):
    for column in df.columns:
        new_name = letter + str(column)
        df.rename(columns={column: new_name}, inplace=True)

In [23]:
df = preprocess_csv('project_data_files/book_rating_train.csv')

  df.fillna(df.mean(), inplace=True)  # Example: Replace missing values with column means


In [24]:
book_name_features = pd.read_csv(r"project_data_files/book_text_features_doc2vec/train_name_doc2vec100.csv", index_col = False, delimiter = ',', header=None)
add_letter_to_cols(book_name_features, "n")

book_desc_features = pd.read_csv(r"project_data_files/book_text_features_doc2vec/train_desc_doc2vec100.csv", index_col = False, delimiter = ',', header=None)
add_letter_to_cols(book_desc_features, "d")

book_auth_features = pd.read_csv(r"project_data_files/book_text_features_doc2vec/train_authors_doc2vec20.csv", index_col = False, delimiter = ',', header=None)
add_letter_to_cols(book_auth_features, "a")

In [27]:
# Perform label encoding for publishers, language
label_encoder = LabelEncoder()
#combined_df['Publisher'] = label_encoder.fit_transform(combined_df['Publisher'])
#combined_df['Language'] = label_encoder.fit_transform(combined_df['Language'])

combined_df = pd.concat([book_name_features, book_auth_features, book_desc_features, df['rating_label']], axis=1)
# Separate the feature columns (X) and the target column (y)
X = combined_df.drop('rating_label', axis=1)
y = combined_df['rating_label']

combined_df

Unnamed: 0,n0,n1,n2,n3,n4,n5,n6,n7,n8,n9,...,d91,d92,d93,d94,d95,d96,d97,d98,d99,rating_label
0,0.052262,-0.263308,0.026872,0.128574,-0.161565,-0.127520,0.249588,0.037621,-0.074043,0.072854,...,1.096503,0.894538,-0.386222,1.000658,1.094646,-0.897948,0.256250,-0.743381,-0.046537,4.0
1,-0.129112,0.021312,0.159166,-0.072448,0.036028,-0.093721,0.129199,0.069736,-0.253263,-0.066424,...,2.018345,-0.515164,0.510041,1.042953,0.034085,0.397630,0.180119,-0.133072,1.251777,4.0
2,-0.170058,0.052351,-0.013406,0.099001,0.083173,-0.161439,0.048635,0.089419,-0.072266,-0.063164,...,-0.043291,0.166269,0.443516,0.360877,0.637700,-0.399422,-0.217829,0.095041,0.030425,4.0
3,0.250849,0.021555,0.091047,-0.041589,-0.040949,0.240260,0.415056,0.027029,-0.172413,-0.135485,...,-1.064901,0.956356,0.537667,-1.156633,1.138308,0.287945,0.809811,-1.180691,-0.075178,4.0
4,-0.041681,0.038051,-0.051164,-0.076813,0.096855,-0.215943,0.152729,0.267636,-0.079954,-0.065560,...,-0.730932,-0.893566,0.982820,0.190981,0.605344,0.236092,0.653281,-0.581590,-0.850868,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23058,0.007497,0.000220,0.019723,-0.003321,0.021097,-0.129420,0.130302,-0.037361,-0.004281,-0.255112,...,2.021390,0.418629,-0.371224,0.595000,0.869552,-3.437345,1.491958,2.093727,1.478695,4.0
23059,-0.024484,0.000467,-0.015977,0.086630,0.082127,-0.174537,0.011694,0.111608,-0.106961,-0.147956,...,0.234319,0.114523,0.223425,0.818674,0.719629,-1.334342,-1.144812,-0.270687,-1.546596,4.0
23060,-0.099309,-0.046230,-0.033294,0.242591,-0.055477,-0.033886,0.026869,0.038410,-0.126636,0.127742,...,-0.308627,-0.630947,-0.264485,0.316840,0.305589,-0.123598,-0.424452,-1.336598,0.163445,4.0
23061,-0.038388,0.065679,-0.159324,-0.048682,0.054175,0.317751,0.065931,-0.126021,-0.105057,-0.147185,...,-0.085357,-0.113327,1.173376,1.244604,1.042439,-0.130578,0.552256,1.143148,-0.685621,4.0


In [41]:
# Load your dataset into a pandas DataFrame

# Select the continuous features you want to discretize
continuous_features = list(combined_df.drop('rating_label', axis=1).columns)
print(continuous_features)

# Discretize the continuous features using equal width binning
n_bins = 5  # Number of bins
discretizer = KBinsDiscretizer(n_bins=n_bins, encode='ordinal', strategy='uniform')
discretized_data = discretizer.fit_transform(combined_df[continuous_features])

# Create a new DataFrame with discretized features
df_discretized = pd.DataFrame(discretized_data, columns=continuous_features)

# Perform feature selection using mutual information
target_variable = 'rating_label'  # Your target variable
X = df_discretized  # Features
y = combined_df[target_variable]  # Target variable

selector = SelectKBest(score_func=mutual_info_classif, k='all')
selected_features = selector.fit_transform(X, y)
selected_feature_indices = selector.get_support(indices=True)
selected_feature_names = [X.columns[idx] for idx in selected_feature_indices]
selected_feature_scores = selector.scores_[selected_feature_indices]

# Sort features by score in descending order
sorted_features = sorted(zip(selected_feature_names, selected_feature_scores),
                         key=lambda x: x[1], reverse=True)

# Print the selected feature names and their mutual information scores in order
print("Selected Features and Mutual Information Scores (in descending order):")
for feature, score in sorted_features:
    print(f"{feature}: {score}")


['n0', 'n1', 'n2', 'n3', 'n4', 'n5', 'n6', 'n7', 'n8', 'n9', 'n10', 'n11', 'n12', 'n13', 'n14', 'n15', 'n16', 'n17', 'n18', 'n19', 'n20', 'n21', 'n22', 'n23', 'n24', 'n25', 'n26', 'n27', 'n28', 'n29', 'n30', 'n31', 'n32', 'n33', 'n34', 'n35', 'n36', 'n37', 'n38', 'n39', 'n40', 'n41', 'n42', 'n43', 'n44', 'n45', 'n46', 'n47', 'n48', 'n49', 'n50', 'n51', 'n52', 'n53', 'n54', 'n55', 'n56', 'n57', 'n58', 'n59', 'n60', 'n61', 'n62', 'n63', 'n64', 'n65', 'n66', 'n67', 'n68', 'n69', 'n70', 'n71', 'n72', 'n73', 'n74', 'n75', 'n76', 'n77', 'n78', 'n79', 'n80', 'n81', 'n82', 'n83', 'n84', 'n85', 'n86', 'n87', 'n88', 'n89', 'n90', 'n91', 'n92', 'n93', 'n94', 'n95', 'n96', 'n97', 'n98', 'n99', 'a0', 'a1', 'a2', 'a3', 'a4', 'a5', 'a6', 'a7', 'a8', 'a9', 'a10', 'a11', 'a12', 'a13', 'a14', 'a15', 'a16', 'a17', 'a18', 'a19', 'd0', 'd1', 'd2', 'd3', 'd4', 'd5', 'd6', 'd7', 'd8', 'd9', 'd10', 'd11', 'd12', 'd13', 'd14', 'd15', 'd16', 'd17', 'd18', 'd19', 'd20', 'd21', 'd22', 'd23', 'd24', 'd25', 'd26', 

In [42]:
# Calculate averages for each letter-group
averages = {}
for feature, score in sorted_features:
    letter_group = feature[0]  # Get the first character of the feature name
    if letter_group not in averages:
        averages[letter_group] = []
    averages[letter_group].append(score)

# Compute the average for each letter-group
for letter_group, scores in averages.items():
    average_score = sum(scores) / len(scores)
    print(f"Average score for {letter_group}-features: {average_score}")


Average score for d-features: 0.0028322501282761745
Average score for n-features: 0.002405192864826591
Average score for a-features: 0.002568532555710723
