In [1]:
import pandas as pd

# Load the datasets with only the required columns
df1 = pd.read_csv('C:/Users/Guest01/Documents/dataset_anime/archive/final_animedataset.csv', usecols=['user_id', 'anime_id', 'my_score', 'score', 'scored_by', 'gender', 'popularity'])
df2 = pd.read_csv('C:/Users/Guest01/Documents/dataset_anime/archive/users-score-2023.csv', usecols=['user_id', 'anime_id', 'Anime Title', 'rating'])

# Merge the datasets on user_id and anime_id
final_df = pd.merge(df2, df1, on=['user_id', 'anime_id'])

# Display the first few rows to check the data
print(final_df.head())


   user_id  anime_id             Anime Title  rating  my_score gender  score  \
0        1        21               One Piece       9         9   Male   8.54   
1        1        48             .hack//Sign       7         7   Male   7.09   
2        1       320                  A Kite       5         5   Male   6.66   
3        1        49        Aa! Megami-sama!       8         8   Male   7.38   
4        1       304  Aa! Megami-sama! Movie       8         8   Male   7.63   

   scored_by  popularity  
0     423868          35  
1      61485         650  
2      18934        1946  
3      20930        1807  
4      18571        2007  


In [2]:
# Frequency encoding for categorical columns
categorical_columns = ['gender', 'Anime Title']

for col in categorical_columns:
    freq = final_df[col].value_counts()
    final_df[col] = final_df[col].map(freq)

# Handle missing values by filling with median for numerical columns
numerical_columns = ['my_score', 'score', 'scored_by', 'popularity']
for col in numerical_columns:
    final_df[col].fillna(final_df[col].median(), inplace=True)

# Display the first few rows after preprocessing
print(final_df.head())


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  final_df[col].fillna(final_df[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  final_df[col].fillna(final_df[col].median(), inplace=True)


   user_id  anime_id  Anime Title  rating  my_score   gender  score  \
0        1        21        10462       9         9  4175028   8.54   
1        1        48         5341       7         7  4175028   7.09   
2        1       320         1892       5         5  4175028   6.66   
3        1        49         2448       8         8  4175028   7.38   
4        1       304         2395       8         8  4175028   7.63   

   scored_by  popularity  
0     423868          35  
1      61485         650  
2      18934        1946  
3      20930        1807  
4      18571        2007  


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  final_df[col].fillna(final_df[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  final_df[col].fillna(final_df[col].median(), inplace=True)


In [3]:
# Define features (X) and target (y)
X = final_df.drop(columns=['rating'])  # All columns except 'rating' are features
y = final_df['rating']  # 'rating' is the target variable

# Display the shapes of X and y to confirm
print(f'X shape: {X.shape}, y shape: {y.shape}')


X shape: (6021419, 8), y shape: (6021419,)


In [4]:
from sklearn.model_selection import train_test_split

# Function to sample data
def sample_data(X, y, sample_size):
    if isinstance(sample_size, float):
        if 0 < sample_size < 1.0:
            return train_test_split(X, y, test_size=0.2, train_size=sample_size, random_state=42)
        elif sample_size == 1.0:
            return train_test_split(X, y, test_size=0.001, random_state=42)
        else:
            raise ValueError("sample_size as float must be in the range (0.0, 1.0) or equal to 1.0.")
    elif isinstance(sample_size, int):
        if sample_size > len(X):
            raise ValueError(f"sample_size {sample_size} exceeds the number of available samples {len(X)}.")
        sampled_X = X.sample(n=sample_size, random_state=42)
        sampled_y = y.loc[sampled_X.index]
        return train_test_split(sampled_X, sampled_y, test_size=0.2, random_state=42)
    else:
        raise ValueError("sample_size must be a float or an integer")


In [5]:
from lightgbm import LGBMRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error, r2_score
import numpy as np
import time
import psutil

# Function to calculate and return metrics
def calculate_metrics(X_train, X_test, y_train, y_test):
    lgbm = LGBMRegressor()

    # Define enhanced hyperparameters for RandomizedSearchCV
    param_distributions = {
        'n_estimators': [30, 50],  # Number of boosting stages
        'learning_rate': [0.05, 0.1],  # Step size shrinkage
        'max_depth': [3, 5, 7, 10, 15, 20],  # Maximum depth of individual trees
        'min_child_samples': [20, 30, 40],  # Minimum number of samples in a leaf
        'subsample': [0.8, 0.9, 1.0],  # Fraction of samples used for training
        'colsample_bytree': [0.8, 0.9, 1.0],  # Fraction of features used per tree
        'force_col_wise': [True],  # Force data to be stored column-wise
        'reg_alpha': [0, 0.01, 0.1, 1, 10],  # L1 regularization term
        'reg_lambda': [0, 0.01, 0.1, 1, 10]  # L2 regularization term
    }

    random_search = RandomizedSearchCV(
        lgbm,
        param_distributions,
        n_iter=10,
        cv=5,
        scoring='neg_mean_squared_error',
        random_state=42,
        n_jobs=-1
    )

    start_time = time.time()
    start_cpu = psutil.cpu_percent(interval=None)
    random_search.fit(X_train, y_train)
    end_cpu = psutil.cpu_percent(interval=None)
    end_time = time.time()

    # Calculate time and CPU usage
    execution_time = end_time - start_time
    avg_cpu_usage = (start_cpu + end_cpu) / 2

    y_pred = random_search.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    mape = mean_absolute_percentage_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)

    # Calculate the range of the target variable
    target_range = y_train.max() - y_train.min()

    # Calculate normalized RMSE (nRMSE)
    nrmse = rmse / target_range
    
    memory_usage_MB = X_train.memory_usage(deep=True).sum() / (1024 ** 2)
    normalized_time = execution_time / memory_usage_MB
    
    return {
        'RMSE': rmse,
        'MAPE': mape,
        'R2': r2,
        'MSE': mse,
        'nRMSE': nrmse,  # Normalized RMSE
        'Execution Time (Raw)': execution_time,  # Raw execution time
        'Normalized Time (s/MB)': normalized_time,  # Normalized execution time
        'Average CPU Usage': avg_cpu_usage
    }


In [6]:
# Define sample sizes
sample_sizes = [1.0, 0.5, 0.25, 0.125, 100, 1000, 10000, 100000]

# Initialize metrics storage
metrics_list = []
total_execution_time = 0
total_cpu_usage = 0
total_memory_usage_MB = 0

# Loop through each sample size
for size in sample_sizes:
    try:
        X_train_sample, X_test_sample, y_train_sample, y_test_sample = sample_data(X, y, size)
        metrics = calculate_metrics(X_train_sample, X_test_sample, y_train_sample, y_test_sample)
        metrics['Sample Size'] = size
        metrics_list.append(metrics)

        # Accumulate total       
        total_execution_time += metrics['Execution Time (Raw)']
        total_cpu_usage += metrics['Average CPU Usage']
        total_memory_usage_MB += X_train_sample.memory_usage(deep=True).sum() / (1024 ** 2)

        print(f"Metrics for sample size {size}:")
        for key, value in metrics.items():
            print(f"{key}: {value}")
        print("-" * 50)

    except Exception as e:
        print(f"An error occurred for sample size {size}: {e}")


[LightGBM] [Info] Total Bins 1521
[LightGBM] [Info] Number of data points in the train set: 6015397, number of used features: 8
[LightGBM] [Info] Start training from score 7.619075
Metrics for sample size 1.0:
RMSE: 0.2521050413009392
MAPE: 0.010311401305564119
R2: 0.9771308953093655
MSE: 0.06355695184934826
nRMSE: 0.028011671255659912
Execution Time (Raw): 233.95305609703064
Normalized Time (s/MB): 0.566411216061236
Average CPU Usage: 57.6
Sample Size: 1.0
--------------------------------------------------
[LightGBM] [Info] Total Bins 1522
[LightGBM] [Info] Number of data points in the train set: 3010709, number of used features: 8
[LightGBM] [Info] Start training from score 7.618789
Metrics for sample size 0.5:
RMSE: 0.26910562361830304
MAPE: 0.01141358321268742
R2: 0.9739251288579146
MSE: 0.07241783666299577
nRMSE: 0.029900624846478114
Execution Time (Raw): 117.68722796440125
Normalized Time (s/MB): 0.569282678146207
Average CPU Usage: 58.75
Sample Size: 0.5
------------------------

In [7]:
# Convert metrics to DataFrame
metrics_df = pd.DataFrame(metrics_list)

# Calculate total metrics
total_avg_cpu_usage = total_cpu_usage / len(sample_sizes)
normalized_total_time = total_execution_time / total_memory_usage_MB

# Convert total execution time to minutes and seconds
total_minutes = int(total_execution_time // 60)
total_seconds = total_execution_time % 60

# Display total metrics
print(f"Total Execution Time for Entire Process (Raw): {total_minutes} minutes and {total_seconds:.2f} seconds")
print(f"Total Normalized Execution Time for Entire Process: {normalized_total_time:.8f} seconds per MB")
print(f"Total Average CPU Usage for Entire Process: {total_avg_cpu_usage:.2f}%")

# Display the metrics DataFrame
metrics_df


Total Execution Time for Entire Process (Raw): 7 minutes and 29.80 seconds
Total Normalized Execution Time for Entire Process: 0.57598711 seconds per MB
Total Average CPU Usage for Entire Process: 64.75%


Unnamed: 0,RMSE,MAPE,R2,MSE,nRMSE,Execution Time (Raw),Normalized Time (s/MB),Average CPU Usage,Sample Size
0,0.252105,0.010311,0.977131,0.063557,0.028012,233.953056,0.566411,57.6,1.0
1,0.269106,0.011414,0.973925,0.072418,0.029901,117.687228,0.569283,58.75,0.5
2,0.269354,0.01147,0.973877,0.072551,0.029928,57.906301,0.560215,65.95,0.25
3,0.269642,0.011548,0.973821,0.072707,0.02996,32.376473,0.626453,65.1,0.125
4,1.223396,0.179537,0.656721,1.496697,0.152924,0.410145,74.664583,71.8,100.0
5,0.277351,0.022756,0.971765,0.076924,0.030817,0.623604,11.352365,68.3,1000.0
6,0.316749,0.013647,0.966164,0.10033,0.035194,1.301034,2.36846,64.75,10000.0
7,0.263287,0.011603,0.975146,0.06932,0.029254,5.543799,1.009218,65.75,100000.0


another iteration

In [11]:
from lightgbm import LGBMRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error, r2_score
import numpy as np
import time
import psutil

# Function to calculate and return metrics
def calculate_metrics(X_train, X_test, y_train, y_test):
    lgbm = LGBMRegressor()

    # Define enhanced hyperparameters for RandomizedSearchCV
    param_distributions = {
        'n_estimators': [30, 50],  # Number of boosting stages
        'learning_rate': [0.05, 0.1],  # Step size shrinkage
        'max_depth': [3, 5, 7, 10, 15, 20, 30],  # Maximum depth of individual trees
        'min_child_samples': [20, 30, 40, 50],  # Minimum number of samples in a leaf
        'subsample': [0.8, 0.9, 1.0],  # Fraction of samples used for training
        'colsample_bytree': [0.8, 0.9, 1.0],  # Fraction of features used per tree
        'force_col_wise': [True],  # Force data to be stored column-wise
        'reg_alpha': [0, 0.01, 0.1, 1, 10],  # L1 regularization term
        'reg_lambda': [0, 0.01, 0.1, 1, 10]  # L2 regularization term
    }

    random_search = RandomizedSearchCV(
        lgbm,
        param_distributions,
        n_iter=10,
        cv=5,
        scoring='neg_mean_squared_error',
        random_state=42,
        n_jobs=-1
    )

    start_time = time.time()
    start_cpu = psutil.cpu_percent(interval=None)
    random_search.fit(X_train, y_train)
    end_cpu = psutil.cpu_percent(interval=None)
    end_time = time.time()

    # Calculate time and CPU usage
    execution_time = end_time - start_time
    avg_cpu_usage = (start_cpu + end_cpu) / 2

    y_pred = random_search.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    mape = mean_absolute_percentage_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)

    # Calculate the range of the target variable
    target_range = y_train.max() - y_train.min()

    # Calculate normalized RMSE (nRMSE)
    nrmse = rmse / target_range
    
    memory_usage_MB = X_train.memory_usage(deep=True).sum() / (1024 ** 2)
    normalized_time = execution_time / memory_usage_MB
    
    return {
        'RMSE': rmse,
        'MAPE': mape,
        'R2': r2,
        'MSE': mse,
        'nRMSE': nrmse,  # Normalized RMSE
        'Execution Time (Raw)': execution_time,  # Raw execution time
        'Normalized Time (s/MB)': normalized_time,  # Normalized execution time
        'Average CPU Usage': avg_cpu_usage
    }


In [12]:
# Define sample sizes
sample_sizes = [1.0, 0.5, 0.25, 0.125, 100, 1000, 10000, 100000]

# Initialize metrics storage
metrics_list = []
total_execution_time = 0
total_cpu_usage = 0
total_memory_usage_MB = 0

# Loop through each sample size
for size in sample_sizes:
    try:
        X_train_sample, X_test_sample, y_train_sample, y_test_sample = sample_data(X, y, size)
        metrics = calculate_metrics(X_train_sample, X_test_sample, y_train_sample, y_test_sample)
        metrics['Sample Size'] = size
        metrics_list.append(metrics)

        # Accumulate total       
        total_execution_time += metrics['Execution Time (Raw)']
        total_cpu_usage += metrics['Average CPU Usage']
        total_memory_usage_MB += X_train_sample.memory_usage(deep=True).sum() / (1024 ** 2)

        print(f"Metrics for sample size {size}:")
        for key, value in metrics.items():
            print(f"{key}: {value}")
        print("-" * 50)

    except Exception as e:
        print(f"An error occurred for sample size {size}: {e}")


[LightGBM] [Info] Total Bins 1521
[LightGBM] [Info] Number of data points in the train set: 6015397, number of used features: 8
[LightGBM] [Info] Start training from score 7.619075
Metrics for sample size 1.0:
RMSE: 0.24837468284654893
MAPE: 0.008203456422378647
R2: 0.97780266925412
MSE: 0.06168998307912377
nRMSE: 0.02759718698294988
Execution Time (Raw): 242.77227187156677
Normalized Time (s/MB): 0.5877629471088941
Average CPU Usage: 50.599999999999994
Sample Size: 1.0
--------------------------------------------------
[LightGBM] [Info] Total Bins 1522
[LightGBM] [Info] Number of data points in the train set: 3010709, number of used features: 8
[LightGBM] [Info] Start training from score 7.618789
Metrics for sample size 0.5:
RMSE: 0.26749505980621047
MAPE: 0.009344220290541149
R2: 0.9742363046344593
MSE: 0.0715536070207281
nRMSE: 0.029721673311801162
Execution Time (Raw): 133.92271041870117
Normalized Time (s/MB): 0.6478177842273476
Average CPU Usage: 65.35
Sample Size: 0.5
----------

In [13]:
# Convert metrics to DataFrame
metrics_df = pd.DataFrame(metrics_list)

# Calculate total metrics
total_avg_cpu_usage = total_cpu_usage / len(sample_sizes)
normalized_total_time = total_execution_time / total_memory_usage_MB

# Convert total execution time to minutes and seconds
total_minutes = int(total_execution_time // 60)
total_seconds = total_execution_time % 60

# Display total metrics
print(f"Total Execution Time for Entire Process (Raw): {total_minutes} minutes and {total_seconds:.2f} seconds")
print(f"Total Normalized Execution Time for Entire Process: {normalized_total_time:.8f} seconds per MB")
print(f"Total Average CPU Usage for Entire Process: {total_avg_cpu_usage:.2f}%")

# Display the metrics DataFrame
metrics_df


Total Execution Time for Entire Process (Raw): 8 minutes and 30.33 seconds
Total Normalized Execution Time for Entire Process: 0.65350010 seconds per MB
Total Average CPU Usage for Entire Process: 69.64%


Unnamed: 0,RMSE,MAPE,R2,MSE,nRMSE,Execution Time (Raw),Normalized Time (s/MB),Average CPU Usage,Sample Size
0,0.248375,0.008203,0.977803,0.06169,0.027597,242.772272,0.587763,50.6,1.0
1,0.267495,0.009344,0.974236,0.071554,0.029722,133.92271,0.647818,65.35,0.5
2,0.267937,0.009336,0.974151,0.07179,0.029771,79.459627,0.768733,70.3,0.25
3,0.268451,0.009383,0.974052,0.072066,0.029828,42.700078,0.826204,77.8,0.125
4,1.277463,0.188066,0.625709,1.631911,0.159683,0.335309,61.041102,75.3,100.0
5,0.291526,0.020122,0.968805,0.084987,0.032392,0.826376,15.043711,72.95,1000.0
6,0.316882,0.010936,0.966136,0.100414,0.035209,1.880757,3.423814,76.1,10000.0
7,0.264041,0.011765,0.975004,0.069718,0.029338,8.436198,1.535763,68.7,100000.0
