In [1]:
import pandas as pd

# Function to downcast numeric columns
def downcast(df):
    for col in df.select_dtypes(include=['int']).columns:
        df.loc[:, col] = pd.to_numeric(df[col], downcast='integer')
    for col in df.select_dtypes(include=['float']).columns:
        df.loc[:, col] = pd.to_numeric(df[col], downcast='float')
    return df

# Load the datasets with only the required columns
df1 = pd.read_csv('C:/Users/Guest01/Documents/dataset_anime/archive/final_animedataset.csv', usecols=['user_id', 'anime_id', 'my_score', 'score', 'scored_by', 'gender'])
df2 = pd.read_csv('C:/Users/Guest01/Documents/dataset_anime/archive/users-score-2023.csv', usecols=['user_id', 'anime_id', 'Anime Title', 'rating'])

# Merge the datasets on user_id and anime_id
merged_df = pd.merge(df2, df1, on=['user_id', 'anime_id'])

# Select the necessary columns for the final DataFrame
final_df = merged_df[['user_id', 'anime_id', 'my_score', 'score', 'scored_by', 'gender', 'Anime Title', 'rating']]

# Downcast numeric columns
final_df = downcast(final_df)

# Display the first few rows to check the data
print(final_df.head())


   user_id  anime_id  my_score  score  scored_by gender  \
0        1        21         9   8.54     423868   Male   
1        1        48         7   7.09      61485   Male   
2        1       320         5   6.66      18934   Male   
3        1        49         8   7.38      20930   Male   
4        1       304         8   7.63      18571   Male   

              Anime Title  rating  
0               One Piece       9  
1             .hack//Sign       7  
2                  A Kite       5  
3        Aa! Megami-sama!       8  
4  Aa! Megami-sama! Movie       8  


In [3]:
from category_encoders import TargetEncoder   

# Handle missing values - example with 'score'
final_df['score'].fillna(final_df['score'].mean(), inplace=True)

# Define the target variable and categorical columns
target_column = 'rating'
categorical_columns = ['gender', 'Anime Title']

# Create and fit the target encoder
target_encoder = TargetEncoder(cols=categorical_columns)
final_df[categorical_columns] = target_encoder.fit_transform(final_df[categorical_columns], final_df[target_column])

# Drop any remaining rows with missing values if necessary
final_df = final_df.dropna()

# Display the first few rows after preprocessing and target encoding
print(final_df.head())


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  final_df['score'].fillna(final_df['score'].mean(), inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_df['score'].fillna(final_df['score'].mean(), inplace=True)


   user_id  anime_id  my_score  score  scored_by    gender  Anime Title  \
0        1        21         9   8.54     423868  7.590573     8.596731   
1        1        48         7   7.09      61485  7.590573     7.185359   
2        1       320         5   6.66      18934  7.590573     6.857294   
3        1        49         8   7.38      20930  7.590573     7.515523   
4        1       304         8   7.63      18571  7.590573     7.732777   

   rating  
0       9  
1       7  
2       5  
3       8  
4       8  


In [4]:
# Define features (X) and target (y)
X = final_df.drop(columns=[target_column])
y = final_df[target_column]

# Display the shapes of X and y to confirm
print(f'X shape: {X.shape}, y shape: {y.shape}')


X shape: (6021419, 7), y shape: (6021419,)


In [5]:
from sklearn.model_selection import train_test_split

# Function to sample data
def sample_data(X, y, sample_size):
    if isinstance(sample_size, float):
        if 0 < sample_size < 1.0:
            return train_test_split(X, y, test_size=0.2, train_size=sample_size, random_state=42)
        elif sample_size == 1.0:
            return train_test_split(X, y, test_size=0.001, random_state=42)
        else:
            raise ValueError("sample_size as float must be in the range (0.0, 1.0) or equal to 1.0.")
    elif isinstance(sample_size, int):
        if sample_size > len(X):
            raise ValueError(f"sample_size {sample_size} exceeds the number of available samples {len(X)}.")
        sampled_X = X.sample(n=sample_size, random_state=42)
        sampled_y = y.loc[sampled_X.index]
        return train_test_split(sampled_X, sampled_y, test_size=0.2, random_state=42)
    else:
        raise ValueError("sample_size must be a float or an integer")


In [16]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error, r2_score
import numpy as np
import time
import psutil

# Function to calculate and return metrics
def calculate_metrics(X_train, X_test, y_train, y_test):
    lr = LinearRegression()

    # Define hyperparameters for RandomizedSearchCV
    param_distributions = {
        'fit_intercept': [True, False]
    }

    random_search = RandomizedSearchCV(lr, param_distributions, n_iter=2, cv=10, scoring='neg_mean_squared_error', random_state=42, n_jobs=-1)

    start_time = time.time()
    start_cpu = psutil.cpu_percent(interval=None)
    random_search.fit(X_train, y_train)
    end_cpu = psutil.cpu_percent(interval=None)
    end_time = time.time()

    # Calculate time and CPU usage
    execution_time = end_time - start_time
    avg_cpu_usage = (start_cpu + end_cpu) / 2

    y_pred = random_search.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    mape = mean_absolute_percentage_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)

    # Calculate the range of the target variable
    target_range = y_train.max() - y_train.min()

    # Calculate normalized RMSE (nRMSE)
    nrmse = rmse / target_range
    
    memory_usage_MB = X_train.memory_usage(deep=True).sum() / (1024 ** 2)
    normalized_time = execution_time / memory_usage_MB
    
    return {
        'RMSE': rmse,
        'MAPE': mape,
        'R2': r2,
        'MSE': mse,
        'nRMSE': nrmse,  # Normalized RMSE
        'Execution Time (Raw)': execution_time,  # Raw execution time
        'Normalized Time (s/MB)': normalized_time,  # Normalized execution time
        'Average CPU Usage': avg_cpu_usage
    }


In [17]:
# Define sample sizes
sample_sizes = [1.0, 0.5, 0.25, 0.125, 100, 1000, 10000, 100000]

# Initialize metrics storage
metrics_list = []
total_execution_time = 0
total_cpu_usage = 0
total_memory_usage_MB = 0

# Loop through each sample size
for size in sample_sizes:
    try:
        X_train_sample, X_test_sample, y_train_sample, y_test_sample = sample_data(X, y, size)
        metrics = calculate_metrics(X_train_sample, X_test_sample, y_train_sample, y_test_sample)
        metrics['Sample Size'] = size
        metrics_list.append(metrics)

        # Accumulate total       
        total_execution_time += metrics['Execution Time (Raw)']
        total_cpu_usage += metrics['Average CPU Usage']
        total_memory_usage_MB += X_train_sample.memory_usage(deep=True).sum() / (1024 ** 2)

        print(f"Metrics for sample size {size}:")
        for key, value in metrics.items():
            print(f"{key}: {value}")
        print("-" * 50)

    except Exception as e:
        print(f"An error occurred for sample size {size}: {e}")


Metrics for sample size 1.0:
RMSE: 0.7505799728330512
MAPE: 0.05866523534479733
R2: 0.7972877254934723
MSE: 0.5633702956180638
nRMSE: 0.08339777475922791
Execution Time (Raw): 16.839823246002197
Normalized Time (s/MB): 0.045866243585003616
Average CPU Usage: 53.400000000000006
Sample Size: 1.0
--------------------------------------------------
Metrics for sample size 0.5:
RMSE: 0.7313924397731141
MAPE: 0.059349117034842004
R2: 0.8073905648856209
MSE: 0.5349349009572684
nRMSE: 0.08126582664145712
Execution Time (Raw): 7.707000255584717
Normalized Time (s/MB): 0.04194078278156408
Average CPU Usage: 52.6
Sample Size: 0.5
--------------------------------------------------
Metrics for sample size 0.25:
RMSE: 0.73139176284671
MAPE: 0.05953974862172301
R2: 0.8073909214174767
MSE: 0.5349339107600182
nRMSE: 0.08126575142741223
Execution Time (Raw): 3.9185242652893066
Normalized Time (s/MB): 0.042648507635081184
Average CPU Usage: 55.6
Sample Size: 0.25
------------------------------------------

In [18]:
# Convert metrics to DataFrame
metrics_df = pd.DataFrame(metrics_list)

# Calculate total metrics
total_avg_cpu_usage = total_cpu_usage / len(sample_sizes)
normalized_total_time = total_execution_time / total_memory_usage_MB

# Convert total execution time to minutes and seconds
total_minutes = int(total_execution_time // 60)
total_seconds = total_execution_time % 60

# Display total metrics
print(f"Total Execution Time for Entire Process (Raw): {total_minutes} minutes and {total_seconds:.2f} seconds")
print(f"Total Normalized Execution Time for Entire Process: {normalized_total_time:.8f} seconds per MB")
print(f"Total Average CPU Usage for Entire Process: {total_avg_cpu_usage:.2f}%")

# Display the metrics DataFrame
metrics_df


Total Execution Time for Entire Process (Raw): 0 minutes and 31.79 seconds
Total Normalized Execution Time for Entire Process: 0.04579877 seconds per MB
Total Average CPU Usage for Entire Process: 53.36%


Unnamed: 0,RMSE,MAPE,R2,MSE,nRMSE,Execution Time (Raw),Normalized Time (s/MB),Average CPU Usage,Sample Size
0,0.75058,0.058665,0.797288,0.56337,0.083398,16.839823,0.045866,53.4,1.0
1,0.731392,0.059349,0.807391,0.534935,0.081266,7.707,0.041941,52.6,0.5
2,0.731392,0.05954,0.807391,0.534934,0.081266,3.918524,0.042649,55.6,0.25
3,0.731399,0.05979,0.807387,0.534945,0.081267,2.481337,0.054013,71.9,0.125
4,0.02063,0.002194,0.999902,0.000426,0.002579,0.058776,12.037256,53.0,100.0
5,0.605368,0.046371,0.865484,0.366471,0.067263,0.04755,0.973823,33.15,1000.0
6,0.818694,0.072795,0.773958,0.670259,0.090966,0.086307,0.176756,60.3,10000.0
7,0.743373,0.061921,0.801872,0.552603,0.082597,0.652078,0.133546,46.9,100000.0
