In [1]:
import pandas as pd

# Load the dataset
df_sampled = pd.read_csv("C:/Users/Guest01/Documents/Manpreet_thesis/Datasets/Glassdoor_job_reviews2/all_reviews.csv")

# Downcast numerical columns to save memory
def downcast(df):
    float_cols = [c for c in df if df[c].dtype == "float64"]
    int_cols = [c for c in df if df[c].dtype == "int64"]
    df[float_cols] = df[float_cols].astype("float32")
    df[int_cols] = df[int_cols].astype("int32")
    return df

df_sampled = downcast(df_sampled)
df_sampled.head()


KeyboardInterrupt: 

In [2]:
# List of relevant features
relevant_features = [
    'pros', 'cons', 'Recommend', 'Career Opportunities', 'Compensation and Benefits', 
    'Senior Management', 'Work/Life Balance', 'Culture & Values', 'Diversity & Inclusion', 
    'job', 'status'
]

# Fill null values with mode for categorical columns and median for numerical columns
def fill_nulls(df, features):
    for column in features:
        if df[column].dtype == 'object':
            df[column] = df[column].fillna(df[column].mode()[0])
        else:
            df[column] = df[column].fillna(df[column].median())
    return df

df_sampled = fill_nulls(df_sampled, relevant_features + ['rating'])  # Apply to relevant features and 'rating'

# Check for any remaining null values in relevant features
df_sampled[relevant_features + ['rating']].isnull().sum()


pros                         0
cons                         0
Recommend                    0
Career Opportunities         0
Compensation and Benefits    0
Senior Management            0
Work/Life Balance            0
Culture & Values             0
Diversity & Inclusion        0
job                          0
status                       0
rating                       0
dtype: int64

In [3]:
# Frequency encoding for categorical variables
def frequency_encoding(df, columns):
    df_copy = df.copy()  # Create a copy of the DataFrame to avoid the warning
    for column in columns:
        # Generate the frequency encoding (i.e., percentage frequency of each unique value)
        freq = df_copy[column].value_counts() / len(df_copy)
        # Map the original column with its frequency encoding
        df_copy[column] = df_copy[column].map(freq).astype('float32')  # Explicitly cast to float32 to avoid dtype issues
    return df_copy

# Apply frequency encoding to the relevant non-numeric columns
df_sampled = frequency_encoding(df_sampled, relevant_features)

# Downcast again to save memory
df_sampled = downcast(df_sampled)


In [4]:
# Define X and y
X = df_sampled[relevant_features]  # Use only relevant features for X
y = df_sampled['rating']  # Target variable

# Check for non-numeric columns in X
non_numeric_columns_in_X = X.select_dtypes(include=['object', 'category']).columns
print("Non-numeric columns in X:", non_numeric_columns_in_X)

# Print the shapes of X and y
print("Shape of X:", X.shape)
print("Shape of y:", y.shape)

Non-numeric columns in X: Index([], dtype='object')
Shape of X: (9901889, 11)
Shape of y: (9901889,)


In [5]:
from sklearn.model_selection import train_test_split

# Function to sample data
def sample_data(X, y, sample_size):
    if isinstance(sample_size, float):
        if 0 < sample_size < 1.0:
            return train_test_split(X, y, test_size=0.2, train_size=sample_size, random_state=42)
        elif sample_size == 1.0:
            return train_test_split(X, y, test_size=0.2, train_size=None, random_state=42)  # Fix for 1.0 sample size
        else:
            raise ValueError("sample_size as float must be in the range (0.0, 1.0) or equal to 1.0.")
    elif isinstance(sample_size, int):
        if sample_size > len(X):
            raise ValueError(f"sample_size {sample_size} exceeds the number of available samples {len(X)}.")
        sampled_X = X.sample(n=sample_size, random_state=42)
        sampled_y = y.loc[sampled_X.index]
        return train_test_split(sampled_X, sampled_y, test_size=0.2, random_state=42)
    else:
        raise ValueError("sample_size must be a float or an integer")


In [6]:
# from sklearn.ensemble import GradientBoostingRegressor
# from sklearn.model_selection import RandomizedSearchCV
# from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error, r2_score, mean_absolute_error, explained_variance_score, max_error
# import numpy as np
# import time
# import psutil

# # Function to calculate and return metrics for Gradient Boosting Regressor
# def calculate_metrics(X_train, X_test, y_train, y_test):
#     gbr = GradientBoostingRegressor(random_state=42)

#     # Hyperparameter search space
#     param_distributions = {
#         'n_estimators': [50, 100, 200],  # Number of boosting stages
#         'learning_rate': [0.01, 0.05, 0.1, 0.2],  # Learning rate shrinks contribution of each tree
#         'max_depth': [3, 5, 7, 9],  # Maximum depth of the individual regression estimators
#         'min_samples_split': [2, 5, 10],  # Minimum number of samples required to split a node
#         'min_samples_leaf': [1, 2, 4],  # Minimum number of samples required at a leaf node
#         'subsample': [0.7, 0.8, 1.0],  # Fraction of samples used for fitting the individual base learners
#         'max_features': ['sqrt', 'log2', None],  # Number of features to consider for each split
#     }

#     random_search = RandomizedSearchCV(gbr, param_distributions, n_iter=20, cv=5, scoring='neg_mean_squared_error', random_state=42, n_jobs=-1)

#     start_time = time.time()
#     start_cpu = psutil.cpu_percent(interval=None)
#     random_search.fit(X_train, y_train)
#     end_cpu = psutil.cpu_percent(interval=None)
#     end_time = time.time()

#     # Calculate time and CPU usage
#     execution_time = end_time - start_time
#     avg_cpu_usage = (start_cpu + end_cpu) / 2

#     # Make predictions
#     y_pred = random_search.predict(X_test)

#     # Calculate performance metrics
#     rmse = np.sqrt(mean_squared_error(y_test, y_pred))
#     mape = mean_absolute_percentage_error(y_test, y_pred)
#     r2 = r2_score(y_test, y_pred)
#     mse = mean_squared_error(y_test, y_pred)
#     mae = mean_absolute_error(y_test, y_pred)
#     max_err = max_error(y_test, y_pred)
#     explained_var = explained_variance_score(y_test, y_pred)

#     # Calculate the range of the target variable
#     target_range = y_train.max() - y_train.min()

#     # Calculate normalized RMSE (nRMSE)
#     nrmse = rmse / target_range

#     memory_usage_MB = X_train.memory_usage(deep=True).sum() / (1024 ** 2)
#     normalized_time = execution_time / memory_usage_MB
    
#     return {
#         'RMSE': rmse,
#         'nRMSE': nrmse,  # Normalized RMSE
#         'MAPE': mape,
#         'R2': r2,
#         'MSE': mse,
#         'MAE': mae,
#         'Max Error': max_err,
#         'Explained Variance': explained_var,
#         'Execution Time (Raw)': execution_time,  # Raw execution time
#         'Normalized Time (s/MB)': normalized_time,  # Normalized execution time
#         'Average CPU Usage': avg_cpu_usage
#     }


In [7]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error, r2_score
import numpy as np
import time
import psutil

# Function to calculate and return metrics for Gradient Boosting Regressor
def calculate_metrics(X_train, X_test, y_train, y_test):
    gb = GradientBoostingRegressor(random_state=42)

    # Define hyperparameters for RandomizedSearchCV
    param_distributions = {
        'n_estimators': [50, 75],  # Number of boosting stages
        'learning_rate': [0.01, 0.05, 0.1],  # Step size shrinkage
        'max_depth': [3, 5, 7],  # Maximum depth of individual estimators
        'min_samples_split': [5, 10],  # Minimum number of samples required to split an internal node
        'min_samples_leaf': [1, 2, 4],  # Minimum number of samples in a leaf node
        'subsample': [0.8, 0.9, 1.0],  # Fraction of samples to fit individual base learners
        'max_features': ['sqrt', 'log2']  # Features to consider for best split
    }

    random_search = RandomizedSearchCV(gb, param_distributions, n_iter=10, cv=3, scoring='neg_mean_squared_error', random_state=42, n_jobs=-1)

    start_time = time.time()
    start_cpu = psutil.cpu_percent(interval=None)
    random_search.fit(X_train, y_train)
    end_cpu = psutil.cpu_percent(interval=None)
    end_time = time.time()

    # Calculate time and CPU usage
    execution_time = end_time - start_time
    avg_cpu_usage = (start_cpu + end_cpu) / 2

    y_pred = random_search.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    mape = mean_absolute_percentage_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    # Calculate the range of the target variable
    target_range = y_train.max() - y_train.min()

    # Calculate normalized RMSE (nRMSE)
    nrmse = rmse / target_range
    
    memory_usage_MB = X_train.memory_usage(deep=True).sum() / (1024 ** 2)
    normalized_time = execution_time / memory_usage_MB
    
    return {
        'RMSE': rmse,
        'MAPE': mape,
        'R2': r2,
        'nRMSE': nrmse,  # Normalized RMSE
        'Execution Time (Raw)': execution_time,  # Raw execution time
        'Normalized Time (s/MB)': normalized_time,  # Normalized execution time
        'Average CPU Usage': avg_cpu_usage
    }



In [8]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error, r2_score
import numpy as np
import time
import psutil

# Function to calculate and return metrics for Gradient Boosting Regressor
def calculate_metrics(X_train, X_test, y_train, y_test):
    gb = GradientBoostingRegressor(random_state=42)

    # Optimized hyperparameters for faster execution
    # param_distributions = {
    #     'n_estimators': [30, 50],  # Reduce number of boosting stages
    #     'learning_rate': [0.01, 0.1],  # Higher learning rate to reduce need for trees
    #     'max_depth': [5, 10, 20],  # Shallower trees for faster training
    #     'min_samples_split': [2, 5, 10],  # Restrict splits to speed up
    #     'min_samples_leaf': [1, 2, 4],  # Speed up with slightly higher min samples
    #     'subsample': [0.9,1.0],  # No subsampling to avoid extra randomness
    #     'max_features': ['sqrt', 'log2']  # Use square root of features to reduce splits
    # }

    param_distributions = {
    'n_estimators': [30, 40],  # Further reduced number of trees
    'learning_rate': [0.05, 0.1],  # Balance learning rate for fewer trees
    'max_depth': [3, 5],  # Shallower trees for faster training
    'min_samples_split': [2, 5],  # More restrictive splits
    'min_samples_leaf': [1, 2],  # Fewer samples per leaf
    'subsample': [1.0],  # No subsampling for faster processing
    'max_features': ['sqrt', 'log2']  # Use sqrt for faster splits
}

    random_search = RandomizedSearchCV(gb, param_distributions, n_iter=10, cv=3, scoring='neg_mean_squared_error', random_state=42, n_jobs=-1)

    start_time = time.time()
    start_cpu = psutil.cpu_percent(interval=None)
    random_search.fit(X_train, y_train)
    end_cpu = psutil.cpu_percent(interval=None)
    end_time = time.time()

    # Calculate time and CPU usage
    execution_time = end_time - start_time
    avg_cpu_usage = (start_cpu + end_cpu) / 2

    y_pred = random_search.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    mape = mean_absolute_percentage_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    # Calculate the range of the target variable
    target_range = y_train.max() - y_train.min()

    # Calculate normalized RMSE (nRMSE)
    nrmse = rmse / target_range
    
    memory_usage_MB = X_train.memory_usage(deep=True).sum() / (1024 ** 2)
    normalized_time = execution_time / memory_usage_MB
    
    return {
        'RMSE': rmse,
        'MAPE': mape,
        'R2': r2,
        'nRMSE': nrmse,  # Normalized RMSE
        'Execution Time (Raw)': execution_time,  # Raw execution time
        'Normalized Time (s/MB)': normalized_time,  # Normalized execution time
        'Average CPU Usage': avg_cpu_usage
    }


In [9]:
import gc  # Garbage Collector

# Define sample sizes
sample_sizes = [1.0, 0.5, 0.25, 0.125, 100, 1000, 10000, 100000]  

# Initialize metrics storage
metrics_list = []
total_execution_time = 0
total_cpu_usage = 0
total_memory_usage_MB = 0

# Loop through each sample size
for size in sample_sizes:
    try:
        # Sample data based on the defined sizes
        X_train_sample, X_test_sample, y_train_sample, y_test_sample = sample_data(X, y, size)
        
        # Calculate metrics
        metrics = calculate_metrics(X_train_sample, X_test_sample, y_train_sample, y_test_sample)
        metrics['Sample Size'] = size
        metrics_list.append(metrics)

        # Call garbage collection after each iteration to free up memory
        gc.collect()

        # Accumulate total metrics 
        total_execution_time += metrics['Execution Time (Raw)']
        total_cpu_usage += metrics['Average CPU Usage']
        total_memory_usage_MB += X_train_sample.memory_usage(deep=True).sum() / (1024 ** 2)

        print(f"Metrics for sample size {size}:")
        for key, value in metrics.items():
            print(f"{key}: {value}")
        print("-" * 50)

    except Exception as e:
        print(f"An error occurred for sample size {size}: {e}")


Metrics for sample size 1.0:
RMSE: 0.6897360649783185
MAPE: 0.1933962448203518
R2: 0.6953648646840398
nRMSE: 0.17243401624457963
Execution Time (Raw): 5020.854091882706
Normalized Time (s/MB): 12.781038433610611
Average CPU Usage: 48.05
Sample Size: 1.0
--------------------------------------------------
Metrics for sample size 0.5:
RMSE: 0.695754906677255
MAPE: 0.20287858917686064
R2: 0.690024993963559
nRMSE: 0.17393872666931376
Execution Time (Raw): 3184.7221128940582
Normalized Time (s/MB): 12.971198306400682
Average CPU Usage: 48.1
Sample Size: 0.5
--------------------------------------------------
Metrics for sample size 0.25:
RMSE: 0.7008907869531781
MAPE: 0.2045973476889929
R2: 0.685431794987204
nRMSE: 0.17522269673829452
Execution Time (Raw): 1355.220914363861
Normalized Time (s/MB): 11.039480749685158
Average CPU Usage: 49.55
Sample Size: 0.25
--------------------------------------------------
Metrics for sample size 0.125:
RMSE: 0.7061873831334081
MAPE: 0.20651296292990953
R2:

In [10]:
# Convert metrics to DataFrame
metrics_df = pd.DataFrame(metrics_list)

# Calculate total metrics
total_avg_cpu_usage = total_cpu_usage / len(sample_sizes)
normalized_total_time = total_execution_time / total_memory_usage_MB

# Convert total execution time to minutes and seconds
total_minutes = int(total_execution_time // 60)
total_seconds = total_execution_time % 60

# Display total metrics
print(f"Total Execution Time for Entire Process (Raw): {total_minutes} minutes and {total_seconds:.2f} seconds")
print(f"Total Normalized Execution Time for Entire Process: {normalized_total_time:.8f} seconds per MB")
print(f"Total Average CPU Usage for Entire Process: {total_avg_cpu_usage:.2f}%")

# Display the metrics DataFrame
metrics_df


Total Execution Time for Entire Process (Raw): 169 minutes and 14.42 seconds
Total Normalized Execution Time for Entire Process: 12.27998731 seconds per MB
Total Average CPU Usage for Entire Process: 44.36%


Unnamed: 0,RMSE,MAPE,R2,nRMSE,Execution Time (Raw),Normalized Time (s/MB),Average CPU Usage,Sample Size
0,0.689736,0.193396,0.695365,0.172434,5020.854092,12.781038,48.05,1.0
1,0.695755,0.202879,0.690025,0.173939,3184.722113,12.971198,48.1,0.5
2,0.700891,0.204597,0.685432,0.175223,1355.220914,11.039481,49.55,0.25
3,0.706187,0.206513,0.680659,0.176547,576.55462,9.393101,50.6,0.125
4,0.514232,0.153798,0.824878,0.128558,0.350004,88.222536,27.55,100.0
5,0.796078,0.238385,0.612415,0.19902,0.46719,11.776052,33.55,1000.0
6,0.741432,0.214705,0.633311,0.185358,1.302461,3.283004,47.95,10000.0
7,0.722657,0.213592,0.669737,0.180664,14.952487,3.768947,49.55,100000.0


another iteraton
