In [1]:
import pandas as pd

# Load the dataset with only the necessary columns
selected_columns = ['Severity', 'Description', 'Street', 'City', 'State', 
                    'Country', 'Temperature(F)', 'Wind_Chill(F)', 'Humidity(%)',
                    'Pressure(in)', 'Visibility(mi)', 'Wind_Direction', 
                    'Wind_Speed(mph)', 'Weather_Condition']

df = pd.read_csv('E:/Master_thesis_REV/Datasets/Dataset_US_accidents/archive/US_Accidents_March23.csv', usecols=selected_columns)

# Display the first few rows to check the data
print(df.head())


   Severity                                        Description  \
0         3  Right lane blocked due to accident on I-70 Eas...   
1         2  Accident on Brice Rd at Tussing Rd. Expect del...   
2         2  Accident on OH-32 State Route 32 Westbound at ...   
3         3  Accident on I-75 Southbound at Exits 52 52B US...   
4         2  Accident on McEwen Rd at OH-725 Miamisburg Cen...   

                      Street          City State Country  Temperature(F)  \
0                     I-70 E        Dayton    OH      US            36.9   
1                   Brice Rd  Reynoldsburg    OH      US            37.9   
2             State Route 32  Williamsburg    OH      US            36.0   
3                     I-75 S        Dayton    OH      US            35.1   
4  Miamisburg Centerville Rd        Dayton    OH      US            36.0   

   Wind_Chill(F)  Humidity(%)  Pressure(in)  Visibility(mi) Wind_Direction  \
0            NaN         91.0         29.68            10.0         

In [2]:
# Handle missing values - example with 'Wind_Speed(mph)'
df['Wind_Speed(mph)'].fillna(df['Wind_Speed(mph)'].mean(), inplace=True)

# Frequency encoding for categorical columns
categorical_columns = ['Description', 'Street', 'City', 'State', 'Country', 'Weather_Condition', 'Wind_Direction']

for col in categorical_columns:
    freq = df[col].value_counts()
    df[col] = df[col].map(freq)

# Drop any remaining rows with missing values if necessary
df = df.dropna()

# Display the first few rows after preprocessing
print(df.head())


   Severity  Description   Street     City   State  Country  Temperature(F)  \
2         2          1.0    252.0   3010.0  118115  7728394            36.0   
3         3          5.0  27546.0  24572.0  118115  7728394            35.1   
4         2          2.0    729.0  24572.0  118115  7728394            36.0   
5         3          1.0    104.0   1073.0  118115  7728394            37.9   
6         2          1.0     26.0  24572.0  118115  7728394            34.0   

   Wind_Chill(F)  Humidity(%)  Pressure(in)  Visibility(mi)  Wind_Direction  \
2           33.3        100.0         29.67            10.0        364470.0   
3           31.0         96.0         29.64             9.0        364470.0   
4           33.3         89.0         29.65             6.0        364470.0   
5           35.5         97.0         29.63             7.0        384840.0   
6           31.0        100.0         29.66             7.0        353806.0   

   Wind_Speed(mph)  Weather_Condition  
2         

In [3]:
# Define features (X) and target (y)
X = df.drop(columns=['Severity'])
y = df['Severity']

# Display the shapes of X and y to confirm
print(f'X shape: {X.shape}, y shape: {y.shape}')


X shape: (5676313, 13), y shape: (5676313,)


In [4]:
from sklearn.model_selection import train_test_split

# Function to sample data
def sample_data(X, y, sample_size):
    if isinstance(sample_size, float):
        if 0 < sample_size < 1.0:
            return train_test_split(X, y, test_size=0.2, train_size=sample_size, random_state=42)
        elif sample_size == 1.0:
            return train_test_split(X, y, test_size=0.001, random_state=42)
        else:
            raise ValueError("sample_size as float must be in the range (0.0, 1.0) or equal to 1.0.")
    elif isinstance(sample_size, int):
        if sample_size > len(X):
            raise ValueError(f"sample_size {sample_size} exceeds the number of available samples {len(X)}.")
        sampled_X = X.sample(n=sample_size, random_state=42)
        sampled_y = y.loc[sampled_X.index]
        return train_test_split(sampled_X, sampled_y, test_size=0.2, random_state=42)
    else:
        raise ValueError("sample_size must be a float or an integer")


In [5]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error, r2_score
import numpy as np
import time
import psutil
import pandas as pd

# Function to calculate and return metrics
def calculate_metrics(X_train, X_test, y_train, y_test):
    dt = DecisionTreeRegressor(random_state=42)

    # Define hyperparameters for RandomizedSearchCV
    param_distributions = {
        'max_depth': [10, 20, 30],  # Ideal max_depth range
        'min_samples_split': [2, 5, 10],  # Keep this parameter simple for speed
        'min_samples_leaf': [5, 10, 20],  # Adjust based on data complexity
        'max_features': ['sqrt', 'log2', None]  # Common choices for trees
    }

    random_search = RandomizedSearchCV(dt, param_distributions, n_iter=20, cv=5, scoring='neg_mean_squared_error', random_state=42, n_jobs=-1)

    start_time = time.time()
    start_cpu = psutil.cpu_percent(interval=None)
    random_search.fit(X_train, y_train)
    end_cpu = psutil.cpu_percent(interval=None)
    end_time = time.time()

    # Calculate time and CPU usage
    execution_time = end_time - start_time
    avg_cpu_usage = (start_cpu + end_cpu) / 2

    y_pred = random_search.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    mape = mean_absolute_percentage_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)

    # Calculate the range of the target variable
    target_range = y_train.max() - y_train.min()

    # Calculate normalized RMSE (nRMSE)
    nrmse = rmse / target_range
    
    memory_usage_MB = X_train.memory_usage(deep=True).sum() / (1024 ** 2)
    normalized_time = execution_time / memory_usage_MB
    
    return {
        'RMSE': rmse,
        'MAPE': mape,
        'R2': r2,
        'MSE': mse,
        'nRMSE': nrmse,  # Normalized RMSE
        'Execution Time (Raw)': execution_time,  # Raw execution time
        'Normalized Time (s/MB)': normalized_time,  # Normalized execution time
        'Average CPU Usage': avg_cpu_usage
    }


In [6]:
# Define sample sizes
sample_sizes = [1.0, 0.5, 0.25, 0.125, 100, 1000, 10000, 100000]

# Initialize metrics storage
metrics_list = []
total_execution_time = 0
total_cpu_usage = 0
total_memory_usage_MB = 0

# Loop through each sample size
for size in sample_sizes:
    try:
        X_train_sample, X_test_sample, y_train_sample, y_test_sample = sample_data(X, y, size)
        metrics = calculate_metrics(X_train_sample, X_test_sample, y_train_sample, y_test_sample)
        metrics['Sample Size'] = size
        metrics_list.append(metrics)

        # Accumulate total       
        total_execution_time += metrics['Execution Time (Raw)']
        total_cpu_usage += metrics['Average CPU Usage']
        total_memory_usage_MB += X_train_sample.memory_usage(deep=True).sum() / (1024 ** 2)

        print(f"Metrics for sample size {size}:")
        for key, value in metrics.items():
            print(f"{key}: {value}")
        print("-" * 50)

    except Exception as e:
        print(f"An error occurred for sample size {size}: {e}")


Metrics for sample size 1.0:
RMSE: 0.41340786768552873
MAPE: 0.09754959519728744
R2: 0.19613784048558403
MSE: 0.17090606506429565
nRMSE: 0.13780262256184292
Execution Time (Raw): 888.9641733169556
Normalized Time (s/MB): 1.4676901462829113
Average CPU Usage: 55.55
Sample Size: 1.0
--------------------------------------------------
Metrics for sample size 0.5:
RMSE: 0.4206149191065118
MAPE: 0.10581089692664486
R2: 0.15087834303349412
MSE: 0.17691691017497743
nRMSE: 0.14020497303550392
Execution Time (Raw): 295.3222212791443
Normalized Time (s/MB): 0.9741857083940619
Average CPU Usage: 50.8
Sample Size: 0.5
--------------------------------------------------
Metrics for sample size 0.25:
RMSE: 0.4209628158339028
MAPE: 0.10563651105285149
R2: 0.14947312034755877
MSE: 0.17720969231480838
nRMSE: 0.14032093861130093
Execution Time (Raw): 123.00475263595581
Normalized Time (s/MB): 0.8115168006218525
Average CPU Usage: 47.2
Sample Size: 0.25
--------------------------------------------------
Me

In [7]:
# Convert metrics to DataFrame
metrics_df = pd.DataFrame(metrics_list)

# Calculate total metrics
total_avg_cpu_usage = total_cpu_usage / len(sample_sizes)
normalized_total_time = total_execution_time / total_memory_usage_MB

# Convert total execution time to minutes and seconds
total_minutes = int(total_execution_time // 60)
total_seconds = total_execution_time % 60

# Display total metrics
print(f"Total Execution Time for Entire Process (Raw): {total_minutes} minutes and {total_seconds:.2f} seconds")
print(f"Total Normalized Execution Time for Entire Process: {normalized_total_time:.8f} seconds per MB")
print(f"Total Average CPU Usage for Entire Process: {total_avg_cpu_usage:.2f}%")

# Display the metrics DataFrame
metrics_df


Total Execution Time for Entire Process (Raw): 22 minutes and 42.75 seconds
Total Normalized Execution Time for Entire Process: 1.18945699 seconds per MB
Total Average CPU Usage for Entire Process: 35.04%


Unnamed: 0,RMSE,MAPE,R2,MSE,nRMSE,Execution Time (Raw),Normalized Time (s/MB),Average CPU Usage,Sample Size
0,0.413408,0.09755,0.196138,0.170906,0.137803,888.964173,1.46769,55.55,1.0
1,0.420615,0.105811,0.150878,0.176917,0.140205,295.322221,0.974186,50.8,0.5
2,0.420963,0.105637,0.149473,0.17721,0.140321,123.004753,0.811517,47.2,0.25
3,0.422517,0.105934,0.143181,0.178521,0.140839,49.80689,0.657196,47.2,0.125
4,0.432834,0.086012,0.279443,0.187345,0.144278,0.153311,17.941769,2.85,100.0
5,0.495594,0.116363,-0.101527,0.245613,0.165198,0.149535,1.749989,8.2,1000.0
6,0.465382,0.129142,-0.010088,0.216581,0.155127,0.489382,0.572717,30.9,10000.0
7,0.431754,0.108204,0.089087,0.186411,0.143918,4.860228,0.568786,37.65,100000.0


In [12]:
# Function to calculate and return metrics
def calculate_metrics(X_train, X_test, y_train, y_test):
    dt = DecisionTreeRegressor(random_state=42)

    # Updated hyperparameters for RandomizedSearchCV
    param_distributions = {
        'max_depth': [10, 20, 30, 40, 50, 60],
        'min_samples_split': [5, 10, 15],
        'min_samples_leaf': [5, 10, 15],
        'max_features': ['sqrt', 'log2'],
        'max_leaf_nodes': [None, 50, 100, 200],
    }

    random_search = RandomizedSearchCV(dt, param_distributions, n_iter=50, cv=5, scoring='neg_mean_squared_error', random_state=42, n_jobs=-1)

    start_time = time.time()
    start_cpu = psutil.cpu_percent(interval=None)
    random_search.fit(X_train, y_train)
    end_cpu = psutil.cpu_percent(interval=None)
    end_time = time.time()

    # Calculate time and CPU usage
    execution_time = end_time - start_time
    avg_cpu_usage = (start_cpu + end_cpu) / 2

    y_pred = random_search.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    mape = mean_absolute_percentage_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)

    # Calculate the range of the target variable
    target_range = y_train.max() - y_train.min()

    # Calculate normalized RMSE (nRMSE)
    nrmse = rmse / target_range
    
    memory_usage_MB = X_train.memory_usage(deep=True).sum() / (1024 ** 2)
    normalized_time = execution_time / memory_usage_MB
    
    return {
        'RMSE': rmse,
        'MAPE': mape,
        'R2': r2,
        'MSE': mse,
        'nRMSE': nrmse,
        'Execution Time (Raw)': execution_time,
        'Normalized Time (s/MB)': normalized_time,
        'Average CPU Usage': avg_cpu_usage
    }


In [13]:
# Define sample sizes
sample_sizes = [1.0, 0.5, 0.25, 0.125, 100, 1000, 10000, 100000]

# Initialize metrics storage
metrics_list = []
total_execution_time = 0
total_cpu_usage = 0
total_memory_usage_MB = 0

# Loop through each sample size
for size in sample_sizes:
    try:
        X_train_sample, X_test_sample, y_train_sample, y_test_sample = sample_data(X, y, size)
        metrics = calculate_metrics(X_train_sample, X_test_sample, y_train_sample, y_test_sample)
        metrics['Sample Size'] = size
        metrics_list.append(metrics)

        # Accumulate total       
        total_execution_time += metrics['Execution Time (Raw)']
        total_cpu_usage += metrics['Average CPU Usage']
        total_memory_usage_MB += X_train_sample.memory_usage(deep=True).sum() / (1024 ** 2)

        print(f"Metrics for sample size {size}:")
        for key, value in metrics.items():
            print(f"{key}: {value}")
        print("-" * 50)

    except Exception as e:
        print(f"An error occurred for sample size {size}: {e}")


Metrics for sample size 1.0:
RMSE: 0.42232455447931727
MAPE: 0.1059284561181799
R2: 0.16108728757589452
MSE: 0.1783580293161538
nRMSE: 0.14077485149310576
Execution Time (Raw): 788.2678451538086
Normalized Time (s/MB): 1.3014393422032882
Average CPU Usage: 48.65
Sample Size: 1.0
--------------------------------------------------


In [11]:
# Convert metrics to DataFrame
metrics_df = pd.DataFrame(metrics_list)

# Calculate total metrics
total_avg_cpu_usage = total_cpu_usage / len(sample_sizes)
normalized_total_time = total_execution_time / total_memory_usage_MB

# Convert total execution time to minutes and seconds
total_minutes = int(total_execution_time // 60)
total_seconds = total_execution_time % 60

# Display total metrics
print(f"Total Execution Time for Entire Process (Raw): {total_minutes} minutes and {total_seconds:.2f} seconds")
print(f"Total Normalized Execution Time for Entire Process: {normalized_total_time:.8f} seconds per MB")
print(f"Total Average CPU Usage for Entire Process: {total_avg_cpu_usage:.2f}%")

# Display the metrics DataFrame
metrics_df


Total Execution Time for Entire Process (Raw): 34 minutes and 32.86 seconds
Total Normalized Execution Time for Entire Process: 1.80926411 seconds per MB
Total Average CPU Usage for Entire Process: 32.73%


Unnamed: 0,RMSE,MAPE,R2,MSE,nRMSE,Execution Time (Raw),Normalized Time (s/MB),Average CPU Usage,Sample Size
0,0.453627,0.125621,0.032121,0.205777,0.151209,1407.984422,2.324599,43.05,1.0
1,0.449099,0.122013,0.031978,0.20169,0.1497,454.422421,1.499013,46.7,0.5
2,0.4491,0.121837,0.031973,0.201691,0.1497,151.509041,0.999572,48.15,0.25
3,0.449099,0.121883,0.031977,0.20169,0.1497,52.878906,0.697731,50.0,0.125
4,0.510055,0.129427,-0.000601,0.260156,0.170018,0.177721,20.798465,15.3,100.0
5,0.475112,0.117725,-0.012361,0.225731,0.158371,0.222621,2.605299,15.6,1000.0
6,0.452715,0.130657,0.044152,0.204951,0.150905,0.53422,0.625189,20.85,10000.0
7,0.444224,0.121805,0.035705,0.197335,0.148075,5.128742,0.600209,22.2,100000.0
