In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error, r2_score
import time
import psutil

In [2]:
df = pd.read_csv('E:/Master_thesis_REV/Datasets/Dataset_US_accidents/archive/US_Accidents_March23.csv')

In [3]:
df.head(15)

Unnamed: 0,ID,Source,Severity,Start_Time,End_Time,Start_Lat,Start_Lng,End_Lat,End_Lng,Distance(mi),...,Roundabout,Station,Stop,Traffic_Calming,Traffic_Signal,Turning_Loop,Sunrise_Sunset,Civil_Twilight,Nautical_Twilight,Astronomical_Twilight
0,A-1,Source2,3,2016-02-08 05:46:00,2016-02-08 11:00:00,39.865147,-84.058723,,,0.01,...,False,False,False,False,False,False,Night,Night,Night,Night
1,A-2,Source2,2,2016-02-08 06:07:59,2016-02-08 06:37:59,39.928059,-82.831184,,,0.01,...,False,False,False,False,False,False,Night,Night,Night,Day
2,A-3,Source2,2,2016-02-08 06:49:27,2016-02-08 07:19:27,39.063148,-84.032608,,,0.01,...,False,False,False,False,True,False,Night,Night,Day,Day
3,A-4,Source2,3,2016-02-08 07:23:34,2016-02-08 07:53:34,39.747753,-84.205582,,,0.01,...,False,False,False,False,False,False,Night,Day,Day,Day
4,A-5,Source2,2,2016-02-08 07:39:07,2016-02-08 08:09:07,39.627781,-84.188354,,,0.01,...,False,False,False,False,True,False,Day,Day,Day,Day
5,A-6,Source2,3,2016-02-08 07:44:26,2016-02-08 08:14:26,40.10059,-82.925194,,,0.01,...,False,False,False,False,False,False,Day,Day,Day,Day
6,A-7,Source2,2,2016-02-08 07:59:35,2016-02-08 08:29:35,39.758274,-84.230507,,,0.0,...,False,False,False,False,False,False,Day,Day,Day,Day
7,A-8,Source2,3,2016-02-08 07:59:58,2016-02-08 08:29:58,39.770382,-84.194901,,,0.01,...,False,False,False,False,False,False,Day,Day,Day,Day
8,A-9,Source2,2,2016-02-08 08:00:40,2016-02-08 08:30:40,39.778061,-84.172005,,,0.0,...,False,False,False,False,False,False,Day,Day,Day,Day
9,A-10,Source2,3,2016-02-08 08:10:04,2016-02-08 08:40:04,40.10059,-82.925194,,,0.01,...,False,False,False,False,False,False,Day,Day,Day,Day


In [4]:
df.shape

(7728394, 46)

In [5]:
df.columns

Index(['ID', 'Source', 'Severity', 'Start_Time', 'End_Time', 'Start_Lat',
       'Start_Lng', 'End_Lat', 'End_Lng', 'Distance(mi)', 'Description',
       'Street', 'City', 'County', 'State', 'Zipcode', 'Country', 'Timezone',
       'Airport_Code', 'Weather_Timestamp', 'Temperature(F)', 'Wind_Chill(F)',
       'Humidity(%)', 'Pressure(in)', 'Visibility(mi)', 'Wind_Direction',
       'Wind_Speed(mph)', 'Precipitation(in)', 'Weather_Condition', 'Amenity',
       'Bump', 'Crossing', 'Give_Way', 'Junction', 'No_Exit', 'Railway',
       'Roundabout', 'Station', 'Stop', 'Traffic_Calming', 'Traffic_Signal',
       'Turning_Loop', 'Sunrise_Sunset', 'Civil_Twilight', 'Nautical_Twilight',
       'Astronomical_Twilight'],
      dtype='object')

In [6]:
# Check the data types of the columns
print(df.dtypes)

# Inspect the unique values of the City column (which is likely the issue)
print(df['City'].unique()[:50])  # Display the first 50 unique cities to see what's going on


ID                        object
Source                    object
Severity                   int64
Start_Time                object
End_Time                  object
Start_Lat                float64
Start_Lng                float64
End_Lat                  float64
End_Lng                  float64
Distance(mi)             float64
Description               object
Street                    object
City                      object
County                    object
State                     object
Zipcode                   object
Country                   object
Timezone                  object
Airport_Code              object
Weather_Timestamp         object
Temperature(F)           float64
Wind_Chill(F)            float64
Humidity(%)              float64
Pressure(in)             float64
Visibility(mi)           float64
Wind_Direction            object
Wind_Speed(mph)          float64
Precipitation(in)        float64
Weather_Condition         object
Amenity                     bool
Bump      

In [7]:
# Handle missing values by filling or removing them
df = df.drop(columns=['ID', 'Description','Source', 'Airport_Code', 'Start_Lat', 'Start_Lng', 'End_Lat', 'End_Lng', 
                      'Street', 'Amenity', 'Station', 'Traffic_Calming', 'Source'])

# Convert Start_Time and End_Time to datetime format
df['Start_Time'] = pd.to_datetime(df['Start_Time'], errors='coerce')
df['End_Time'] = pd.to_datetime(df['End_Time'], errors='coerce')

# Drop rows with NaT values in Start_Time or End_Time
df.dropna(subset=['Start_Time', 'End_Time'], inplace=True)

# Create a duration feature
df['Duration'] = (df['End_Time'] - df['Start_Time']).dt.total_seconds() / 3600.0  # duration in hours

# Drop the original End_Time column
df = df.drop(columns=['End_Time'])

# Display the updated DataFrame
df.head()


Unnamed: 0,Severity,Start_Time,Distance(mi),City,County,State,Zipcode,Country,Timezone,Weather_Timestamp,...,Railway,Roundabout,Stop,Traffic_Signal,Turning_Loop,Sunrise_Sunset,Civil_Twilight,Nautical_Twilight,Astronomical_Twilight,Duration
0,3,2016-02-08 05:46:00,0.01,Dayton,Montgomery,OH,45424,US,US/Eastern,2016-02-08 05:58:00,...,False,False,False,False,False,Night,Night,Night,Night,5.233333
1,2,2016-02-08 06:07:59,0.01,Reynoldsburg,Franklin,OH,43068-3402,US,US/Eastern,2016-02-08 05:51:00,...,False,False,False,False,False,Night,Night,Night,Day,0.5
2,2,2016-02-08 06:49:27,0.01,Williamsburg,Clermont,OH,45176,US,US/Eastern,2016-02-08 06:56:00,...,False,False,False,True,False,Night,Night,Day,Day,0.5
3,3,2016-02-08 07:23:34,0.01,Dayton,Montgomery,OH,45417,US,US/Eastern,2016-02-08 07:38:00,...,False,False,False,False,False,Night,Day,Day,Day,0.5
4,2,2016-02-08 07:39:07,0.01,Dayton,Montgomery,OH,45459,US,US/Eastern,2016-02-08 07:53:00,...,False,False,False,True,False,Day,Day,Day,Day,0.5


In [8]:
df.columns

Index(['Severity', 'Start_Time', 'Distance(mi)', 'City', 'County', 'State',
       'Zipcode', 'Country', 'Timezone', 'Weather_Timestamp', 'Temperature(F)',
       'Wind_Chill(F)', 'Humidity(%)', 'Pressure(in)', 'Visibility(mi)',
       'Wind_Direction', 'Wind_Speed(mph)', 'Precipitation(in)',
       'Weather_Condition', 'Bump', 'Crossing', 'Give_Way', 'Junction',
       'No_Exit', 'Railway', 'Roundabout', 'Stop', 'Traffic_Signal',
       'Turning_Loop', 'Sunrise_Sunset', 'Civil_Twilight', 'Nautical_Twilight',
       'Astronomical_Twilight', 'Duration'],
      dtype='object')

Feature Engineering - Extracting Features from Start_Time and End_Time

Feature Engineering - Encoding Categorical Variables

In [9]:
# Extract time-related features
df['Hour'] = df['Start_Time'].dt.hour
df['Day'] = df['Start_Time'].dt.dayofweek
df['Month'] = df['Start_Time'].dt.month

# Drop Start_Time as we have extracted useful features from it
df = df.drop(columns=['Start_Time'])

# Handle categorical features using frequency encoding
categorical_columns = ['Weather_Condition', 'City', 'State', 'Timezone', 
                       'Sunrise_Sunset', 'Civil_Twilight', 'Nautical_Twilight', 'Astronomical_Twilight',
                       'Bump', 'Crossing', 'Traffic_Signal', 'Junction', 'Give_Way', 'No_Exit', 
                       'Railway', 'Roundabout', 'Stop', 'Turning_Loop']

for col in categorical_columns:
    if df[col].dtype == 'object':  
        freq_encoding = df[col].value_counts().to_dict()
        df[col] = df[col].map(freq_encoding)

# # Display the DataFrame after encoding
# df.head()

# Check for any remaining non-numeric columns
print(df.select_dtypes(include=['object']).columns)


Index(['County', 'Zipcode', 'Country', 'Weather_Timestamp', 'Wind_Direction'], dtype='object')


In [10]:
# Drop any remaining non-numeric columns if necessary
remaining_non_numeric_cols = df.select_dtypes(include=['object']).columns
if len(remaining_non_numeric_cols) > 0:
    print(f"Dropping non-numeric columns: {remaining_non_numeric_cols}")
    df = df.drop(columns=remaining_non_numeric_cols)

Dropping non-numeric columns: Index(['County', 'Zipcode', 'Country', 'Weather_Timestamp', 'Wind_Direction'], dtype='object')


In [15]:
# Apply imputer to handle any remaining NaN values
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(strategy='mean')
df_imputed = pd.DataFrame(imputer.fit_transform(df), columns=df.columns)

Define the Target Variable and Features

In [16]:
# Define the target variable (Severity)
y = df_imputed['Severity']

# Define the features (exclude the target variable)
X = df_imputed.drop(columns=['Severity'])

# Display the first few rows of the feature set and the target variable
X.head(), y.head()


(   Distance(mi)     City     State   Timezone  Temperature(F)  Wind_Chill(F)  \
 0          0.01  23300.0  110765.0  3204981.0            36.9      57.994739   
 1          0.01    419.0  110765.0  3204981.0            37.9      57.994739   
 2          0.01   2634.0  110765.0  3204981.0            36.0      33.300000   
 3          0.01  23300.0  110765.0  3204981.0            35.1      31.000000   
 4          0.01  23300.0  110765.0  3204981.0            36.0      33.300000   
 
    Humidity(%)  Pressure(in)  Visibility(mi)  Wind_Speed(mph)  ...  \
 0         91.0         29.68            10.0         7.696474  ...   
 1        100.0         29.65            10.0         7.696474  ...   
 2        100.0         29.67            10.0         3.500000  ...   
 3         96.0         29.64             9.0         4.600000  ...   
 4         89.0         29.65             6.0         3.500000  ...   
 
    Traffic_Signal  Turning_Loop  Sunrise_Sunset  Civil_Twilight  \
 0             0

Define the sample data function

In [17]:
from sklearn.model_selection import train_test_split

# Function to sample data (updated to handle full dataset properly)
def sample_data(X, y, sample_size):
    if isinstance(sample_size, float):
        if 0 < sample_size < 1.0:
            return train_test_split(X, y, test_size=0.2, train_size=sample_size, random_state=42)
        elif sample_size == 1.0:
            # Use the entire dataset for training, leave a small portion for testing
            return train_test_split(X, y, test_size=0.001, random_state=42)
        else:
            raise ValueError("sample_size as float must be in the range (0.0, 1.0) or equal to 1.0.")
    elif isinstance(sample_size, int):
        if sample_size > len(X):
            raise ValueError(f"sample_size {sample_size} exceeds the number of available samples {len(X)}.")
        sampled_X = X.sample(n=sample_size, random_state=42)
        sampled_y = y.loc[sampled_X.index]
        return train_test_split(sampled_X, sampled_y, test_size=0.2, random_state=42)
    else:
        raise ValueError("sample_size must be a float or an integer")


Define the Metrics Calculation Function and Initialize the Model

In [18]:
# Function to calculate and return metrics
def calculate_metrics(X_train, X_test, y_train, y_test):
    lr = LinearRegression()

    # Define hyperparameters for RandomizedSearchCV
    param_distributions = {
        'fit_intercept': [True, False]
    }

    random_search = RandomizedSearchCV(lr, param_distributions, n_iter=4, cv=5, scoring='neg_mean_squared_error', random_state=42)

    start_time = time.time()
    start_cpu = psutil.cpu_percent(interval=None)
    random_search.fit(X_train, y_train)
    end_cpu = psutil.cpu_percent(interval=None)
    end_time = time.time()

    # Calculate time and CPU usage
    execution_time = end_time - start_time
    avg_cpu_usage = (start_cpu + end_cpu) / 2

    y_pred = random_search.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    mape = mean_absolute_percentage_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)

    # Calculate the range of the target variable
    target_range = y_train.max() - y_train.min()

    # Calculate normalized RMSE (nRMSE)
    nrmse = rmse / target_range
    
    memory_usage_MB = X_train.memory_usage(deep=True).sum() / (1024 ** 2)
    normalized_time = execution_time / memory_usage_MB
    
    return {
        'RMSE': rmse,
        'MAPE': mape,
        'R2': r2,
        'MSE': mse,
        'nRMSE': nrmse,  # Normalized RMSE
        'Execution Time (Raw)': execution_time,  # Raw execution time
        'Normalized Time (s/MB)': normalized_time,  # Normalized execution time
        'Average CPU Usage': avg_cpu_usage
    }


Define Sample Sizes and Collect Metrics

In [19]:
# Define sample sizes
sample_sizes = [1.0, 0.5, 0.25, 0.125, 100, 1000, 10000, 100000]

# Initialize metrics storage
metrics_list = []
total_execution_time = 0
total_cpu_usage = 0
total_memory_usage_MB = 0

# Loop through each sample size
for size in sample_sizes:
    try:
        X_train_sample, X_test_sample, y_train_sample, y_test_sample = sample_data(X, y, size)
        metrics = calculate_metrics(X_train_sample, X_test_sample, y_train_sample, y_test_sample)
        metrics['Sample Size'] = size
        metrics_list.append(metrics)

        # Accumulate total metrics
        total_execution_time += metrics['Execution Time (Raw)']
        total_cpu_usage += metrics['Average CPU Usage']
        total_memory_usage_MB += X_train_sample.memory_usage(deep=True).sum() / (1024 ** 2)

        print(f"Metrics for sample size {size}:")
        for key, value in metrics.items():
            print(f"{key}: {value}")
        print("-" * 50)

    except Exception as e:
        print(f"An error occurred for sample size {size}: {e}")




Metrics for sample size 1.0:
RMSE: 0.48393918693772503
MAPE: 0.15409356550919331
R2: 0.06191208266593062
MSE: 0.2341971366539464
nRMSE: 0.161313062312575
Execution Time (Raw): 81.90079402923584
Normalized Time (s/MB): 0.04962383434108476
Average CPU Usage: 12.8
Sample Size: 1.0
--------------------------------------------------




Metrics for sample size 0.5:
RMSE: 0.4852104901125737
MAPE: 0.1527466464749888
R2: 0.05412369630931335
MSE: 0.23542921971528402
nRMSE: 0.16173683003752456
Execution Time (Raw): 37.856797218322754
Normalized Time (s/MB): 0.04582911921761087
Average CPU Usage: 11.100000000000001
Sample Size: 0.5
--------------------------------------------------




Metrics for sample size 0.25:
RMSE: 0.48521091655397214
MAPE: 0.1528350533926434
R2: 0.05412203368659951
MSE: 0.23542963354314572
nRMSE: 0.16173697218465738
Execution Time (Raw): 19.12330675125122
Normalized Time (s/MB): 0.046301027521359034
Average CPU Usage: 9.649999999999999
Sample Size: 0.25
--------------------------------------------------




Metrics for sample size 0.125:
RMSE: 0.4852147060744411
MAPE: 0.15307072425985024
R2: 0.054107258924575685
MSE: 0.23543331099090628
nRMSE: 0.16173823535814705
Execution Time (Raw): 9.315663814544678
Normalized Time (s/MB): 0.045109881806547375
Average CPU Usage: 10.4
Sample Size: 0.125
--------------------------------------------------




Metrics for sample size 100:
RMSE: 0.48099699624719233
MAPE: 0.18599467828519406
R2: -0.8145734148927182
MSE: 0.23135811039882154
nRMSE: 0.24049849812359617
Execution Time (Raw): 0.044446706771850586
Normalized Time (s/MB): 2.3490801411290323
Average CPU Usage: 9.45
Sample Size: 100
--------------------------------------------------




Metrics for sample size 1000:
RMSE: 0.5079673051783934
MAPE: 0.15314468170321974
R2: 0.04211310206886676
MSE: 0.258030783130199
nRMSE: 0.16932243505946445
Execution Time (Raw): 0.09656357765197754
Normalized Time (s/MB): 0.5103540826612903
Average CPU Usage: 6.8
Sample Size: 1000
--------------------------------------------------




Metrics for sample size 10000:
RMSE: 0.501422074892308
MAPE: 0.15364409347432684
R2: 0.053885191319429815
MSE: 0.25142409718930725
nRMSE: 0.1671406916307693
Execution Time (Raw): 0.14601421356201172
Normalized Time (s/MB): 0.07717086693548388
Average CPU Usage: 7.6000000000000005
Sample Size: 10000
--------------------------------------------------




Metrics for sample size 100000:
RMSE: 0.4823212300165399
MAPE: 0.15390147352740327
R2: 0.05397577370689555
MSE: 0.23263376892466803
nRMSE: 0.16077374333884664
Execution Time (Raw): 0.9086077213287354
Normalized Time (s/MB): 0.04802138356854839
Average CPU Usage: 11.25
Sample Size: 100000
--------------------------------------------------


Display the Total Metrics and the Metrics DataFrame

In [20]:
# Convert metrics to DataFrame
metrics_df = pd.DataFrame(metrics_list)

# Calculate total metrics
total_avg_cpu_usage = total_cpu_usage / len(sample_sizes)
normalized_total_time = total_execution_time / total_memory_usage_MB

# Display total metrics
print(f"Total Execution Time for Entire Process (Raw): {total_execution_time} seconds")
print(f"Total Normalized Execution Time for Entire Process: {normalized_total_time} seconds per MB")
print(f"Total Average CPU Usage for Entire Process: {total_avg_cpu_usage}%")

# Display the metrics DataFrame
metrics_df


Total Execution Time for Entire Process (Raw): 149.39219403266907 seconds
Total Normalized Execution Time for Entire Process: 0.04792776985654194 seconds per MB
Total Average CPU Usage for Entire Process: 9.881249999999998%


Unnamed: 0,RMSE,MAPE,R2,MSE,nRMSE,Execution Time (Raw),Normalized Time (s/MB),Average CPU Usage,Sample Size
0,0.483939,0.154094,0.061912,0.234197,0.161313,81.900794,0.049624,12.8,1.0
1,0.48521,0.152747,0.054124,0.235429,0.161737,37.856797,0.045829,11.1,0.5
2,0.485211,0.152835,0.054122,0.23543,0.161737,19.123307,0.046301,9.65,0.25
3,0.485215,0.153071,0.054107,0.235433,0.161738,9.315664,0.04511,10.4,0.125
4,0.480997,0.185995,-0.814573,0.231358,0.240498,0.044447,2.34908,9.45,100.0
5,0.507967,0.153145,0.042113,0.258031,0.169322,0.096564,0.510354,6.8,1000.0
6,0.501422,0.153644,0.053885,0.251424,0.167141,0.146014,0.077171,7.6,10000.0
7,0.482321,0.153901,0.053976,0.232634,0.160774,0.908608,0.048021,11.25,100000.0


In [21]:
# Convert metrics to DataFrame
metrics_df = pd.DataFrame(metrics_list)

# Calculate total metrics
total_avg_cpu_usage = total_cpu_usage / len(sample_sizes)
normalized_total_time = total_execution_time / total_memory_usage_MB

# Convert total execution time to minutes and seconds
total_minutes = int(total_execution_time // 60)
total_seconds = total_execution_time % 60

# Display total metrics
print(f"Total Execution Time for Entire Process (Raw): {total_minutes} minutes and {total_seconds:.2f} seconds")
print(f"Total Normalized Execution Time for Entire Process: {normalized_total_time:.8f} seconds per MB")
print(f"Total Average CPU Usage for Entire Process: {total_avg_cpu_usage:.2f}%")

# Display the metrics DataFrame
metrics_df

Total Execution Time for Entire Process (Raw): 2 minutes and 29.39 seconds
Total Normalized Execution Time for Entire Process: 0.04792777 seconds per MB
Total Average CPU Usage for Entire Process: 9.88%


Unnamed: 0,RMSE,MAPE,R2,MSE,nRMSE,Execution Time (Raw),Normalized Time (s/MB),Average CPU Usage,Sample Size
0,0.483939,0.154094,0.061912,0.234197,0.161313,81.900794,0.049624,12.8,1.0
1,0.48521,0.152747,0.054124,0.235429,0.161737,37.856797,0.045829,11.1,0.5
2,0.485211,0.152835,0.054122,0.23543,0.161737,19.123307,0.046301,9.65,0.25
3,0.485215,0.153071,0.054107,0.235433,0.161738,9.315664,0.04511,10.4,0.125
4,0.480997,0.185995,-0.814573,0.231358,0.240498,0.044447,2.34908,9.45,100.0
5,0.507967,0.153145,0.042113,0.258031,0.169322,0.096564,0.510354,6.8,1000.0
6,0.501422,0.153644,0.053885,0.251424,0.167141,0.146014,0.077171,7.6,10000.0
7,0.482321,0.153901,0.053976,0.232634,0.160774,0.908608,0.048021,11.25,100000.0
