In [13]:
import pandas as pd
import os
import scipy.stats as stats
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import shapiro
from scipy.stats import kruskal
import statsmodels.api as sm

In [2]:
# Folder holding the dataset that is loaded below. Despite the variable's
# name, the path points at the "02 - processed" directory.
raw_path = '../01 - data/02 - processed/'

# Read the processed CSV into the frame used by every later cell.
df = pd.read_csv(
    filepath_or_buffer=os.path.join(raw_path, 'processed_data.csv'),
)

In [10]:
# Print a structural summary of the frame: index range, column names,
# non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 951 entries, 0 to 950
Data columns (total 23 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Trip_Distance_km           951 non-null    float64
 1   Passenger_Count            951 non-null    float64
 2   Base_Fare                  951 non-null    float64
 3   Per_Km_Rate                951 non-null    float64
 4   Per_Minute_Rate            951 non-null    float64
 5   Trip_Duration_Minutes      951 non-null    float64
 6   Time_of_Day_Afternoon      951 non-null    float64
 7   Time_of_Day_Evening        951 non-null    float64
 8   Time_of_Day_Morning        951 non-null    float64
 9   Time_of_Day_Night          951 non-null    float64
 10  Day_of_Week_Weekday        951 non-null    float64
 11  Day_of_Week_Weekend        951 non-null    float64
 12  Traffic_Conditions_High    951 non-null    float64
 13  Traffic_Conditions_Low     951 non-null    float64

In [None]:
def reconstruct_categoricals(row):
    """Rebuild category labels from the one-hot indicator fields of one row.

    For each of the four features the indicator fields named
    ``<feature>_<label>`` are probed in a fixed order and the first one equal
    to 1 decides the label. When none of them equals 1 the label is
    ``'Unknown'``.

    Parameters
    ----------
    row : mapping-like
        Anything indexable by field name (e.g. a DataFrame row passed by
        ``DataFrame.apply(..., axis=1)``).

    Returns
    -------
    pandas.Series
        Entries ``'Day_of_Week'``, ``'Time_of_Day'``, ``'Traffic_Conditions'``
        and ``'Weather'``, in that order.
    """
    def first_active(prefix, labels):
        # The generator is lazy: indicator fields are read one at a time, in
        # order, and reading stops at the first hit, so fields after the
        # matching one are never accessed.
        return next(
            (label for label in labels if row[f'{prefix}_{label}'] == 1),
            'Unknown',
        )

    return pd.Series({
        'Day_of_Week': first_active('Day_of_Week', ('Weekday', 'Weekend')),
        'Time_of_Day': first_active(
            'Time_of_Day', ('Morning', 'Afternoon', 'Evening', 'Night')
        ),
        'Traffic_Conditions': first_active(
            'Traffic_Conditions', ('Low', 'Medium', 'High')
        ),
        'Weather': first_active('Weather', ('Clear', 'Rain', 'Snow')),
    })


# Decode every row and store the four resulting label columns on the frame.
df[['Day_of_Week', 'Time_of_Day', 'Traffic_Conditions', 'Weather']] = df.apply(
    reconstruct_categoricals,
    axis='columns',
)


In [5]:

# Trip_Price samples for the two Day_of_Week levels.
price_weekday = df.loc[df['Day_of_Week'].eq('Weekday'), 'Trip_Price']
price_weekend = df.loc[df['Day_of_Week'].eq('Weekend'), 'Trip_Price']

# Two-sample t-test; equal_var=False selects Welch's variant, which does not
# assume the two groups share a variance.
t_stat, p_value = stats.ttest_ind(
    price_weekday,
    price_weekend,
    equal_var=False,
)

print(f'Test t Day_of_Week: t={t_stat:.3f}, p={p_value:.3f}')

def hypoteses_test(p_value, alpha=0.05):
    """Translate a p-value into the notebook's one-line verdict.

    Parameters
    ----------
    p_value : float
        p-value returned by a hypothesis test.
    alpha : float, default 0.05
        Significance level. A p-value strictly below ``alpha`` is reported
        as significant; the default reproduces the previous hard-coded 0.05.

    Returns
    -------
    str
        The "influences" sentence when ``p_value < alpha``, otherwise the
        "does not influence" sentence.

    Raises
    ------
    ValueError
        If ``p_value`` is NaN. A NaN compares False against any threshold,
        so it would otherwise be reported as "does not influence" even
        though no test result exists.
    """
    import math

    if math.isnan(p_value):
        raise ValueError('p_value is NaN; the test produced no usable result.')

    # NOTE(review): the sentences are kept byte-for-byte. The only visible
    # caller passes a Welch t-test p-value, i.e. a comparison of group means.
    if p_value < alpha:
        return 'The variance of the variable influences the target.'
    return 'The variance of the variable does not influence the target.'
    

# Print the verdict for the p-value currently bound to `p_value`
# (in this cell: the Day_of_Week t-test result).
print(hypoteses_test(p_value))

Test t Day_of_Week: t=0.997, p=0.319
The variance of the variable does not influence the target.


In [8]:
# Categorical features whose groups are examined below
categorical_vars = ['Time_of_Day', 'Traffic_Conditions', 'Weather']

# Shapiro-Wilk normality test of Trip_Price inside every group of every variable
for var in categorical_vars:
    print(f'\n🔍 Testing normality for the variable: {var}')

    for group in df[var].unique():
        # Trip_Price observations belonging to this group
        group_data = df.loc[df[var] == group, 'Trip_Price']

        # Groups with fewer than 3 observations are reported and skipped
        if len(group_data) < 3:
            print(f'⚠️ Group "{group}" has fewer than 3 observations. Test ignored.')
            continue

        stat, p_value = shapiro(group_data)
        print(f'Group "{group}": n={len(group_data)} | p-value = {p_value:.4f}')

        # H0 is "the sample is normally distributed"; p > 0.05 fails to reject it
        print(
            "✔️ Distribution appears a  Normal Distribution (failure to reject H0)"
            if p_value > 0.05
            else "❌ Distribution does NOT appear a Normal Dustribution (reject H0)"
        )


🔍 Testing normality for the variable: Time_of_Day
Group "Morning": n=265 | p-value = 0.0000
❌ Distribution does NOT appear a Normal Dustribution (reject H0)
Group "Evening": n=197 | p-value = 0.0000
❌ Distribution does NOT appear a Normal Dustribution (reject H0)
Group "Afternoon": n=400 | p-value = 0.0000
❌ Distribution does NOT appear a Normal Dustribution (reject H0)
Group "Night": n=89 | p-value = 0.0000
❌ Distribution does NOT appear a Normal Dustribution (reject H0)

🔍 Testing normality for the variable: Traffic_Conditions
Group "Low": n=424 | p-value = 0.0000
❌ Distribution does NOT appear a Normal Dustribution (reject H0)
Group "High": n=172 | p-value = 0.0000
❌ Distribution does NOT appear a Normal Dustribution (reject H0)
Group "Medium": n=355 | p-value = 0.0000
❌ Distribution does NOT appear a Normal Dustribution (reject H0)

🔍 Testing normality for the variable: Weather
Group "Clear": n=680 | p-value = 0.0000
❌ Distribution does NOT appear a Normal Dustribution (reject H0)

In [9]:


# Kruskal-Wallis H-test of Trip_Price across the groups of each variable
for var in categorical_vars:
    print(f'\n🧪 Kruskal-Wallis test for the variable: {var}')

    # One Trip_Price sample per group, in order of first appearance
    group_data = [
        df.loc[df[var] == group, 'Trip_Price']
        for group in df[var].unique()
    ]

    # Each sample is passed to the test as a separate positional argument
    stat, p_value = kruskal(*group_data)

    print(f'Statistic H = {stat:.4f} | p-value = {p_value:.4f}')

    print(
        '❗Significant difference between at least two groups (rejects H0)'
        if p_value < 0.05
        else '✅ No evidence of significant difference (failure to reject H0)'
    )


🧪 Kruskal-Wallis test for the variable: Time_of_Day
Statistic H = 0.0209 | p-value = 0.9992
✅ No evidence of significant difference (failure to reject H0)

🧪 Kruskal-Wallis test for the variable: Traffic_Conditions
Statistic H = 2.3362 | p-value = 0.3110
✅ No evidence of significant difference (failure to reject H0)

🧪 Kruskal-Wallis test for the variable: Weather
Statistic H = 1.0899 | p-value = 0.5799
✅ No evidence of significant difference (failure to reject H0)


In [14]:
# Numerical predictors used in the regression
X = df[[
    'Trip_Distance_km',
    'Passenger_Count',
    'Base_Fare',
    'Per_Km_Rate',
    'Per_Minute_Rate',
    'Trip_Duration_Minutes',
]]

# Response variable
y = df['Trip_Price']

# Add the constant column so the model estimates an intercept
X = sm.add_constant(X)

# Ordinary least squares fit of Trip_Price on the predictors
model = sm.OLS(y, X).fit()

# Full regression table: coefficients, p-values and R-squared
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:             Trip_Price   R-squared:                       0.864
Model:                            OLS   Adj. R-squared:                  0.863
Method:                 Least Squares   F-statistic:                     995.6
Date:                Thu, 24 Jul 2025   Prob (F-statistic):               0.00
Time:                        08:59:14   Log-Likelihood:                -3921.1
No. Observations:                 951   AIC:                             7856.
Df Residuals:                     944   BIC:                             7890.
Df Model:                           6                                         
Covariance Type:            nonrobust                                         
                            coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------
const                   -59.45

📘 Statistical Analysis Summary
For the categorical variables, I first tested the normality of the dependent variable (Trip_Price) within each group of the categorical feature. For example, for the variable Time_of_Day, I checked whether the distribution of Trip_Price was normal within each category: Morning, Afternoon, Evening, and Night.

If all groups passed the normality test, I proceeded with ANOVA to determine if there was a statistically significant difference in the mean Trip_Price between groups.

If normality was violated in at least one group, I used the Kruskal-Wallis test, which is a non-parametric alternative to ANOVA and does not assume normal distribution.

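For binary categorical variables (e.g., Day_of_Week: Weekday vs. Weekend), I used the independent t-test if both groups showed normality, or the Mann-Whitney U test if not. Note that in this notebook Day_of_Week was compared with Welch's t-test (`equal_var=False`) only; no normality check or Mann-Whitney U test was run for that variable.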

For the numerical variables, I performed a multiple linear regression with Trip_Price as the dependent variable and the numerical features as predictors. I evaluated the statistical significance of each predictor using p-values:

A variable was considered statistically significant if its p-value was less than 0.05.

It’s important to note that a high coefficient does not imply high importance if the p-value is not significant. For example, Base_Fare had a relatively high coefficient but a high p-value, meaning its effect is not statistically reliable.

On the other hand, Trip_Duration_Minutes had a lower coefficient but a very low p-value, indicating strong statistical significance.

This analysis helped me identify which features truly contribute to predicting the target variable and which ones may appear impactful but are statistically weak.