The objective is to try and separate and predict only Sweden and India separately, to see if the results get better. Attempt that both by ensuring and not ensuring that the training and test set have non-overlapping teams.

In [417]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split, KFold
from functions import calculate_metrics, scale_data

df_all = pd.read_excel('All_data_correct.xlsx')
df_30_features = pd.read_excel('data-30-features-all-outputs.xlsx')
df_all.drop(columns=["Responsibility", "Listening_skills", "Questioning_skills", "Team_participation", "Teamwork_oriented"], inplace=True)

scaler_X = MinMaxScaler()
scaler_Y = MinMaxScaler()
k = 5
kf = KFold(n_splits=k, shuffle=True)

In [418]:
df_se1 = df_all.loc[(df_all['country data collected'] == 'Sweden') & (df_all['Data source'] == 'Ericsson 2018')].reset_index(drop=True)
df_se2 = df_all.loc[(df_all['country data collected'] == 'Sweden') & (df_all['Data source'] == 'Ericsson 2020')].reset_index(drop=True)
df_bz = df_all.loc[(df_all['country data collected'] == 'Brazil') & (df_all['Data source'] == 'Brazil 2021')].reset_index(drop=True)
df_in = df_all.loc[df_all['country data collected'] == 'India'].reset_index(drop=True)

# Model 4 of previous research

In [419]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import statsmodels.api as sm

df_new = df_se2.copy()
X = df_new[['Neuroticism']].values
y = df_new['Participative_safety'].values.reshape(-1,1)

results_sweden = pd.DataFrame()
model = LinearRegression()
avg_score = []
scaled = False
if max(X) != max(y):
    scaled = True
i = 0
for train_index, test_index in kf.split(X):
    
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    if scaled:
        X_train, X_test, y_train, y_test = scale_data(X_train, X_test, y_train, y_test, scaler_X, scaler_Y)
    
    model.fit(X_train, y_train)
    score = model.score(X_test, y_test)
    avg_score.append(score)
    predictions_scaled = model.predict(X_test)
    
    # Inverse transform the predictions and test targets to their original scale
    y_pred = scaler_Y.inverse_transform(predictions_scaled.reshape(-1, 1))
    y_test = scaler_Y.inverse_transform(y_test)
    
    # Make predictions
    # y_pred = model.predict(X_test)
    # Calculate metrics
    mmre, mar, pred_25, r2, mae, mse, rmse = calculate_metrics(y_test, y_pred)
    metrics = {'MMRE': mmre, 'MAR': mar, 'Pred(25)': pred_25, 'R2': r2,
                         'MAE': mae, 'MSE': mse, 'RMSE': rmse}
    
    data = pd.DataFrame([metrics], index=[i])
    results_sweden = pd.concat([results_sweden, data])
    i += 1
    
# Display the full model summary
X_sm = sm.add_constant(X)
# Train the model
model_sm = sm.OLS(y, X_sm).fit()
coefficients = model_sm.params
standard_errors = model_sm.bse
print(f"Test score: {np.mean(avg_score):.2f}%")

print("\nSummary of the trained model:")
print(model_sm.summary())

print("Coefficients and Standard Errors:")
for coef, se in zip(coefficients, standard_errors):
    print(f"Coefficient: {coef:.4f}, Standard Error: {se:.4f}")

Test score: -0.05%

Summary of the trained model:
                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.076
Model:                            OLS   Adj. R-squared:                  0.063
Method:                 Least Squares   F-statistic:                     5.942
Date:                Mon, 05 Aug 2024   Prob (F-statistic):             0.0173
Time:                        22:37:57   Log-Likelihood:                -72.963
No. Observations:                  74   AIC:                             149.9
Df Residuals:                      72   BIC:                             154.5
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
co

# Model 6 of previous methodology

In [420]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import statsmodels.api as sm

df_new = df_in.copy()
X = df_new[['Neuroticism']].values
y = df_new['Participative_safety'].values.reshape(-1,1)

results_india = pd.DataFrame()
model = LinearRegression()
avg_score = []
scaled = False
if max(X) != max(y):
    scaled = True
i = 0
for train_index, test_index in kf.split(X):
    
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    if scaled:
        X_train, X_test, y_train, y_test = scale_data(X_train, X_test, y_train, y_test, scaler_X, scaler_Y)
    
    model.fit(X_train, y_train)
    score = model.score(X_test, y_test)
    avg_score.append(score)
    predictions_scaled = model.predict(X_test)
    
    # Inverse transform the predictions and test targets to their original scale
    y_pred = scaler_Y.inverse_transform(predictions_scaled.reshape(-1, 1))
    y_test = scaler_Y.inverse_transform(y_test)
    
    # Make predictions
    # y_pred = model.predict(X_test)
    # Calculate metrics
    mmre, mar, pred_25, r2, mae, mse, rmse = calculate_metrics(y_test, y_pred)
    metrics = {'MMRE': mmre, 'MAR': mar, 'Pred(25)': pred_25, 'R2': r2,
                         'MAE': mae, 'MSE': mse, 'RMSE': rmse}
    
    data = pd.DataFrame([metrics], index=[i])
    results_india = pd.concat([results_india, data])
    i += 1
    
# Display the full model summary
X_sm = sm.add_constant(X)
# Train the model
model_sm = sm.OLS(y, X_sm).fit()
coefficients = model_sm.params
standard_errors = model_sm.bse
print(f"Test score: {np.mean(avg_score):.2f}%")

print("\nSummary of the trained model:")
print(model_sm.summary())

print("Coefficients and Standard Errors:")
for coef, se in zip(coefficients, standard_errors):
    print(f"Coefficient: {coef:.4f}, Standard Error: {se:.4f}")

Test score: -0.29%

Summary of the trained model:
                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.116
Model:                            OLS   Adj. R-squared:                  0.096
Method:                 Least Squares   F-statistic:                     5.774
Date:                Mon, 05 Aug 2024   Prob (F-statistic):             0.0205
Time:                        22:37:57   Log-Likelihood:                -50.010
No. Observations:                  46   AIC:                             104.0
Df Residuals:                      44   BIC:                             107.7
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
co

In [421]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import statsmodels.api as sm

df_new = df_se2.copy()
X = df_new[['Neuroticism']].values
y = df_new['Participative_safety'].values.reshape(-1,1)

results_sweden_split = pd.DataFrame()
model = LinearRegression()
avg_score = []
scaled = False
if max(X) != max(y):
    scaled = True
    
for i in range(k):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    if scaled:
        X_train, X_test, y_train, y_test = scale_data(X_train, X_test, y_train, y_test, scaler_X, scaler_Y)
    
    model.fit(X_train, y_train)
    score = model.score(X_test, y_test)
    avg_score.append(score)
    predictions_scaled = model.predict(X_test)
    
    # Inverse transform the predictions and test targets to their original scale
    y_pred = scaler_Y.inverse_transform(predictions_scaled.reshape(-1, 1))
    y_test = scaler_Y.inverse_transform(y_test)
    
    # Make predictions
    # y_pred = model.predict(X_test)
    # Calculate metrics
    mmre, mar, pred_25, r2, mae, mse, rmse = calculate_metrics(y_test, y_pred)
    metrics = {'MMRE': mmre, 'MAR': mar, 'Pred(25)': pred_25, 'R2': r2,
                         'MAE': mae, 'MSE': mse, 'RMSE': rmse}
    
    data = pd.DataFrame([metrics], index=[i])
    results_sweden_split = pd.concat([results_sweden_split, data])
    
# # Display the full model summary
# X_sm = sm.add_constant(X)
# # Train the model
# model_sm = sm.OLS(y, X_sm).fit()
# coefficients = model_sm.params
# standard_errors = model_sm.bse
# print(f"Test score: {np.mean(avg_score):.2f}%")
# 
# print("\nSummary of the trained model:")
# print(model_sm.summary())
# 
# print("Coefficients and Standard Errors:")
# for coef, se in zip(coefficients, standard_errors):
#     print(f"Coefficient: {coef:.4f}, Standard Error: {se:.4f}")

In [422]:
results_sweden_split.describe()

Unnamed: 0,MMRE,MAR,Pred(25),R2,MAE,MSE,RMSE
count,5.0,5.0,5.0,5.0,5.0,5.0,5.0
mean,0.108039,0.470077,100.0,-0.409425,0.470077,0.308247,0.5552
std,0.0,6.206335e-17,0.0,0.0,6.206335e-17,0.0,0.0
min,0.108039,0.470077,100.0,-0.409425,0.470077,0.308247,0.5552
25%,0.108039,0.470077,100.0,-0.409425,0.470077,0.308247,0.5552
50%,0.108039,0.470077,100.0,-0.409425,0.470077,0.308247,0.5552
75%,0.108039,0.470077,100.0,-0.409425,0.470077,0.308247,0.5552
max,0.108039,0.470077,100.0,-0.409425,0.470077,0.308247,0.5552


In [423]:
results_sweden.describe()

Unnamed: 0,MMRE,MAR,Pred(25),R2,MAE,MSE,RMSE
count,5.0,5.0,5.0,5.0,5.0,5.0,5.0
mean,0.150833,0.493878,91.809524,-0.04827,0.493878,0.445385,0.643755
std,0.067167,0.099205,5.816545,0.237244,0.099205,0.285635,0.196738
min,0.096779,0.379072,85.714286,-0.436526,0.379072,0.224076,0.473366
25%,0.106693,0.446502,86.666667,-0.027166,0.446502,0.271121,0.520693
50%,0.117231,0.457441,93.333333,-0.006383,0.457441,0.314794,0.561065
75%,0.175737,0.555179,93.333333,0.016135,0.555179,0.494544,0.703238
max,0.257725,0.631194,100.0,0.212592,0.631194,0.92239,0.960411


In [424]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import statsmodels.api as sm

df_new = df_in.copy()
X = df_new[['Neuroticism']].values
y = df_new['Participative_safety'].values.reshape(-1,1)

results_india_split = pd.DataFrame()
model = LinearRegression()
avg_score = []
scaled = False
if max(X) != max(y):
    scaled = True
    
for i in range(k):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
    if scaled:
        X_train, X_test, y_train, y_test = scale_data(X_train, X_test, y_train, y_test, scaler_X, scaler_Y)
    
    model.fit(X_train, y_train)
    score = model.score(X_test, y_test)
    avg_score.append(score)
    predictions_scaled = model.predict(X_test)
    
    # Inverse transform the predictions and test targets to their original scale
    y_pred = scaler_Y.inverse_transform(predictions_scaled.reshape(-1, 1))
    y_test = scaler_Y.inverse_transform(y_test)
    
    # Make predictions
    # y_pred = model.predict(X_test)
    # Calculate metrics
    mmre, mar, pred_25, r2, mae, mse, rmse = calculate_metrics(y_test, y_pred)
    metrics = {'MMRE': mmre, 'MAR': mar, 'Pred(25)': pred_25, 'R2': r2,
                         'MAE': mae, 'MSE': mse, 'RMSE': rmse}
    
    data = pd.DataFrame([metrics], index=[i])
    results_india_split = pd.concat([results_india_split, data])
    
# # Display the full model summary
# X_sm = sm.add_constant(X)
# # Train the model
# model_sm = sm.OLS(y, X_sm).fit()
# coefficients = model_sm.params
# standard_errors = model_sm.bse
# print(f"Test score: {np.mean(avg_score):.2f}%")
# 
# print("\nSummary of the trained model:")
# print(model_sm.summary())
# 
# print("Coefficients and Standard Errors:")
# for coef, se in zip(coefficients, standard_errors):
#     print(f"Coefficient: {coef:.4f}, Standard Error: {se:.4f}")

In [425]:
results_india_split.describe()

Unnamed: 0,MMRE,MAR,Pred(25),R2,MAE,MSE,RMSE
count,5.0,5.0,5.0,5.0,5.0,5.0,5.0
mean,0.207003,0.707736,76.0,0.058118,0.707736,0.651748,0.800649
std,0.05869,0.115947,16.733201,0.195312,0.115947,0.201078,0.115706
min,0.162546,0.628872,50.0,-0.2768,0.628872,0.524306,0.72409
25%,0.172596,0.6409,70.0,0.066257,0.6409,0.545468,0.738558
50%,0.175043,0.674115,80.0,0.123453,0.674115,0.552238,0.743127
75%,0.220851,0.683639,90.0,0.15796,0.683639,0.632935,0.795572
max,0.303981,0.911151,90.0,0.219719,0.911151,1.003795,1.001896


In [426]:
results_india.describe()

Unnamed: 0,MMRE,MAR,Pred(25),R2,MAE,MSE,RMSE
count,5.0,5.0,5.0,5.0,5.0,5.0,5.0
mean,0.177125,0.63829,74.222222,-0.293339,0.63829,0.5933,0.766347
std,0.032488,0.078197,13.662601,0.352919,0.078197,0.128332,0.086684
min,0.136122,0.556336,60.0,-0.864496,0.556336,0.399383,0.631968
25%,0.164516,0.5861,66.666667,-0.358623,0.5861,0.540036,0.734871
50%,0.17788,0.623818,66.666667,-0.198149,0.623818,0.64586,0.803654
75%,0.181369,0.669202,88.888889,-0.100768,0.669202,0.647395,0.804608
max,0.225737,0.755992,88.888889,0.055342,0.755992,0.733824,0.856635


In [427]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GroupKFold
import statsmodels.api as sm

# Sample DataFrame
df = df_in.copy()
# Define features and target
X = df[['Neuroticism']]
y = df['Participative_safety']
groups = df['Team ID']

# Initialize scalers
feature_scaler = StandardScaler()
target_scaler = StandardScaler()

# Fit and transform the features and target
X_scaled = feature_scaler.fit_transform(X)
y_scaled = target_scaler.fit_transform(y.values.reshape(-1, 1)).flatten()

# Initialize GroupKFold
gkf = GroupKFold(n_splits=5)

# Lists to store metrics and summaries
mmre_list = []
mar_list = []
pred25_list = []

for train_index, test_index in gkf.split(X_scaled, y_scaled, groups=groups):
    train_teams = groups.iloc[train_index].unique()
    test_teams = groups.iloc[test_index].unique()
    assert not np.any(np.isin(test_teams, train_teams)), "Data leakage detected between train and test sets!"
    
    X_train, X_test = X_scaled[train_index], X_scaled[test_index]
    y_train, y_test = y_scaled[train_index], y_scaled[test_index]
    
    # Add constant to the features for statsmodels
    X_train_sm = sm.add_constant(X_train)
    X_test_sm = sm.add_constant(X_test)
    
    # Fit the model using statsmodels
    model = sm.OLS(y_train, X_train_sm).fit()
    
    # Print summary
    print(f"\nFold Summary:\n{model.summary()}")
    
    # Make predictions
    y_pred = model.predict(X_test_sm)
    # Verify inverse transformation
    print(f"Original y_test[:5]: {target_scaler.inverse_transform(y_test.reshape(-1, 1)).flatten()[:5]}")
    print(f"Predictions[:5]: {target_scaler.inverse_transform(y_pred.reshape(-1, 1)).flatten()[:5]}")
    
    # Inverse transform predictions and actual values
    y_test_original = target_scaler.inverse_transform(y_test.reshape(-1, 1)).flatten()
    y_pred_original = target_scaler.inverse_transform(y_pred.reshape(-1, 1)).flatten()
    
    # Print the indices of each fold to check variability
    # for fold, (train_index, test_index) in enumerate(gkf.split(X_scaled, y_scaled, groups=groups), 1):
    #     print(f"Fold {fold}:")
    #     print(f"Train indices: {train_index[:5]}...")  # Show a sample of indices
    #     print(f"Test indices: {test_index[:5]}...")

    # Calculate metrics
    y_test_mean = np.mean(y_test_original)
    
    # Mean Magnitude of Relative Error (MMRE)
    mmre = np.mean(np.abs((y_test_original - y_pred_original) / y_test_mean))
    mmre_list.append(mmre)
    
    # Mean Absolute Residual (MAR)
    mar = np.mean(np.abs(y_test_original - y_pred_original))
    mar_list.append(mar)
    
    # Percentage of Predictions within 25% (Pred(25))
    pred25 = np.mean(np.abs(y_test_original - y_pred_original) / y_test_original <= 0.25) * 100
    pred25_list.append(pred25)
    # Debugging output
    print(f"MMRE: {mmre:.4f}, MAR: {mar:.4f}, Pred(25): {pred25:.2f}%")
    
# Print the metrics
print(f"Average MMRE: {np.mean(mmre_list):.4f}")
print(f"Average MAR: {np.mean(mar_list):.4f}")
print(f"Average Pred(25): {np.mean(pred25_list):.2f}%")



Fold Summary:
                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.312
Model:                            OLS   Adj. R-squared:                  0.291
Method:                 Least Squares   F-statistic:                     14.98
Date:                Mon, 05 Aug 2024   Prob (F-statistic):           0.000485
Time:                        22:37:57   Log-Likelihood:                -44.278
No. Observations:                  35   AIC:                             92.56
Df Residuals:                      33   BIC:                             95.67
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         -0.1460      0.150     