In [1]:

import pandas as pd
import numpy as np
from sklearn.svm import SVR
from sklearn.cluster import KMeans
import datetime
import winsound
import random
from imblearn.over_sampling import SMOTE
import scipy.stats as stats
from scipy.stats import loguniform
import sys
from openpyxl import load_workbook
from sklearn.model_selection import StratifiedShuffleSplit
import os
from scipy.stats import chi2_contingency
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.metrics import precision_recall_fscore_support
from imblearn.over_sampling import SVMSMOTE
import os.path
import xgboost as xgb
from scipy.stats import randint
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix
from sklearn.svm import LinearSVC
from scipy.stats import kendalltau
from scipy.stats import pearsonr
from sklearn.svm import SVC
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier, HistGradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.model_selection import RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from scipy.stats import pointbiserialr
from tabulate import tabulate
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.inspection import permutation_importance
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt

# Notebook-wide display settings.
# NumPy: never abbreviate printed arrays with "..." (threshold is set to
# sys.maxsize) and suppress scientific notation for small floats.
np.set_printoptions(suppress=True, threshold=sys.maxsize)
# Matplotlib: draw every figure on a white background.
plt.rcParams['figure.facecolor'] = 'white'

In [2]:
# Column name -> dtype mapping handed to pd.read_csv(dtype=...): every column
# listed here is read as a string instead of letting pandas infer its dtype.
# NOTE(review): both 'material_agent_of_physical_act.' (trailing dot) and
# 'material_agent_of_physical_act' are listed; the dotted variant looks like a
# leftover from a truncated header — confirm against data.csv.
datatypes = dict.fromkeys(
    [
        'business_area',
        'sex',
        'employment_status',
        'time',
        'severity',
        'enterprise_size',
        'citizenship',
        'profession_code',
        'type_of_injury',
        'injured_bodypart',
        'workstation',
        'working_environment',
        'working_process',
        'specific_physical_activity',
        'material_agent_of_physical_act.',
        'deviation',
        'material_agent_of_deviation',
        'contact_mode_of_injury',
        'material_agent_of_contact_mode',
        'general_profession_code',
        'month',
        'weekofyear',
        'dayofweek',
        'material_agent_of_physical_act',
    ],
    str,
)

In [3]:
# Load the raw data set.
# The author's project directory is entered only when it exists on this machine.
# Everywhere else data.csv is read from the current working directory, so the
# notebook can be reproduced by placing data.csv next to it — no need to comment
# the first line out.
author_dir = '/Users/Mario/OneDrive - Tartu Ülikool/IT_mitteinformaatikutele_MSc/Magistritöö/Final code for thesis'
if os.path.isdir(author_dir):
    os.chdir(author_dir)

# The first CSV column is used as the index; the columns named in `datatypes`
# are read as strings.
df = pd.read_csv('data.csv', dtype = datatypes, encoding='latin-1', index_col=0)

# These columns are removed right after loading and take no part in the modelling.
# NOTE(review): presumably raw timestamps plus outcome fields recorded after the
# accident (which would leak the target) — confirm.
df = df.drop(columns= ['date', 'time', 'datetime', 'time_ESAW', 'lost_days', 'type_of_injury', 'injured_bodypart', 'severity'])

In [4]:
# Pick the business sector / occupation class combination to analyse.
sector = 'F'
occup_class = 9

# Keep only the rows of that combination, discard the two (now constant)
# selector columns, drop every row with a missing value and renumber the
# index from 0.
df = (
    df.loc[lambda frame: frame['business_sector'] == sector]
    .loc[lambda frame: frame['general_profession_class'] == occup_class]
    .drop(columns=['business_sector', 'general_profession_class'])
    .dropna()
    .reset_index(drop=True)
)

In [5]:
# Repeated randomised backward feature elimination.
#
# Each of the 200 runs walks through all features in random order. One feature at
# a time is tentatively removed, the model is retrained on an under-sampled
# training split and scored on the held-out split, and the removal is kept when
# the positive-class F1 does not fall more than 0.0005 below the last accepted
# score. Every run contributes one row (surviving features, F1, last queued
# feature, last drawn feature) to an Excel workbook that is extended across
# notebook executions.
#
# NOTE(review): `random` is not seeded here, so the feature order (and therefore
# each run's result) differs between executions.
dff = df.copy()  # untouched reference frame; `df` is rebuilt from it in every step
results_path = f'important_features_for{sector}{str(occup_class)}.xlsx'

# Check if the Excel file already exists
file_exists = os.path.isfile(results_path)

if not file_exists:
    # No earlier results: start from an empty frame with the expected columns
    results_df = pd.DataFrame(columns=["Important", "f1", "last", "k"])
else:
    # Continue the workbook written by earlier executions
    results_df = pd.read_excel(results_path)

# Rows produced by this execution; appended to results_df once, after the loop
new_rows = []

for i in range(200):

    feat_list = dff.columns.tolist()
    feat_list.remove('target')
    drop_list = []       # features currently removed (last entry = candidate under test)
    imp = []             # feature set of the last accepted evaluation
    last_element = None  # last feature queued for removal
    k = None             # last feature drawn from feat_list
    f1 = 0.000           # positive-class F1 of the last accepted evaluation
    rs = 42
    # Candidate estimators; only the one bound to model_in_testing is trained
    rf_model = RandomForestClassifier(random_state=rs)
    lgbm_model = LGBMClassifier(random_state=rs)
    xgb_model = xgb.XGBClassifier(random_state=rs)
    svm_model = SVC(kernel='rbf', probability=True, random_state=rs)
    lr_model = LogisticRegression(max_iter=1000, random_state=rs)
    model_in_testing = lgbm_model

    for _ in range(len(feat_list)):

        # Draw the next removal candidate without replacement
        k = random.choice(feat_list)
        feat_list.remove(k)

        # Evaluate the data WITHOUT everything in drop_list, i.e. without the
        # previously accepted removals and the candidate queued in the last step
        df = dff.drop(columns=drop_list)

        # One-hot encode all string columns
        all_object_cols = list(df.select_dtypes(include=['object']).columns)
        df = pd.get_dummies(df, columns=all_object_cols)

        # Separate independent variables and dependent variable
        X = df.drop(columns=['target'])
        y = df['target']

        # Hold out 20% of the rows as the test set
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
        # Work on explicit copies so the column assignments below never write
        # into a view of X (avoids SettingWithCopyWarning)
        X_train = X_train.copy()
        X_test = X_test.copy()

        # Scale float64 variables to [0, 1]; the scaler is fitted on the training
        # split only. Skipped when no float column is left, because MinMaxScaler
        # raises on a frame with zero features.
        float_cols = X_train.select_dtypes(include='float64').columns.tolist()
        if float_cols:
            scaler = MinMaxScaler(feature_range=(0,1))
            X_train[float_cols] = scaler.fit_transform(X_train[float_cols])
            X_test[float_cols] = scaler.transform(X_test[float_cols])

        # Balance the target classes by random under-sampling (training split only)
        under_sampler = RandomUnderSampler(random_state=rs)
        X_train, y_train = under_sampler.fit_resample(X_train, y_train)

        # NOTE(review): cv_scores and every metric below except f1_class_1 are not
        # used by the selection rule; they are kept so they stay available in the
        # notebook namespace. The cross-validation adds five extra fits per step.
        cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=rs)
        cv_scores = cross_val_score(model_in_testing, X_train, y_train, cv=cv, scoring='f1')

        model_in_testing.fit(X_train, y_train)

        y_pred = model_in_testing.predict(X_test)
        y_pred_prob = model_in_testing.predict_proba(X_test)[:,1]

        acc = accuracy_score(y_test, y_pred)
        report = classification_report(y_test, y_pred)
        auc_roc = roc_auc_score(y_test, y_pred_prob)
        fpr, tpr, _ = roc_curve(y_test, y_pred_prob)
        f1_class_0 = f1_score(y_test, y_pred, pos_label=0)
        f1_class_1 = f1_score(y_test, y_pred, pos_label=1)
        avg_f1 = f1_score(y_test, y_pred, average='weighted')

        if f1_class_1 > (f1 - 0.0005):
            # Accepted: remember the score and the feature set just evaluated,
            # then queue the newly drawn feature k for the next evaluation
            f1 = f1_class_1
            imp = dff.drop(columns=drop_list).columns.tolist()
            imp.remove('target')
            drop_list.append(k)
            last_element = k
        else:
            # Rejected: restore the candidate queued in the previous step.
            # NOTE(review): the feature k drawn in this step was already taken
            # out of feat_list and is not queued, so it is never tested for
            # removal — confirm this is intended.
            if drop_list:
                drop_list.pop()
            else:
                print("The list is already empty.")

    # Create a dictionary of results
    result = {
        "Important": "', '".join(imp),
        "f1": f1,
        "last": last_element,
        "k": k
    }
    new_rows.append(result)
    print(result)

# Append all rows of this execution in one step instead of re-copying the
# frame on every iteration
results_df = pd.concat([results_df, pd.DataFrame(new_rows)], ignore_index=True)

# Save the results DataFrame to the Excel file
results_df.to_excel(results_path, index=False)

{'Important': "business_area', 'age', 'employment_years', 'full_hours_from_startofwork', 'location', 'citizenship', 'profession_code', 'workstation', 'working_environment', 'working_process', 'specific_physical_activity', 'material_agent_of_physical_act', 'deviation', 'material_agent_of_deviation', 'contact_mode_of_injury', 'material_agent_of_contact_mode', 'enterprise_size_ordinal_enc', 'dayofweek', 'month', 'sin_time', 'cos_time', 'is_business_hour', 'temperature', 'rain', 'snowfall', 'cause code 001', 'cause code 003', 'cause code 004', 'cause code 007', 'cause code 008', 'cause code 011', 'cause code 015', 'cause code 019', 'cause code 025", 'f1': 0.5517241379310345, 'last': 'workstation', 'k': 'month'}
{'Important': "business_area', 'age', 'employment_status', 'employment_years', 'full_hours_from_startofwork', 'location', 'citizenship', 'profession_code', 'workstation', 'working_process', 'specific_physical_activity', 'material_agent_of_physical_act', 'deviation', 'material_agent_