### IMPORT FUNZIONI DA SPARK_MODELS

In [1]:
from spark_models import *

### CREAZIONE SPARK SESSION

In [2]:
# Create the Spark session that the data-loading cell passes to load_data.
spark_session = initialize_spark_session()
print("Sessione Spark creata.")

Sessione Spark creata.


### CARICAMENTO DATI

In [3]:
# Load the cleaned crash CSV through the Spark session and preview five rows.
crash_data = load_data(spark_session, "dataset/Traffic_Crashes_cleaned.csv")
crash_data.show(5)

+---------------+------------------+----------------------+------------------+-----------------+------------------+------------------+--------------------+---------+--------------------+-----------+-----------+--------------------+-------------+--------------------+-----------------------+---------+----------------+-----------+------------------+---------+--------------------+--------------+----------+-----------------+-----------+------------+-------------+----+
|CRASH_RECORD_ID|POSTED_SPEED_LIMIT|TRAFFIC_CONTROL_DEVICE|  DEVICE_CONDITION|WEATHER_CONDITION|LIGHTING_CONDITION|  FIRST_CRASH_TYPE|     TRAFFICWAY_TYPE|ALIGNMENT|ROADWAY_SURFACE_COND|ROAD_DEFECT|REPORT_TYPE|          CRASH_TYPE|       DAMAGE|DATE_POLICE_NOTIFIED|PRIM_CONTRIBUTORY_CAUSE|STREET_NO|STREET_DIRECTION|STREET_NAME|BEAT_OF_OCCURRENCE|NUM_UNITS|  MOST_SEVERE_INJURY|INJURIES_TOTAL|CRASH_HOUR|CRASH_DAY_OF_WEEK|CRASH_MONTH|    LATITUDE|    LONGITUDE|YEAR|
+---------------+------------------+----------------------+-----

### STRING INDEXING

In [4]:
# Index the crash data. The preview in the next cell shows `*_index` columns
# plus a `target` column. The second return value (`indexer_model`, presumably
# the fitted indexer — confirm in spark_models) is not used by any later cell
# visible in this notebook.
indexed_data, indexer_model = apply_string_indexing(crash_data)

In [5]:
# Preview five rows of the indexed data.
indexed_data.show(5)

+------------------------+----------------------------+----------------------+-----------------------+------------------------+----------------+---------------------+---------------+--------------------------+-----------------+-----------------+------------+-----------------------------+----------------------+---------------+------------------------+--------------------+----------------+-----------------------+-----------------+----------+------+
|POSTED_SPEED_LIMIT_index|TRAFFIC_CONTROL_DEVICE_index|DEVICE_CONDITION_index|WEATHER_CONDITION_index|LIGHTING_CONDITION_index|CRASH_TYPE_index|TRAFFICWAY_TYPE_index|ALIGNMENT_index|ROADWAY_SURFACE_COND_index|ROAD_DEFECT_index|REPORT_TYPE_index|DAMAGE_index|PRIM_CONTRIBUTORY_CAUSE_index|STREET_DIRECTION_index|NUM_UNITS_index|MOST_SEVERE_INJURY_index|INJURIES_TOTAL_index|CRASH_HOUR_index|CRASH_DAY_OF_WEEK_index|CRASH_MONTH_index|YEAR_index|target|
+------------------------+----------------------------+----------------------+--------------------

### VETTORIZZAZIONE

In [9]:
# Build the `features` / `target` DataFrame (the recorded preview shows
# 21-dimensional sparse vectors). `feature_names` is forwarded to the
# random-forest cell further down.
vectorized_df, feature_names = vectorize_features(indexed_data)
vectorized_df.show(5)

+--------------------+------+
|            features|target|
+--------------------+------+
|(21,[0,2,5,6,10,1...|   1.0|
|(21,[1,2,10,12,13...|   2.0|
|(21,[5,10,11,12,1...|   5.0|
|(21,[0,5,6,7,10,1...|   5.0|
|(21,[0,6,10,12,14...|   4.0|
+--------------------+------+
only showing top 5 rows



### SPLIT DATAFRAME (0,7-0,3)

In [10]:
# Split into train and test sets. The section heading states 0.7 / 0.3; no
# ratio or seed is passed here, so both come from split_data itself.
# NOTE(review): `split_data` is called again further down with (X, y) and four
# return values, after `from ecoc_method import *`. If both parts ever share a
# kernel, that later star import rebinds this name — confirm before re-running
# this cell out of order.
train_df, test_df = split_data(vectorized_df)

### MULTICLASS LOGISTIC REGRESSION

In [11]:
# Train the logistic-regression model on train_df and report the accuracy it
# returns for test_df (0.5029 in the recorded run).
# NOTE(review): the `_cv` suffix suggests cross-validation — confirm in
# spark_models.
accuracy = train_logistic_regression_cv(train_df, test_df)

print(f"Accuracy sul test set: {accuracy:.4f}")

Accuracy sul test set: 0.5029


### RANDOM FOREST CLASSIFIER (WITH FEATURE IMPORTANCE)

In [None]:
# Train the random-forest model. `feature_names` is forwarded, presumably for
# the feature-importance report named in the section heading (confirm in
# spark_models).
# NOTE: this rebinds `accuracy`, replacing the logistic-regression value.
accuracy = train_random_forest_cv(train_df, test_df, feature_names=feature_names)
print(f"Accuracy finale:{accuracy:.4f}")

# ECOC METHOD

## Import

In [1]:
from ecoc_method import *

### Definizione parametri 

In [2]:
# Relative path of the input CSV handed to load_crash_data.
FILEPATH = 'dataset/Traffic_Crashes_cleaned.csv'

# Column handed to clean_crash_target_and_separate_xy as the target.
TARGET_COLUMN = 'FIRST_CRASH_TYPE'

# Columns handed to load_crash_data; the target column is listed last.
COLUMNS_TO_KEEP = [
    'POSTED_SPEED_LIMIT',
    'TRAFFIC_CONTROL_DEVICE',
    'DEVICE_CONDITION',
    'WEATHER_CONDITION',
    'LIGHTING_CONDITION',
    'ROADWAY_SURFACE_COND',
    'ROAD_DEFECT',
    'CRASH_TYPE',
    'NUM_UNITS',
    'CRASH_HOUR',
    'CRASH_DAY_OF_WEEK',
    'CRASH_MONTH',
    'FIRST_CRASH_TYPE',
]

### Caricamento e preparazione preliminare dei dati

In [3]:
# Load the CSV with the configured column list, then separate features (X_raw)
# from the target (y_raw). In the recorded run the cleaning step reported 559
# missing target values and announced that the corresponding rows are removed.
raw_dataframe = load_crash_data(FILEPATH, COLUMNS_TO_KEEP)
X_raw, y_raw = clean_crash_target_and_separate_xy(raw_dataframe, TARGET_COLUMN)

Trovati 559 valori mancanti nel target. Le righe corrispondenti verranno rimosse.


### Suddivisione dei dati

In [4]:
# Train/test split of the raw features and target.
# NOTE(review): an earlier cell calls `split_data` with a single DataFrame and
# two return values; this call relies on the `split_data` bound by
# `from ecoc_method import *`. No split ratio or seed is passed here — confirm
# the defaults in ecoc_method.
X_train_raw, X_test_raw, y_train_raw, y_test_raw = split_data(X_raw, y_raw)

### Preprocessing

In [5]:
# Run the preprocessor on the raw train/test splits. Besides the processed
# features and encoded targets it hands back `label_encoder`, whose `classes_`
# attribute the evaluation cell reads.
preprocessor = CrashDataPreprocessor()
(
    X_train_proc,
    X_test_proc,
    y_train_enc,
    y_test_enc,
    label_encoder,
) = preprocessor.prepare_for_modelling(X_train_raw, X_test_raw, y_train_raw, y_test_raw)

### Inizializzazione e Addestramento

In [6]:
from sklearn.linear_model import LogisticRegression

# Base estimator handed to ECOCClassifier: logistic regression with the SAGA
# solver, balanced class weights, at most 2000 iterations and a fixed
# random_state.
base_estimator = LogisticRegression(
    C=1.0,
    class_weight='balanced',
    solver='saga',
    max_iter=2000,
    random_state=42,
    n_jobs=1,
)

# NOTE(review): `code_size` presumably has the same meaning as in
# scikit-learn's OutputCodeClassifier (code length as a multiple of the number
# of classes), and no seed is passed to ECOCClassifier itself — confirm both
# in ecoc_method.
ecoc_model = ECOCClassifier(
    base_estimator_instance=base_estimator,
    code_size=2.0,
    n_jobs=1,
)
ecoc_model.train(X_train_proc, y_train_enc)

### Valutazione modello

In [None]:
# Evaluate on the processed test split using the label encoder's class names,
# then plot the confusion matrix that evaluate() returns.
class_names = label_encoder.classes_
confusion_mat = ecoc_model.evaluate(
    X_test_proc,
    y_test_enc,
    class_names=class_names,
)
ecoc_model.plot_confusion_matrix(
    confusion_mat,
    class_names=class_names,
)