In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Read the dataset CSV, report how many rows and columns it has, and preview
# the first 20 rows as the cell output.
dataset = pd.read_csv("dataset/nps_simulated_dataset_gaussiano_0408_v4_reduced.csv")
print("Number of rows: ", len(dataset))
print("Number of columns: ", len(dataset.columns))
dataset.head(n=20)

In [None]:
# Print the full per-column summary (dtype and non-null count) of the loaded frame.
dataset.info(verbose=True)

In [None]:
# Count the missing values in each column.
dataset.isnull().sum()

In [None]:
# Descriptive statistics (count, mean, std, min, quartiles, max) for the
# columns that `describe()` includes by default.
dataset.describe()

In [None]:
# Look at one raw `wind_dir` entry (row label 1) and at its Python type, to see
# how the column came out of the CSV parser.
print(dataset.loc[1, "wind_dir"])
print(type(dataset.loc[1, "wind_dir"]))


In [None]:
# Work on a deep copy so the frame loaded from disk stays untouched.
# NOTE(review): the name is spelled `datset_copy` (sic); other cells reference
# it under this spelling, so it is kept as-is.
datset_copy = dataset.copy(deep=True)

In [None]:
def convert_string_to_array(s):
    """Parse a bracketed list of numbers stored as text into a float array.

    Parameters
    ----------
    s : str or any
        Text such as ``"[1.0, 2.5, 3.0]"`` or ``"[1. 2.5 3.]"``. Commas and
        any run of whitespace (including newlines) are accepted as
        separators. Any non-string value is treated as already parsed.

    Returns
    -------
    numpy.ndarray or any
        A 1-D ``float64`` array for string input (empty for ``"[]"``); for
        anything that is not a string, the argument itself, unchanged.

    Raises
    ------
    ValueError
        If a token cannot be converted to ``float``.
    """
    if not isinstance(s, str):
        return s

    # Splitting on whitespace after turning commas into spaces accepts both
    # "[1, 2]" and "[1. 2.]". Converting token by token makes a malformed
    # value raise instead of silently truncating the array at the first
    # token that cannot be parsed.
    tokens = s.replace('[', '').replace(']', '').replace(',', ' ').split()
    return np.asarray([float(tok) for tok in tokens], dtype=float)

# Replace every `wind_dir` entry with the result of convert_string_to_array
# (which hands non-string entries back unchanged).
datset_copy["wind_dir"] = datset_copy["wind_dir"].apply(convert_string_to_array)
# Disabled: the same conversion applied to a "concentration" column.
#datset_copy["concentration"] = datset_copy["concentration"].apply(convert_string_to_array)

In [None]:
# Display the `wind_dir` column of the working copy.
datset_copy["wind_dir"]

In [None]:
# Print the per-column dtype / non-null summary of the working copy.
datset_copy.info(verbose=True)

In [None]:
# Plot a 50-bin histogram for each of the selected columns on one large figure.
datset_copy_subset = datset_copy.loc[:, [
    "sensor_x", "sensor_y", "sensor_noise", "sensor_height", "days",
    "wind_speed", "source_x", "source_y", "source_h", "emission_rate", "RH",
]]
datset_copy_subset.hist(bins=50, figsize=(20, 20))
plt.suptitle("Istogrammi per la distribuzione delle feature", fontsize=27)
plt.show()

In [None]:
# One bar chart of value frequencies per categorical column, each on its own figure.
categorical_cols = ["stability_profile", "stability_value", "aerosol_type", "wind_type", "humidify"]

for col in categorical_cols:
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.countplot(x=col, data=datset_copy, ax=ax)
    ax.set_title(f"Distribuzione di {col}")
    ax.set_xlabel("Categoria")
    ax.set_ylabel("Frequenza")
    # Tilt the tick labels of the current axes so long category names stay readable.
    plt.xticks(rotation=45)
    fig.tight_layout()
    plt.show()

In [None]:
# Drop the five columns below from the working copy in a single call, then
# print the remaining schema.
datset_copy = datset_copy.drop(
    columns=["simulation_id", "sensor_id", "stability_profile", "sensor_height", "days"]
)
datset_copy.info(verbose=True)

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from windrose import WindroseAxes

def plot_windrose_samples(dataset, indices):
    """Draw one wind-rose figure per requested row of ``dataset["wind_dir"]``.

    Parameters
    ----------
    dataset : DataFrame-like
        Must provide a ``"wind_dir"`` column that supports ``.iloc``; each
        selected entry is converted with ``np.array``.
    indices : int or iterable of int
        Positional row index(es). A bare ``int`` is treated as a
        one-element list.

    Returns
    -------
    None
        Each figure is displayed with ``plt.show()``.
    """
    sample_positions = [indices] if isinstance(indices, int) else indices

    for i in sample_positions:
        directions = np.array(dataset["wind_dir"].iloc[i])
        # Uniform weight for every direction sample.
        weights = np.ones_like(directions)

        rose_ax = WindroseAxes.from_ax(fig=plt.figure(figsize=(6, 6)))
        rose_ax.bar(directions, weights, normed=True, opening=0.8, edgecolor='white')
        rose_ax.set_legend(title="Frequenza (%)")
        rose_ax.set_title(f"Rosa dei venti - direzione solo - campione {i}")
        plt.show()

# Draw wind roses for five rows of the working copy, selected by position.
plot_windrose_samples(datset_copy, [0, 10, 100, 250, 1000]) 


In [None]:
# Reduce one entry's wind-direction samples to the mean of their unit vectors.
def calculate_mean_direction(wind_dir_array):
    """Return the mean cosine and mean sine of a set of wind directions.

    The angles are converted from degrees to radians, mapped to their
    (cos, sin) components, and each component is averaged separately.

    Parameters
    ----------
    wind_dir_array : str or array-like
        Directions in degrees. A string such as ``"[0, 90]"`` or
        ``"[0. 90.]"`` is parsed first; commas and whitespace are both
        accepted as separators.

    Returns
    -------
    tuple
        ``(mean_cos, mean_sin)``. Both are NaN when there are no samples.

    Raises
    ------
    ValueError
        If a string token cannot be converted to ``float``.
    """
    if isinstance(wind_dir_array, str):
        # Token-wise parsing: a malformed value raises instead of silently
        # truncating the array at the first token that cannot be parsed.
        tokens = (
            wind_dir_array.replace('[', '').replace(']', '').replace(',', ' ').split()
        )
        wind_dir_array = [float(tok) for tok in tokens]

    directions = np.asarray(wind_dir_array)
    if directions.size == 0:
        # No samples: report NaN explicitly rather than through np.mean's
        # "mean of empty slice" RuntimeWarning.
        return np.nan, np.nan

    wind_dir_rad = np.radians(directions)
    return np.mean(np.cos(wind_dir_rad)), np.mean(np.sin(wind_dir_rad))

# Compute the (mean cos, mean sin) pair for every row's wind directions.
mean_directions = np.array(list(map(calculate_mean_direction, datset_copy["wind_dir"])))

# Store the two components as separate columns of the working copy.
datset_copy['wind_dir_cos'] = mean_directions[:, 0]
datset_copy['wind_dir_sin'] = mean_directions[:, 1]

In [None]:
# Remove the raw `wind_dir` column from the working copy.
datset_copy = datset_copy.drop(columns=["wind_dir"])

In [None]:
# Build a separate copy in which three categorical columns are replaced by
# integer codes; `datset_copy` itself is not modified here.
datset_copy_corr = datset_copy.copy()

# Label -> integer code lookups. Codes follow the order in which the labels
# are listed.
wind_type_map = {
    label: code
    for code, label in enumerate(("CONSTANT", "FLUCTUATING", "PREVAILING"), start=1)
}

pg_stability_map = {
    label: code
    for code, label in enumerate(
        (
            "PasquillGiffordStability.VERY_UNSTABLE",
            "PasquillGiffordStability.MODERATELY_UNSTABLE",
            "PasquillGiffordStability.SLIGHTLY_UNSTABLE",
            "PasquillGiffordStability.NEUTRAL",
            "PasquillGiffordStability.MODERATELY_STABLE",
            "PasquillGiffordStability.VERY_STABLE",
        ),
        start=1,
    )
}

nps_type_map = {
    label: code
    for code, label in enumerate(
        (
            "CANNABINOID_ANALOGUES",
            "CATHINONE_ANALOGUES",
            "PHENETHYLAMINE_ANALOGUES",
            "PIPERAZINE_ANALOGUES",
            "TRYPTAMINE_ANALOGUES",
            "FENTANYL_ANALOGUES",
            "OTHER_COMPOUNDS",
        )
    )
}

# Cast to str before the lookup so the values are compared as text; any label
# that is not a key of its map becomes NaN.
datset_copy_corr['wind_type'] = datset_copy_corr['wind_type'].astype(str).map(wind_type_map)
datset_copy_corr['stability_value'] = datset_copy_corr['stability_value'].astype(str).map(pg_stability_map)
datset_copy_corr['aerosol_type'] = datset_copy_corr['aerosol_type'].astype(str).map(nps_type_map)

# Preview the encoded columns.
datset_copy_corr[["wind_type", "stability_value", "aerosol_type"]].head()

In [None]:
# Print the per-column dtype / non-null summary of the encoded copy.
datset_copy_corr.info(verbose=True)

In [None]:
# Remove these two columns from the encoded copy only.
# NOTE(review): "contratio_series" may be a misspelling — confirm it matches
# the CSV header.
datset_copy_corr = datset_copy_corr.drop(columns=["contratio_series", "real_concentration"])

In [None]:
# Pairwise correlation of the encoded copy's columns, drawn as an annotated heatmap.
matrix_corr = datset_copy_corr.corr()
fig, ax = plt.subplots(figsize=(15, 8))
sns.heatmap(matrix_corr, annot=True, fmt=".2f", cmap='coolwarm', linewidths=0.5, ax=ax)
ax.set_title("Matrice di Correlazione")
fig.tight_layout()
plt.show()

In [None]:
# Drop the `humidify` column from the working copy and print the resulting schema.
datset_copy = datset_copy.drop(columns=["humidify"])
datset_copy.info(verbose=True)

# MODEL

In [None]:
# Save the processed frame (`datset_copy`) to a new CSV file; index=False keeps
# the row index out of the file.
datset_copy.to_csv("dataset/nps_simulated_dataset_gaussiano_0408_v4_processed_reduced.csv", index=False)

In [None]:
import pandas as pd
import numpy as np
# Rebind `datset_copy` to the frame read back from the processed CSV (the same
# path the earlier `to_csv` call writes to).
datset_copy=pd.read_csv("dataset/nps_simulated_dataset_gaussiano_0408_v4_processed_reduced.csv")