Make Functions Work for Classification Export 41


In [1]:
import os
import pandas as pd
from pandasgui import show
import numpy as np
from fancyimpute import IterativeImputer
from datetime import datetime

In [2]:
# <41-1> Function to process Export 41 noise data
def generate_new_41(file_path):
    import os
    import pandas as pd
    
    file_list = []
    for file in os.listdir(file_path):
        file_list.append(os.path.join(file_path, file))
    print(file_list)

    for file in file_list:
        # Read Files
        df = pd.read_csv(file, delimiter=';')

        # Convert 'result_timestamp' column to datetime format
        df['result_timestamp'] = pd.to_datetime(df['result_timestamp'], format='%d/%m/%Y %H:%M:%S.%f')

        # create new columns for year, month, day, hour, minute, and weekday
        df['year'] = df['result_timestamp'].dt.year
        df['month'] = df['result_timestamp'].dt.month
        df['day'] = df['result_timestamp'].dt.day
        df['hour'] = df['result_timestamp'].dt.hour
        df['minute'] = df['result_timestamp'].dt.minute
        df['10min'] = df['minute'].apply(lambda x: int(x/10)*10)
        df['weekday'] = df['result_timestamp'].dt.weekday
        
        # Create the new DataFrame with the desired columns
        new_df = pd.DataFrame()
        new_df['description'] = df['description']
        new_df['year'] = df['year']
        new_df['month'] = df['month']
        new_df['day'] = df['day']
        new_df['hour'] = df['hour']
        new_df['weekday'] = df['weekday']
        new_df['10min'] = df['10min']
        new_df['noise_event_laeq_model_id'] = df['noise_event_laeq_model_id']
        new_df['noise_event_laeq_primary_detected_certainty'] = df['noise_event_laeq_primary_detected_certainty']
        new_df['noise_event_laeq_primary_detected_class'] = df['noise_event_laeq_primary_detected_class']
        
        # Define the new file path
        new_file_path = file[:-4] + '_new.csv'
        
        # Save the new DataFrame as a CSV file
        new_df.to_csv(new_file_path, index=False)

In [3]:
file_path = r'C:\Users\LIE\MDA Project\Sound in Street\export_41\To clean and train'
generate_new_41(file_path)

['C:\\Users\\LIE\\MDA Project\\Sound in Street\\export_41\\To clean and train\\csv_results_41_255439_mp-01-naamsestraat-35-maxim.csv', 'C:\\Users\\LIE\\MDA Project\\Sound in Street\\export_41\\To clean and train\\csv_results_41_255440_mp-02-naamsestraat-57-xior.csv', 'C:\\Users\\LIE\\MDA Project\\Sound in Street\\export_41\\To clean and train\\csv_results_41_255441_mp-03-naamsestraat-62-taste.csv', 'C:\\Users\\LIE\\MDA Project\\Sound in Street\\export_41\\To clean and train\\csv_results_41_255442_mp-05-calvariekapel-ku-leuven.csv', 'C:\\Users\\LIE\\MDA Project\\Sound in Street\\export_41\\To clean and train\\csv_results_41_255443_mp-06-parkstraat-2-la-filosovia.csv', 'C:\\Users\\LIE\\MDA Project\\Sound in Street\\export_41\\To clean and train\\csv_results_41_255444_mp-07-naamsestraat-81.csv', 'C:\\Users\\LIE\\MDA Project\\Sound in Street\\export_41\\To clean and train\\csv_results_41_280324_mp08bis---vrijthof.csv']


In [4]:
# <41-2> Function to create a timestamp column to each site of the Export41, then export
def  create_timestamp(min_time, max_time, file_path):
    import pandas as pd
    from datetime import datetime
    import os
    # Define min and max time
    min_time = pd.to_datetime(min_time)
    max_time = pd.to_datetime(max_time)
    # Create a new dataframe with all possible combinations of time variables
    time_df = pd.DataFrame({'datetime': pd.date_range(min_time, max_time, freq='10min')})

    # Create a list of all the files in the directory
    file_list = []
    for file in os.listdir(file_path):
        file_list.append(os.path.join(file_path, file))
    # Read Files and merge the files with TimeFrame dataframe.
    for file in file_list:    
        df = pd.read_csv(file)
        df['datetime'] = df.apply(lambda x: datetime(x['year'], x['month'], x['day'], x['hour'], int(x['10min'])), axis=1)
        merged_df = pd.merge(time_df, df, how='outer', on='datetime')
        new_file_path = file[:-4] + '_Timestamp.csv'        
        merged_df.to_csv(new_file_path, index=False)

In [5]:
file_path = r'C:\Users\LIE\MDA Project\Sound in Street\export_41\To clean and train\To align with time'
create_timestamp('2022-01-01 00:00', '2023-01-01 00:00', file_path)

In [6]:
#<41-3> Join 42 and Weather
def join_weather42(file_path, weatherFile):
    import os
    import pandas as pd
    from datetime import datetime

    df_weather = pd.read_csv(weatherFile)
    # Rename the column in df_weather to match the column name in df
    df_weather = df_weather.rename(columns={'DATE_BRU': 'datetime'})
    df_weather['datetime'] = df_weather['datetime'].str[:-6]
    file_list = []
    for file in os.listdir(file_path):
        file_list.append(os.path.join(file_path, file))
    print(file_list)
    for file in file_list:
        df = pd.read_csv(file)
        merged_df = pd.merge(df, df_weather, on='datetime', how='outer')
        print(merged_df['noise_event_laeq_primary_detected_class'].value_counts())
        
        # Remove the row 'noise_event_laeq_primary_detected_class' with missing values
        merged_df = merged_df.dropna(subset=['noise_event_laeq_primary_detected_class'])
        print(merged_df['noise_event_laeq_primary_detected_class'].value_counts())
        
        new_file_path = file[:-4] + '_weather.csv'        
        merged_df.to_csv(new_file_path, index=False)

In [7]:
file_path = r'C:\Users\LIE\MDA Project\Sound in Street\export_41\To clean and train\To align with time\To join with weather'
weatherFile = r'C:\Users\LIE\MDA Project\Weather\lc_2022_avgbyid_timecoverted.csv'
join_weather42(file_path, weatherFile)

['C:\\Users\\LIE\\MDA Project\\Sound in Street\\export_41\\To clean and train\\To align with time\\To join with weather\\csv_results_41_255439_mp-01-naamsestraat-35-maxim_new_Timestamp.csv', 'C:\\Users\\LIE\\MDA Project\\Sound in Street\\export_41\\To clean and train\\To align with time\\To join with weather\\csv_results_41_255440_mp-02-naamsestraat-57-xior_new_Timestamp.csv', 'C:\\Users\\LIE\\MDA Project\\Sound in Street\\export_41\\To clean and train\\To align with time\\To join with weather\\csv_results_41_255441_mp-03-naamsestraat-62-taste_new_Timestamp.csv', 'C:\\Users\\LIE\\MDA Project\\Sound in Street\\export_41\\To clean and train\\To align with time\\To join with weather\\csv_results_41_255442_mp-05-calvariekapel-ku-leuven_new_Timestamp.csv', 'C:\\Users\\LIE\\MDA Project\\Sound in Street\\export_41\\To clean and train\\To align with time\\To join with weather\\csv_results_41_255443_mp-06-parkstraat-2-la-filosovia_new_Timestamp.csv', 'C:\\Users\\LIE\\MDA Project\\Sound in Stree

In [8]:
# <41-4> Function to cancatenate the Export41+weather files into one file for machine learning or visualisation

def concatenate_files(file_path):
    import os
    # Create a list of all the files in the directory
    file_list = []
    for file in os.listdir(file_path):
        file_list.append(os.path.join(file_path, file))

    # Read files, concatenate, then export.
    import pandas as pd
    merged_df = pd.DataFrame()
    for file in file_list:    
        df = pd.read_csv(file)
        merged_df = pd.concat([merged_df, df])
    # Export merged_df
    new_file_path = file[:-4]+'_Cancatenated.csv'
    merged_df.to_csv(new_file_path, index=False)

In [9]:
file_path = r'C:\Users\LIE\MDA Project\Sound in Street\export_41\To clean and train\To align with time\To join with weather\To concatenate'
concatenate_files(file_path)


Apply Machine Learning: I ran this on Google Colab

In [49]:
# Random Forests

import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.impute import SimpleImputer

# Read the file
df_41 = pd.read_csv('/content/drive/MyDrive/KUL 2022-2024/SS2022/Modern Data Analytics/Noise/Export 41 and weather.csv') #, nrows = 25000

# Define x and y
x = df_41[['hour', 'month', 'weekday', 'description', 'LC_HUMIDITY', 'LC_DWPTEMP', 'LC_n', 'LC_WINDDIR', 'LC_WINDSPEED', 'LC_TEMP_QCL0']]
#'hour', 'month', 'weekday', 'description', 'LC_HUMIDITY', 'LC_DWPTEMP', 'LC_n', 'LC_RAD', 'LC_RAININ', 'LC_DAILYRAIN', 'LC_WINDDIR', 'LC_WINDSPEED', 'LC_RAD60', 'LC_TEMP_QCL0', 'LC_TEMP_QCL1', 'LC_TEMP_QCL2', 'LC_TEMP_QCL3'
y = df_41['noise_event_laeq_primary_detected_class']

# Initialise the label encoders
label_encoder_x = LabelEncoder()
label_encoder_y = LabelEncoder()

# Encode the 'description' feature as class
x.loc[:, 'description'] = label_encoder_x.fit_transform(x['description'])
# x['description'] = label_encoder_x.fit_transform(x['description'])

# Encode the target variable
y_encoded = label_encoder_y.fit_transform(y)

# Split the data into training and test sets
x_train, x_test, y_train, y_test = train_test_split(x, y_encoded, test_size=0.25, random_state=42)

# Train the model
model = RandomForestRegressor(n_estimators=200, random_state=42)
model.fit(x_train, y_train)

# Perform 10-fold cross-validation
scores = cross_val_score(model, x_train, y_train, cv=10, scoring='neg_mean_squared_error')

# Calculate the mean squared error across folds
mse_cv = -scores.mean()
print(mse_cv)

# Show accuracy during training
accuracy_train = model.score(x_train, y_train)
print("Train Accuracy:", accuracy_train)

# Predict on test set
y_pred = model.predict(x_test)

# Show accuracy during the test
accuracy_test = model.score(x_test, y_test)
print("Test Accuracy:", accuracy_test)

# Print important features with their importances
print("Importances:", model.feature_importances_)

Results of the Application of Random Forests on Noise 41 and Weather

Train Accuracy: 0.6553904009288434
Test Accuracy: 0.04440134775382065
Importances: [0.10718172 0.07749587 0.03636104 0.10887407 0.10937956 0.10844263
 0.12086189 0.11789303 0.11355278 0.09995742]
 
Test accuracy is always poor, so far (n_estimators=100) can give the test accuracy about 0.2;
Adjust the features did not give good prediction too.

In [None]:
# Apply Neuronal Network on Noise 41 and Weather

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.impute import SimpleImputer
from sklearn.neural_network import MLPRegressor

# Read the file
df_41 = pd.read_csv('/content/drive/MyDrive/KUL 2022-2024/SS2022/Modern Data Analytics/Noise/Export 41 and weather.csv')

# Define x and y
x = df_41[['hour', 'month', 'weekday', 'description', 'LC_HUMIDITY', 'LC_DWPTEMP', 'LC_n', 'LC_WINDDIR', 'LC_WINDSPEED', 'LC_TEMP_QCL0']]
y = df_41['noise_event_laeq_primary_detected_class']

# Initialise the label encoders
label_encoder_x = LabelEncoder()
label_encoder_y = LabelEncoder()

# Encode the 'description' feature as class
x.loc[:, 'description'] = label_encoder_x.fit_transform(x['description'])

# Encode the target variable
y_encoded = label_encoder_y.fit_transform(y)

# Split the data into training and test sets
x_train, x_test, y_train, y_test = train_test_split(x, y_encoded, test_size=0.25, random_state=42)

# No need to perform data imputation, already cleaned
# imputer = SimpleImputer()
# x_train = imputer.fit_transform(x_train)
# x_test = imputer.transform(x_test)

# Train the model
model = MLPRegressor(hidden_layer_sizes=(100, 100), max_iter=1000, random_state=42)
model.fit(x_train, y_train)

# Predict on training set
y_train_pred = model.predict(x_train)

# Show accuracy during training
accuracy_train = r2_score(y_train, y_train_pred)
print("Train Accuracy:", accuracy_train)

# Predict on test set
y_test_pred = model.predict(x_test)

# Show accuracy during the test
accuracy_test = r2_score(y_test, y_test_pred)
print("Test Accuracy:", accuracy_test)

# Not available for neural network models to apply importances

Results of the Application of Neuronal Network on Noise 41 and Weather

Train Accuracy: 0.18844162522660735
Test Accuracy: 0.10415058746378669