In [1]:

import sys
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_selection import SelectKBest, f_regression, chi2, mutual_info_regression, SelectPercentile, VarianceThreshold
from sklearn.ensemble import VotingRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from joblib import Parallel, delayed
import numpy as np
from scipy.stats import pearsonr
import multiprocessing
from scipy.cluster import hierarchy
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from collections import Counter
from pathlib import Path
import pprint


sys.path.append('../../../')

from src.emotion.prediction.aggregates.train import HyperparaSearch
from src.emotion.prediction.aggregates.models import MODELS
from src.emotion.prediction.aggregates.test import load_models, generate_predictions, plot_predictions
from src.emotion.utils.constants import DATA_DIR

%matplotlib inline

In [2]:
# Load the feature table; it is joined with the targets on "E-Mail-Adresse" and "Day" in a later cell.
# NOTE(review): absolute, machine-specific path. DATA_DIR is imported at the top of the
# notebook but never used -- presumably it points at this data directory; confirm and
# build the path from it so the notebook runs on other machines.
features = pd.read_csv('/home/moritz/Workspace/masterthesis/data/features_dataset.csv')

In [3]:
# Load the target table; later cells read the columns 'P', 'E', 'R', 'M', 'A' from it.
# NOTE(review): absolute, machine-specific path -- see if DATA_DIR (imported, unused)
# can be used instead; confirm it points at this directory.
targets = pd.read_csv('/home/moritz/Workspace/masterthesis/data/perma_scores_dataset.csv')

In [4]:
# Join features and targets on the (e-mail, day) key pair; rows without a
# partner in the other table are dropped (default inner join).
merge_keys = ["E-Mail-Adresse", "Day"]
df = features.merge(targets, on=merge_keys)

df.shape

(104, 9351)

In [5]:
# Handle missing values: discard every column that contains at least one NaN.
df = df.dropna(axis="columns", how="any")

print(df.shape)


(104, 9348)


In [None]:
# Detect outliers

# Identifier and target columns that a later cell selects by name. The column
# filters below must never remove them, otherwise that selection raises KeyError.
protected_cols = ['ClassID', 'E-Mail-Adresse', 'Day', 'First Name', 'Last Name/Surname', 'P', 'E', 'R', 'M', 'A']

# Remove the rows in which all five PERMA values are identical.
same_PERMA = (df['P'] == df['E']) & (df['E'] == df['R']) & (df['R'] == df['M']) & (df['M'] == df['A'])
df = df[~same_PERMA]
print(df.shape)

# Drop constant feature columns (a single distinct value carries no information).
cols_to_drop = [col for col in df.columns if col not in protected_cols and df[col].nunique() == 1]
df = df.drop(cols_to_drop, axis=1)
print(df.shape)

# Drop feature columns whose values are only 0 or 1.
# The previous mask, `isin([0, 1]).all() & ~isin([0, 1]).any()`, can never be
# True for a non-empty column (if all values match, at least one matches), so
# this step silently kept every column.
is_binary = df.isin([0, 1]).all() & ~df.columns.isin(protected_cols)
df = df.loc[:, ~is_binary]
print(df.shape)

In [None]:
# Split the cleaned frame into targets (Y) and predictors (X).
perma_cols = ['P', 'E', 'R', 'M', 'A']
id_cols = ['ClassID', 'E-Mail-Adresse', 'Day', 'First Name', 'Last Name/Surname']

# Targets: the five PERMA scores.
Y = df[perma_cols]

# Predictors: everything that is neither an identifier nor a target.
X = df.drop(columns=id_cols + perma_cols)

In [None]:
# Scale Features

# Rescale every feature column to the [0, 1] range.
# NOTE(review): the scaler is fitted on all rows before the train/test split
# further down, so test-set minima/maxima leak into the training features.
scaler = MinMaxScaler()

# Keep X's row labels. Y is sliced from the same frame and retains the labels
# left after the row filtering above; without `index=X.index` the scaled frame
# would get a fresh 0..n-1 index and any label-aligned pandas operation between
# X and Y would pair the wrong rows.
X = pd.DataFrame(scaler.fit_transform(X), columns=X.columns, index=X.index)


In [None]:
# Sanity check: (rows, feature columns) of the scaled predictor frame.
X.shape

In [None]:
# Pairwise correlation between all feature columns (square, one row/column per
# feature); this is the input to the hierarchical clustering below.
corr_matrix = X.corr()

In [None]:
def plot_correlation_matrix(matrix):
    """Draw a heatmap of the pairwise correlations between the columns of ``matrix``.

    Parameters
    ----------
    matrix : 2-D table (the notebook passes DataFrames)
        Rows are observations, columns are the variables to correlate.

    Notes
    -----
    Sets the seaborn font scale globally and draws on the current matplotlib
    axes; nothing is returned.
    """
    # Subtract each column's mean.
    centered = matrix - np.mean(matrix, axis=0)

    # np.corrcoef expects one variable per row, hence the transpose.
    column_corr = np.corrcoef(centered.T)

    # Render the correlation matrix as a heatmap.
    sns.set(font_scale=0.7)
    sns.heatmap(column_corr, cmap="YlGnBu")

In [None]:
#plot_correlation_matrix(X)

In [None]:
# Feature selection: runs in ~ 5min
# Step 1: Identify feature clusters
# Complete-linkage hierarchical clustering over the rows of the feature
# correlation matrix, visualised as a dendrogram.
linkage = hierarchy.linkage(corr_matrix, method='complete')

fig, ax = plt.subplots(figsize=(10, 5))
ax.set_title('Dendrogram')
ax.set_xlabel('Data points')
ax.set_ylabel('Distance')
hierarchy.dendrogram(
    linkage,
    leaf_rotation=0.,    # keep x-axis labels horizontal
    leaf_font_size=12.,  # font size for x-axis labels
    ax=ax,
)
plt.show()

In [None]:
# Cut the dendrogram into flat clusters: features whose linkage distance stays
# below the threshold end up with the same cluster label.
max_d = 15  # maximum distance between clusters
clusters = hierarchy.fcluster(linkage, t=max_d, criterion='distance')

clusters

In [None]:
# Count how many feature columns fall into each cluster and plot the sizes.
#
# The sizes are computed straight from the cluster labels. The earlier version
# wrapped the (very large) correlation matrix in a new frame named `df`, which
# overwrote the dataset frame of the same name and made the cleaning cells
# impossible to re-run without reloading the data.
group_sizes = pd.Series(clusters).value_counts().sort_index()

# Plot the group sizes
fig, ax = plt.subplots(figsize=(10, 5))
ax.bar(x=group_sizes.index, height=group_sizes.values, color='blue')
ax.set_title('Group Sizes')
ax.set_xlabel('Group')
ax.set_ylabel('Size')
plt.show()

In [None]:
# Sanity check: target shape, feature shape and number of cluster labels
# (one label per feature column is expected).
print(Y.shape, X.shape, len(clusters), sep="\n")

In [None]:
# Step 3: Keep one representative feature per cluster -- the member with the
# highest mean absolute correlation to the five PERMA targets.
assert len(clusters) == X.shape[1], "expected exactly one cluster label per feature column"

n_features = X.shape[1]

# Joint correlation matrix of [features; targets]; its upper-right block holds
# the feature-vs-target correlations. A dedicated name is used so the
# feature-feature `corr_matrix` needed by the clustering cells stays intact.
feature_target_corr = np.abs(np.corrcoef(X.T, Y.T)[:n_features, n_features:])

# Mean |r| of every feature across the targets.
avg_corr = feature_target_corr.mean(axis=1)

# Undefined correlations (NaN) must never win a cluster, but the cluster still
# needs a representative, so rank them last instead of letting NaN propagate.
ranking_score = np.where(np.isnan(avg_corr), -np.inf, avg_corr)

# Pick the best member inside each cluster. The arg-max is taken among the
# cluster's own members, so a tie in value with a feature from another cluster
# cannot pull that foreign feature in, and each cluster yields exactly one
# column (the first one on ties).
max_indices = []
for cluster_id in np.unique(clusters):
    members = np.flatnonzero(clusters == cluster_id)
    best_member = members[np.argmax(ranking_score[members])]
    max_indices.append(int(best_member))

X_filtered = X.iloc[:, max_indices]
print(X_filtered.shape)



In [None]:
# Visual check: correlations among the per-cluster representative features.
plot_correlation_matrix(X_filtered)

In [None]:
# # Ensemble feature selection (using voting) across multiple filter methods
# # TODO: Maybe create separate feature sets for each PERMA dimension separately?
# def get_selected_features(Y, X_filtered, filter_methods):
#     selected_features = []
    
#     for i in range(Y.shape[1]):
#         y_i = Y.iloc[:, i]
#         selected_i = []
        
#         for method_name, method in filter_methods.items():
#             pipeline = Pipeline([(method_name, method), ('regressor', LinearRegression())])
#             pipeline.fit(X_filtered, y_i)
#             selected_i.append(pipeline.named_steps[method_name].get_support(indices=True))
        
#         selected_i = np.concatenate(selected_i)
#         selected_i = np.unique(selected_i)
#         selected_features.append(selected_i)
        
#     return np.concatenate(selected_features)

# filter_methods = {
#     'f_regression': SelectKBest(f_regression, k=1),
#     'mutual_info_regression': SelectKBest(mutual_info_regression, k=1),
#     'variance_threshold': VarianceThreshold(threshold=0.1),
# }

# all_selected_features = get_selected_features(Y, X_filtered, filter_methods)

# print("Fused selected features:", all_selected_features)




In [None]:
def get_selected_voting_features_multi(Y, X_filtered, filter_methods, k):
    """Vote for features across several filter methods and all target columns.

    Every selector in ``filter_methods`` is fitted once per column of ``Y``;
    each feature it keeps receives one vote. The ``k`` features with the most
    votes are returned.

    Parameters
    ----------
    Y : pandas.DataFrame
        Target table; each column is treated as a separate regression target.
    X_filtered : 2-D table with ``.shape``
        Candidate features, one column per feature.
    filter_methods : dict
        Maps a name to a selector offering ``fit(X, y)`` and
        ``get_support(indices=True)``. The selectors are fitted in place.
    k : int
        Number of feature indices to return. ``k <= 0`` yields an empty array;
        a ``k`` larger than the number of features yields all of them.

    Returns
    -------
    numpy.ndarray
        Column positions of the selected features, ordered by ascending vote
        count (most-voted last). Ties are broken deterministically in favour
        of the higher column position.
    """
    # `[-0:]` would slice the whole array, so handle "nothing requested" first.
    if k <= 0:
        return np.array([], dtype=np.intp)

    feature_counts = np.zeros(X_filtered.shape[1])

    for target_pos in range(Y.shape[1]):
        y_i = Y.iloc[:, target_pos]

        for method in filter_methods.values():
            # Only the selector's support mask is needed; fitting a downstream
            # regressor adds cost without influencing the vote.
            method.fit(X_filtered, y_i)
            selected = np.asarray(method.get_support(indices=True), dtype=np.intp)
            feature_counts[selected] += 1

    # A stable sort makes the result reproducible when vote counts tie.
    return np.argsort(feature_counts, kind="stable")[-k:]

In [None]:
# Filter methods that take part in the vote.
filter_methods = {
    'f_regression': SelectKBest(f_regression, k=10),
    # mutual_info_regression is a randomised estimator; without a fixed seed
    # the voted feature set changes from run to run. 42 matches the seed used
    # for the train/test split further down.
    'mutual_info_regression': SelectKBest(
        lambda X, y: mutual_info_regression(X, y, random_state=42), k=10
    ),
    'variance_threshold': VarianceThreshold(threshold=0.1),
}

# Vote across all five PERMA targets and keep the 15 most-voted features.
all_selected_features = get_selected_voting_features_multi(Y, X_filtered, filter_methods, k=15)

print("Fused selected features:", all_selected_features)


In [None]:
# Reduce the feature table to the voted columns (the set removes duplicate
# positions) and inspect the result.
voted_positions = list(set(all_selected_features))
X_final = X_filtered.iloc[:, voted_positions]

plot_correlation_matrix(X_final)

column_names = X_final.columns.tolist()
print(column_names)

In [None]:
# Model families to exclude from the search. MODELS is edited in place, so the
# removal is visible to everything that imported the same list.
models_to_drop = ["MLPRegressor", "KNeighborsRegressor", "DecisionTreeRegressor", "GradientBoostingRegressor", "SVR"]

for unwanted in models_to_drop:
    # Position of the first entry carrying this name, if there is one.
    hit = next((pos for pos, spec in enumerate(MODELS) if spec["name"] == unwanted), None)
    if hit is not None:
        del MODELS[hit]

# Show what is left.
for model in MODELS:
    print(model["name"])

In [None]:
# Runs in ~5 min for n_folds=5
# Hyper-parameter search over the remaining MODELS, predicting all five PERMA
# targets from the voted feature set, scored by mean absolute error.
# NOTE(review): HyperparaSearch is project code not shown here; later cells
# treat `results` as a list of lists of dicts with the keys
# "name", "metric", "score", "params" and "best_feats" -- confirm against its source.
search = HyperparaSearch(models=MODELS, metrics=["mean_absolute_error"], n_folds=5, n_jobs=-1)

results = search.run(X_final, Y, save=False)

In [None]:
# Map each model name to the "best_feats" entry of its first result record.
best_feats_dict = {
    runs[0]['name']: runs[0]['best_feats']
    for runs in results
}

In [None]:
# For every model, pair its per-feature values with the feature names and rank them.
# NOTE(review): `ranked_feats` is computed but never displayed (the print below is
# commented out), so this cell currently only prints the algorithm names.
for model in best_feats_dict.keys():
    print(f"Algorithm: {model}")
    # Get the feature importance values for the algorithm
    feat_imp_vals = best_feats_dict[model]
    # Map the values onto the feature names by position; this relies on
    # feat_imp_vals being indexable by 0..len(column_names)-1 in column order.
    feat_imp_map = {column_names[i]: feat_imp_vals[i] for i in range(len(column_names))}
    # Rank the features by their importance value in descending order
    ranked_feats = sorted(feat_imp_map.items(), key=lambda x: x[1], reverse=True)
    # Print the ranked features (disabled)
    #print(ranked_feats)

In [None]:
eval_metric = "mean_absolute_error"

# Flatten the nested result lists once, keeping only records of the chosen metric.
matching_records = [
    rd
    for result_list in results
    for rd in result_list
    if rd["metric"] == eval_metric
]
mae_scores = [rd["score"] for rd in matching_records]
model_names = [rd["name"] for rd in matching_records]

# Bar chart: one bar per model.
plt.bar(model_names, mae_scores)
plt.title("Mean Absolute Error Scores")
plt.xlabel("Model")
plt.ylabel("Score")
plt.xticks(rotation=45)
plt.show()

In [None]:
# Print the model with the lowest score for the evaluation metric.
#
# Records of other metrics are filtered out before the comparison. Previously
# the outer `min` compared raw scores without checking the metric, so a model
# group that had no record for `eval_metric` could still win with a score from
# a different metric.
candidates = [
    rd
    for result_list in results
    for rd in result_list
    if rd["metric"] == eval_metric
]
if not candidates:
    raise ValueError(f"No results found for metric {eval_metric!r}")

best_model = min(candidates, key=lambda rd: rd["score"])
print(f"Best model: {best_model['name']}")
print(f"Best params: {best_model['params']}")
print(f"Best Score: {best_model['score']}")

In [None]:
def get_selected_voting_features_uni(Y, X_filtered, filter_methods, k):
    """Vote for features across several filter methods for a single target.

    Every selector in ``filter_methods`` is fitted once on ``(X_filtered, Y)``;
    each feature it keeps receives one vote. The ``k`` features with the most
    votes are returned.

    Parameters
    ----------
    Y : 1-D target (the notebook passes one column of the PERMA table)
    X_filtered : 2-D table with ``.shape``
        Candidate features, one column per feature.
    filter_methods : dict
        Maps a name to a selector offering ``fit(X, y)`` and
        ``get_support(indices=True)``. The selectors are fitted in place.
    k : int
        Number of feature indices to return. ``k <= 0`` yields an empty array;
        a ``k`` larger than the number of features yields all of them.

    Returns
    -------
    numpy.ndarray
        Column positions of the selected features, ordered by ascending vote
        count (most-voted last). Ties are broken deterministically in favour
        of the higher column position.
    """
    # `[-0:]` would slice the whole array, so handle "nothing requested" first.
    if k <= 0:
        return np.array([], dtype=np.intp)

    feature_counts = np.zeros(X_filtered.shape[1])

    for method in filter_methods.values():
        # Only the selector's support mask is needed; fitting a downstream
        # regressor adds cost without influencing the vote.
        method.fit(X_filtered, Y)
        selected = np.asarray(method.get_support(indices=True), dtype=np.intp)
        feature_counts[selected] += 1

    # A stable sort makes the result reproducible when vote counts tie.
    return np.argsort(feature_counts, kind="stable")[-k:]

In [None]:
# Select a separate feature set for every PERMA pillar.
filter_methods = {
    'f_regression': SelectKBest(f_regression, k=10),
    # mutual_info_regression is a randomised estimator; without a fixed seed
    # the voted feature set changes from run to run. 42 matches the seed used
    # for the train/test split further down.
    'mutual_info_regression': SelectKBest(
        lambda X, y: mutual_info_regression(X, y, random_state=42), k=10
    ),
    'variance_threshold': VarianceThreshold(threshold=0.1),
}

# Pillar name -> positions (within X_filtered) of its voted features.
perma_dict = {}
for perma_dim in Y.columns:
    perma_dict[perma_dim] = get_selected_voting_features_uni(Y[perma_dim], X_filtered, filter_methods, k=10)

# Print the selected feature positions for each PERMA pillar
for pillar in perma_dict:
    print(pillar, ":", perma_dict[pillar])


In [None]:
# Build one feature table per PERMA dimension from the voted column positions.
feature_sets = {}

# NOTE(review): the loop rebinds the notebook-level name `X_final` on every
# iteration, so after the loop it holds the table of the LAST dimension only
# (and `column_names` its column names). The train/test split in the next cell
# reads `X_final`, i.e. it depends on this side effect.
for perma_dim, selected_features in perma_dict.items():
    # Select the corresponding columns of X_filtered
    X_final = X_filtered.iloc[:, list(set(selected_features))]
    # Add the selected features for the current PERMA dimension to the feature sets dictionary
    feature_sets[perma_dim] = X_final
    # Print the names of the selected features
    column_names = X_final.columns.tolist()
    print(perma_dim, ":", column_names)


In [None]:
# Hold out 20% of the rows for testing (fixed seed for reproducibility).
# NOTE(review): `X_final` is whatever the preceding feature-set loop assigned last,
# i.e. the feature table of the final key in perma_dict. The per-dimension tables in
# `feature_sets` are not split here, so every dimension is later trained and evaluated
# on this single feature set -- confirm whether that is intended.
X_train, X_test, Y_train, Y_test = train_test_split(X_final, Y, test_size=0.2, random_state=42)

In [None]:
# Root directory for the fitted per-dimension models; one sub-directory per
# PERMA dimension is derived from it in the cells below.
# NOTE(review): absolute, machine-specific path -- consider deriving it from a
# configurable project root.
SAVE_DIR = Path("/home/moritz/Workspace/masterthesis/model/custom_models/univariate/big")

# Metric used to pick the best model in the cells below.
eval_metric = "mean_absolute_error"

In [None]:
# Model families to exclude from the per-dimension search. MODELS is edited in
# place; names that are no longer present are simply skipped.
models_to_drop = ["MLPRegressor", "KNeighborsRegressor", "DecisionTreeRegressor", "SVR"]

for unwanted in models_to_drop:
    # Position of the first entry carrying this name, if there is one.
    hit = next((pos for pos, spec in enumerate(MODELS) if spec["name"] == unwanted), None)
    if hit is not None:
        del MODELS[hit]

# Show what is left.
for model in MODELS:
    print(model["name"])

In [None]:
# Run one hyper-parameter search per PERMA dimension and save the fitted models
# under SAVE_DIR/<dimension>.
# NOTE(review): the loop variable `X_final` (the per-dimension feature table) is
# never used inside the loop body; `search.run` always receives `X_train`, which
# was split from a single feature set in an earlier cell. As written, every
# dimension is trained on the same features. Fixing this also requires
# per-dimension test features in the evaluation cells, so it is flagged rather
# than changed here.
for perma_dim, X_final in feature_sets.items():
    print(f"PERMA dimension: {perma_dim}")
    # Run the hyperparameter search
    models_path = SAVE_DIR / perma_dim
    search = HyperparaSearch(models=MODELS, metrics=["mean_absolute_error"], models_path = models_path, n_folds=5, n_jobs=-1, mode="uni")
    results = search.run(X_train, Y_train[perma_dim], save=True)
    # Print the model with the lowest score
    # (records of other metrics get an infinite key in the inner min; the outer
    # min compares raw scores and relies on every group having a matching record)
    best_model = min(
        [
            min(
                sublist,
                key=lambda x: x["score"]
                if x["metric"] == eval_metric
                else float("inf"),
            )
            for sublist in results
        ],
        key=lambda x: x["score"],
    )
    print(perma_dim)
    print(f"Best model: {best_model['name']}")
    print(f"Best params: {best_model['params']}")
    print(f"Best Score: {best_model['score']}")

In [None]:
# Reload the saved models of every PERMA dimension from SAVE_DIR/<dimension>.
perma_models = {}

for perma_dim in ["P", "E", "R", "M", "A"]:
    print(perma_dim)
    perma_models[perma_dim] = load_models(SAVE_DIR / perma_dim)
    print(perma_models[perma_dim])

In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error

def generate_predictions(models, X, y):
    """Score every stored model on ``(X, y)``.

    Parameters
    ----------
    models : dict
        Maps a model name to a sequence whose first element exposes
        ``best_estimator_`` (an already fitted estimator).
    X : features to predict on.
    y : true target values for ``X``.

    Returns
    -------
    dict
        ``{model_name: {"mae": float, "mse": float, "y_pred": predictions}}``.
        No fitting happens here; the stored estimators are used as they are.
    """
    scores = {}
    for name in models:
        # The first stored search object carries the fitted best estimator.
        estimator = models[name][0].best_estimator_
        predicted = estimator.predict(X)
        squared_err = mean_squared_error(y, predicted)
        absolute_err = mean_absolute_error(y, predicted)
        scores[name] = {"mae": absolute_err, "mse": squared_err, "y_pred": predicted}
    return scores

In [None]:
# Evaluate every reloaded model on the held-out test rows, dimension by dimension.
perma_results = {}

for dim in perma_models:
    results = generate_predictions(perma_models[dim], X_test, Y_test[dim])
    perma_results[dim] = results
    for model_name in results:
        result = results[model_name]
        print(f"{dim} - {model_name}: MAE - {result['mae']}, MSE - {result['mse']}")
    

In [None]:
# For every dimension keep the model with the lowest test MAE
# (the first one wins when several share the same MAE).
best_results = {}

for dim, results in perma_results.items():
    lowest_mae = float("inf")
    for model_name, scores in results.items():
        if scores["mae"] < lowest_mae:
            lowest_mae = scores["mae"]
            best_results[dim] = {"model": model_name, "mae": lowest_mae}

print(best_results)

In [None]:
# Baseline: always predict the training-set mean of each PERMA dimension and
# score it on the held-out test rows -- the same rows the fitted models are
# scored on, so the two MAE values are directly comparable.
#
# The earlier version used the mean of ALL rows and scored on all rows, which
# is not comparable with the test-set MAE of the models. It also created the
# prediction array with Y's own dtype, which would truncate the means if the
# scores are stored as integers.
train_means = Y_train.mean().to_numpy(dtype=float)
Y_baseline = np.tile(train_means, (len(Y_test), 1))
mae_baseline = mean_absolute_error(Y_test, Y_baseline, multioutput='raw_values')

print(f"Baseline MAE for each dimension: {mae_baseline}")

In [None]:
# Compare the best model of every PERMA dimension with the baseline.
perma_dimensions = ['P', 'E', 'R', 'M', 'A']
bar_width = 0.35

# Look the results up BY DIMENSION. Relying on the insertion order of
# `best_results` would silently attach a bar to the wrong label if a dimension
# were missing or had been filled in a different order.
best_mae_values = [best_results[dim]['mae'] for dim in perma_dimensions]
x_pos = np.arange(len(perma_dimensions))

# Set up the plot: baseline and best-model bars side by side.
fig, ax = plt.subplots()
ax.bar(x_pos, mae_baseline, width=bar_width, label='Baseline')
ax.bar(x_pos + bar_width, best_mae_values, width=bar_width, label='Best Models')

# Centre the tick of each dimension between its two bars.
ax.set_xticks(x_pos + bar_width / 2)
ax.set_xticklabels(perma_dimensions)
ax.set_xlabel('PERMA Dimension')

# Leave 10% head-room above the tallest bar.
ax.set_ylabel('MAE')
ax.set_ylim([0, max(np.max(mae_baseline), np.max(best_mae_values)) * 1.1])

# Annotate every bar with its MAE (and the model name for the best-model bars).
for i, dim in enumerate(perma_dimensions):
    ax.text(i, mae_baseline[i] + 0.01, f"MAE: {mae_baseline[i]:.2f}", rotation=90, ha='center', va='bottom', fontsize=8)
    ax.text(i + bar_width, best_mae_values[i] + 0.01, f"{best_results[dim]['model']}\nMAE: {best_mae_values[i]:.2f}", rotation=90, ha='center', va='bottom', fontsize=8)

# Add a legend and title
ax.legend()
ax.set_title('PERMA Dimension MAE Scores')

# Display the plot
plt.show()

In [None]:
# Hand-written mapping from PERMA dimension to a model name.
# NOTE(review): not referenced by any other cell visible in this notebook --
# presumably a record of the chosen model per dimension; confirm or remove.
model_dict = {
    "P": "LinearRegression",
    "E": "LinearRegression",
    "R": "Lasso",
    "M": "LinearRegression",
    "A": "LinearRegression",
}



In [None]:
# Radar (spider) chart over the five PERMA dimensions, drawn from fixed example values.
labels = ['P', 'E', 'R', 'M', 'A']
data = np.array([0.5, 0.8, 0.6, 0.4, 0.9])

# One evenly spaced angle per label.
angles = np.linspace(0, 2*np.pi, len(labels), endpoint=False)

# Repeat the first point at the end so the outline closes.
data = np.append(data, data[0])
angles = np.append(angles, angles[0])

# Polar axes on a square figure.
fig = plt.figure(figsize=(6, 6))
ax = fig.add_subplot(111, polar=True)

# Outline plus translucent fill.
ax.plot(angles, data, 'o-', linewidth=2)
ax.fill(angles, data, alpha=0.25)

# Angular grid labelled with the dimension names; radial axis fixed to [0, 1].
ax.set_thetagrids(angles[:-1] * 180/np.pi, labels)
ax.set_yticks(np.arange(0, 1.1, 0.2))
ax.set_ylim(0, 1)
ax.set_title('PERMA', fontsize=14)