In [11]:
import sys
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_selection import SelectKBest, f_classif, chi2, mutual_info_classif, SelectPercentile, VarianceThreshold
from sklearn.ensemble import VotingClassifier
from sklearn.preprocessing import MinMaxScaler

sys.path.append('../../../')

from src.emotion.prediction.aggregates.train import HyperparaSearch
from src.emotion.prediction.aggregates.models import MODELS
from src.emotion.utils.constants import DATA_DIR

%matplotlib inline

In [12]:
targets = pd.read_csv('/home/moritz/Workspace/masterthesis/data/perma_scores_dataset.csv')
features = pd.read_csv('/home/moritz/Workspace/masterthesis/data/features_dataset.csv')

In [13]:
targets.head()


Unnamed: 0,E-Mail-Adresse,First Name,Last Name/Surname,Day,P,E,R,M,A
0,seabs023@mit.edu,Adam,Seabert,10,-0.691763,0.375963,-1.947447,-0.689777,-1.78531
1,ethan13@mit.edu,Ethan,Lindstrom,10,-0.691763,-1.139111,-1.412543,0.800141,-1.384283
2,kennylum@mit.edu,Mun Kit Kenny,Lum,10,-0.691763,-0.002806,-0.342735,-1.062256,-1.384283
3,yofujii@mit.edu,Yosuke,Fujii,10,-0.691763,-0.381574,-0.610187,-1.062256,-0.983257
4,jbeilste@mit.edu,John,Beilstein,10,0.433394,1.1335,0.727073,0.427661,-0.181204


In [14]:
features.head()

Unnamed: 0,ClassID,Angry__variance_larger_than_standard_deviation,Angry__has_duplicate_max,Angry__has_duplicate_min,Angry__has_duplicate,Angry__sum_values,Angry__abs_energy,Angry__mean_abs_change,Angry__mean_change,Angry__mean_second_derivative_central,...,GazeDifference_Range,MutualGaze_Mean,MutualGaze_StdDev,MutualGaze_Min,MutualGaze_Max,MutualGaze_Range,Std_X_Center,Std_Y_Center,E-Mail-Adresse,Day
0,person_id2,0.0,0.0,0.0,0.0,2792.637265,360.799594,0.000573,-2e-06,-2.220699e-08,...,0.119411,0.668956,0.573385,0.006868,1.0,0.993132,61.431688,7.762492,emlauber@mit.edu,10
1,person_id1,0.0,0.0,0.0,0.0,1772.580875,358.464836,0.000687,-1e-06,2.949638e-08,...,0.990794,0.351151,0.562361,0.004454,1.0,0.995546,12.874554,8.917808,emlauber@mit.edu,12
2,person_id4,0.0,0.0,0.0,0.0,1053.092517,153.263342,0.000619,2.4e-05,-1.724131e-07,...,0.994648,0.504843,0.700258,0.009685,1.0,0.990315,26.467407,21.31317,emlauber@mit.edu,13


In [19]:
df = pd.merge(features, targets, on=["E-Mail-Adresse", "Day"])

df.head()

Unnamed: 0,ClassID,Angry__variance_larger_than_standard_deviation,Angry__has_duplicate_max,Angry__has_duplicate_min,Angry__has_duplicate,Angry__sum_values,Angry__abs_energy,Angry__mean_abs_change,Angry__mean_change,Angry__mean_second_derivative_central,...,Std_Y_Center,E-Mail-Adresse,Day,First Name,Last Name/Surname,P,E,R,M,A
0,person_id2,0.0,0.0,0.0,0.0,2792.637265,360.799594,0.000573,-2e-06,-2.220699e-08,...,7.762492,emlauber@mit.edu,10,Emily,Lauber,0.058341,1.512268,-0.342735,-2.179694,0.620848
1,person_id4,0.0,0.0,0.0,0.0,1053.092517,153.263342,0.000619,2.4e-05,-1.724131e-07,...,21.31317,emlauber@mit.edu,13,Emily,Lauber,1.55855,1.512268,1.261978,1.5451,1.422901


In [20]:
# Handle Missing Values

df.dropna(axis=1, how='any', inplace=True)
df = dataset.loc[:, (df != 0).any(axis=0)]

print(df.shape)


(2, 8936)


In [21]:
# Detect outliers

# Check if all PERMA values are the same in each row
same_PERMA = (df['P'] == df['E']) & (df['E'] == df['R']) & (df['R'] == df['M']) & (df['M'] == df['A'])
# Remove the rows where all PERMA values are the same
df = df[~same_PERMA]
print(df.shape)

# find columns where all values are the same
cols_to_drop = [col for col in df.columns if df[col].nunique() == 1]
# drop the columns
df = df.drop(cols_to_drop, axis=1)
print(df.shape)

(2, 8936)
(2, 8616)


In [None]:
# Load X and Y
# Store the PERMA values in Y
Y = df[['P', 'E', 'R', 'M', 'A']]

# Store the other columns in X
X = df.drop(columns=['ClassID', 'E-Mail-Adresse', 'Day', 'First Name', 'Last Name/Surname', 'P', 'E', 'R', 'M', 'A'])

In [None]:
# Scale Features

# Create a MinMaxScaler object
scaler = MinMaxScaler()

# Fit the scaler to the dataframe and transform the dataframe
X = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)


In [None]:
corr_matrix = X.corr()

In [None]:
def plot_correlation_matrix(data):
    corr_matrix = data.corr()
    sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', vmin=-1, vmax=1)

In [None]:
# Feature selection
# Step 1: Identify feature clusters
# Create a dendrogram using hierarchical clustering
linkage = hierarchy.linkage(corr_matrix, method='complete')
plt.figure(figsize=(10, 5))
plt.title('Dendrogram')
plt.xlabel('Data points')
plt.ylabel('Distance')
hierarchy.dendrogram(
    linkage,
    leaf_rotation=0.,  # Rotate x-axis labels
    leaf_font_size=12.,  # Font size for x-axis labels
)
plt.show()

# Get clusters from the dendrogram
max_d = 0.5  # Maximum distance between clusters
clusters = hierarchy.fcluster(linkage, max_d, criterion='distance')
print('Clusters:', clusters)

# Group columns by cluster
df = pd.DataFrame(corr_matrix)
df.columns = ['col_' + str(i) for i in range(df.shape[1])]
df['cluster'] = clusters
grouped = df.groupby('cluster')

# Print columns in each cluster
for name, group in grouped:
    print('Cluster', name, ':', list(group.columns[:-1]))
    
    
# Step 2: Select features from each cluster
# Identify column with highest correlation to target variable for each cluster
selected_cols = []
for name, group in grouped:
    cols = list(group.columns[:-1])
    max_corr = -1
    selected_col = None
    for col in cols:
        corr = np.corrcoef(group[col], Y)[0, 1]
        if corr > max_corr:
            max_corr = corr
            selected_col = col
    selected_cols.append(selected_col)
    
# Step 3: Select features from each cluster
selected_df = df[selected_cols]

In [None]:
plot_correlation_matrix(selected_df)

In [None]:
# Feature selection
# Step 1: Find index of feature columns with correlation greater than the threshold
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool))

threshold = 0.8
to_drop = [column for column in upper.columns if any(upper[column] > threshold)]

# Step 2: Select one feature from each highly correlated pair
chosen_features = []
for col in to_drop:
    correlated_cols = list(upper.index[upper[col] > threshold])
    importance_scores = [Y[c].corr(X[col]) for c in correlated_cols]
    best_idx = np.argmax(importance_scores)
    chosen_features.append(correlated_cols[best_idx])

# Step 3: Remove the remaining features
X = X.drop(to_drop, axis=1)
X = X[chosen_features]

In [None]:
plot(X)

In [None]:
# Ensemble feature selection using voting across multiple filter methods
def ensemble_feature_selection(X, y, filter_methods):
    # Apply each filter method to the dataset
    selected_features = []
    for name, method in filter_methods.items():
        method.fit(X, y)
        selected = method.get_support()
        selected_features.append(selected)
        print(f"{name}: {selected}")
    
    # Combine the results using a voting classifier
    votes = [(name, method) for name, method in filter_methods.items()]
    voting_clf = VotingClassifier(estimators=votes, voting='hard')
    voting_clf.fit(X, y)
    selected_votes = voting_clf.transform(X)
    selected_features.append(selected_votes)
    print(f"Voting: {selected_votes}")
    
    # Get the final set of features by taking the majority vote
    selected_features = sum(selected_features)
    selected_features = selected_features >= (len(filter_methods) + 1) // 2
    
    return selected_features

# Define the filter methods as a dictionary
filter_methods = {
    'f_classif': SelectKBest(f_classif, k=5),
    'chi2': SelectKBest(chi2, k=5),
    'mutual_info': SelectKBest(mutual_info_classif, k=5),
    'percentile': SelectPercentile(score_func=f_classif, percentile=50),
    'variance_threshold': VarianceThreshold(threshold=0.1)
}
# Perform ensemble feature selection
selected_features = ensemble_feature_selection(X, y, filter_methods)

# Print the selected features
df_filtered = df.iloc[:, selected_features]
df_filtered.head()

In [None]:
# Define the input and target vectors

hyper_search = HyperparaSearch(models=MODELS)

results = hyper_search.run(X, Y, save=True)