In [None]:
import pandas as pd

# Absolute location of the augmented training split (Parquet file)
file_path = "/home/martin/Documents/Exjobb/eed/.data/augmented_data/train/augmented_data_SE_recorded_noise_0_train.parquet"

# Read the complete file, then cast the label column to plain integers
df = pd.read_parquet(file_path)
df = df.astype({"label": int})



In [None]:
# Summary statistics of the loaded frame (displayed as the cell output)
df.describe()
# To summarize a single column instead, e.g.: df["acc_averaged"].describe()


In [None]:
# Print column names, dtypes, non-null counts and memory usage of the loaded frame
df.info()

### Distribution of labels

In [None]:
# Count the samples per label once and reuse the result for the plot,
# the printed table and both percentages.
label_counts = df['label'].value_counts().sort_index()
label_counts.plot(kind='bar')
print(label_counts)

total_samples = label_counts.sum()
# Label 1 is reported as fixations and label 2 as saccades.
# .get(..., 0) yields 0 % instead of a KeyError when a label is absent.
fixation_percentage = round((label_counts.get(1, 0) / total_samples) * 100, 2)
saccade_percentage = round((label_counts.get(2, 0) / total_samples) * 100, 2)

print("Percentage of fixations: ", fixation_percentage, "%")
print("Percentage of saccades: ", saccade_percentage, "%")


### Distribution of labels when only looking at fixations and saccades

In [None]:
# Keep only complete rows. The explicit .copy() makes df_na_dropped an
# independent frame, so the label edits below never touch `df` and cannot
# trigger a SettingWithCopyWarning.
df_na_dropped = df.dropna().copy()
# Report the effect of the NaN filter instead of dumping the whole raw frame
print(f"Rows before / after dropping NaNs: {len(df)} / {len(df_na_dropped)}")

# Reassign labels 3 and 4 to label 1
df_na_dropped['label'] = df_na_dropped['label'].replace({3: 1, 4: 1})

# Remove samples with label 0 and 5
df_na_dropped = df_na_dropped[~df_na_dropped['label'].isin([0, 5])]

# Count once, reuse for the plot, the printed table and the percentages
filtered_label_counts = df_na_dropped['label'].value_counts().sort_index()
filtered_label_counts.plot(kind='bar')
print(filtered_label_counts)

filtered_total = filtered_label_counts.sum()
# .get(..., 0) yields 0 % instead of a KeyError when a label is absent
fixation_percentage = round((filtered_label_counts.get(1, 0) / filtered_total) * 100, 2)
saccade_percentage = round((filtered_label_counts.get(2, 0) / filtered_total) * 100, 2)

print("Percentage of fixations: ", fixation_percentage, "%")
print("Percentage of saccades: ", saccade_percentage, "%")


In [None]:
# Columns that are excluded from the feature matrix (the label plus the
# remaining non-feature columns of the frame)
non_feature_columns = ['label', "x", "y", "t", "status", "file_index", "file_name"]

# Everything that is left is treated as a feature in the cells below
features = df_na_dropped.drop(columns=non_feature_columns)
features.describe()

## PCA analysis of features

In [None]:
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Standardize every feature so that no single scale dominates the projection
scaler = StandardScaler()
normalized_features = scaler.fit_transform(features)

# Project the standardized features onto the first two principal components
pca = PCA(n_components=2)
pca_components = pca.fit_transform(normalized_features)

# Scatter the two components against each other, colored by the sample label
plt.scatter(*pca_components.T, c=df_na_dropped["label"], alpha=0.5)
plt.title('PCA Components')
plt.xlabel('PCA Component 1')
plt.ylabel('PCA Component 2')
plt.colorbar(label='Label')
plt.show()


In [None]:
import umap

# Sample at most 10000 rows (fewer when the frame is smaller, so that
# DataFrame.sample does not raise); the seed keeps the subset reproducible.
n_umap_samples = min(10000, len(df_na_dropped))
sample_df = df_na_dropped.sample(n=n_umap_samples, random_state=42)

# Perform UMAP with 2 components on the sampled data. UMAP is stochastic, so
# random_state is fixed to get the same embedding on every run.
# NOTE(review): unlike the PCA cell, the features are not standardized here.
umap_components = umap.UMAP(n_components=2, random_state=42).fit_transform(sample_df[features.columns])

# Create a scatter plot of the UMAP components, color-coded by labels
plt.scatter(umap_components[:, 0], umap_components[:, 1], c=sample_df["label"])
plt.xlabel('UMAP Component 1')
plt.ylabel('UMAP Component 2')
plt.title('UMAP Components')
plt.colorbar(label='Label')
plt.show()


In [None]:

from sklearn.manifold import TSNE

# Sample at most 10000 rows (fewer when the frame is smaller, so that
# DataFrame.sample does not raise); the seed keeps the subset reproducible.
n_tsne_samples = min(10000, len(df_na_dropped))
sample_df = df_na_dropped.sample(n=n_tsne_samples, random_state=42)

# Standardize the sampled features. This refits the `scaler` object created
# in an earlier cell on the sample only.
normalized_features = scaler.fit_transform(sample_df[features.columns])

# Perform t-SNE with 2 components on the normalized features. t-SNE is
# stochastic, so random_state is fixed to get the same embedding on every run.
tsne_components = TSNE(n_components=2, random_state=42).fit_transform(normalized_features)

# Create a scatter plot of the t-SNE components, color-coded by labels
plt.scatter(tsne_components[:, 0], tsne_components[:, 1], c=sample_df["label"])
plt.xlabel('t-SNE Component 1')
plt.ylabel('t-SNE Component 2')
plt.title('t-SNE Components')
plt.colorbar(label='Label')
plt.show()


### Seaborn pairplot of the standardized features

In [None]:
import seaborn as sns
from sklearn.preprocessing import StandardScaler
import warnings

# Concatenate the features and labels into a single DataFrame
df_concat = pd.concat([features, df_na_dropped['label']], axis=1)

# Normalize the features (every column except the trailing label column)
scaler = StandardScaler()
normalized_features = scaler.fit_transform(df_concat.iloc[:, :-1])

# Create a DataFrame with the normalized features and label.
# The original index must be carried over: column assignment aligns on the
# index, and df_concat's index has gaps after the NaN/label filtering. With a
# fresh 0..n-1 index the labels would be attached to the wrong rows or become NaN.
normalized_df = pd.DataFrame(
    normalized_features,
    columns=df_concat.columns[:-1],
    index=df_concat.index,
)
normalized_df['label'] = df_concat['label']

# Randomly extract a subset of the data (at most 5000 rows, so that
# DataFrame.sample does not raise on smaller frames)
subset_df = normalized_df.sample(n=min(5000, len(normalized_df)), random_state=42)

# Create a pairplot; all warnings raised while plotting are silenced
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    sns.pairplot(subset_df, hue='label')


## Print correlations between features

In [None]:
correlation_matrix = features.corr()

# The matrix is symmetric with a diagonal of self-correlations. Keep every
# unordered feature pair exactly once by selecting the entries whose first
# feature comes before the second one in column order. This drops the diagonal
# structurally (no fragile `!= 1` float comparison) and keeps genuinely
# perfectly correlated pairs in the ranking.
column_position = {name: pos for pos, name in enumerate(correlation_matrix.columns)}
all_pairs = correlation_matrix.unstack()
is_unique_pair = [column_position[first] < column_position[second] for first, second in all_pairs.index]

# Rank the pairs from the most positive to the most negative correlation
correlation_ranking = all_pairs[is_unique_pair].sort_values(ascending=False)

# NOTE: this raises the display limit for the rest of the notebook session
pd.options.display.max_rows = 4000
# Show the 200 highest-ranked pairs, starting with the top one
print(correlation_ranking.head(200))


## Plot distributions of different features for saccades and fixations 

In [None]:
import matplotlib.pyplot as plt

# Get the list of feature column names
feature_columns = list(features.columns)

# One subplot row per feature, a single column
num_rows = len(feature_columns)
num_cols = 1

# Split the samples by label once; the subsets do not change between features
label_1_samples = df_na_dropped[df_na_dropped['label'] == 1]
label_2_samples = df_na_dropped[df_na_dropped['label'] == 2]

# squeeze=False always returns a 2-D array of Axes, even for a single feature
# (otherwise plt.subplots returns a bare Axes that cannot be indexed);
# ravel() flattens it to one Axes per feature.
fig, axs = plt.subplots(num_rows, num_cols, figsize=(10, 5*num_rows), squeeze=False)
axs = axs.ravel()

# Overlay the density-normalized histograms of both classes for each feature
for ax, feature in zip(axs, feature_columns):
    ax.hist(label_1_samples[feature], bins=100, alpha=0.5, label='Fixations', density=True)
    ax.hist(label_2_samples[feature], bins=100, alpha=0.5, label='Saccades', density=True)
    ax.set_xlabel(feature)
    ax.set_ylabel('Normalized Frequency')
    ax.set_title(f'Distribution of {feature} for Fixations and Saccade Samples')
    ax.legend()

# Adjust the spacing between subplots
plt.tight_layout()

# Show the plots
plt.show()


## Variable importance with random forest


In [None]:
from sklearn.ensemble import RandomForestClassifier

# Create a random forest classifier. random_state makes the fitted forest, and
# therefore the importance ranking, reproducible between runs; n_jobs=-1 only
# parallelizes the fit over all cores and does not change the result.
rf_classifier = RandomForestClassifier(random_state=42, n_jobs=-1)

# Fit the classifier on all features against the fixation/saccade labels
rf_classifier.fit(features, df_na_dropped['label'])

# Get the feature importances
importances = rf_classifier.feature_importances_

# Create a dataframe with feature names and importances,
# sorted by importance in descending order
feature_importances = pd.DataFrame(
    {'Feature': features.columns, 'Importance': importances}
).sort_values(by='Importance', ascending=False)

# Print the feature importances
print(feature_importances)

# barh draws the first row at the bottom, so feed the rows in reverse order
# to get the most important feature at the top of the chart.
plt.barh(feature_importances['Feature'][::-1], feature_importances['Importance'][::-1])
plt.xlabel('Importance')
plt.title('Random forest feature importances')
plt.show()


## Plot features

In [None]:
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from sklearn.preprocessing import StandardScaler
import numpy as np

# Standardize every numeric column of df_na_dropped
numeric_frame = df_na_dropped.select_dtypes(include=np.number)
scaler = StandardScaler()
scaled_features = scaler.fit_transform(numeric_frame)

# Wrap the scaled array in a frame that carries the numeric column names
df_scaled = pd.DataFrame(scaled_features, columns=numeric_frame.columns)

# One line trace per numeric column; the x axis is the row position
# (0 .. n-1), not a column of the frame
fig = go.Figure()
xx = np.arange(len(df_na_dropped['x']))
for column in numeric_frame.columns:
    fig.add_trace(go.Scatter(x=xx, y=df_scaled[column], name=column))

# Title, axis captions and legend
fig.update_layout(
    title='Plot of Numeric Features (Scaled)',
    xaxis_title='Time',
    yaxis_title='Scaled Value',
    showlegend=True,
)

# Render the figure
fig.show()

In [None]:
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from sklearn.preprocessing import StandardScaler
import numpy as np

# Standardize every numeric column of df_na_dropped
numeric_frame = df_na_dropped.select_dtypes(include=np.number)
scaler = StandardScaler()
scaled_features = scaler.fit_transform(numeric_frame)

# Wrap the scaled array in a frame that carries the numeric column names
df_scaled = pd.DataFrame(scaled_features, columns=numeric_frame.columns)

# Two stacked panels that share the x axis
fig = make_subplots(rows=2, cols=1, shared_xaxes=True)

# Add the same set of traces (one per numeric column) first to the upper and
# then to the lower panel; the x axis is the row position (0 .. n-1)
xx = np.arange(len(df_na_dropped['x']))
for panel_row in (1, 2):
    for column in numeric_frame.columns:
        fig.add_trace(go.Scatter(x=xx, y=df_scaled[column], name=column), row=panel_row, col=1)

# Figure title, captions of the first x/y axis and legend
fig.update_layout(
    title='Plot of Numeric Features (Scaled)',
    xaxis_title='Time',
    yaxis_title='Scaled Value', 
    showlegend=True,
)

# Link the additional axes to the first x axis and hide their y tick labels.
# NOTE(review): only two panels are created above, so xaxis3/yaxis3 do not
# appear to correspond to any subplot - confirm whether they are needed.
fig.update_layout(
    xaxis2=dict(matches='x'),
    yaxis2=dict(showticklabels=False),
    xaxis3=dict(matches='x'),
    yaxis3=dict(showticklabels=False),
)

fig.show()