In [None]:
#!pip3 install dash

In [None]:
#!pip3 install dash_bootstrap_components

In [None]:
#!pip3 install dash_bootstrap_templates

In [None]:
#!pip3 install pandas

In [None]:
#!pip3 install dash-auth

In [None]:
#!pip3 install xgboost

In [None]:
#!pip3 install shap

In [None]:
#!pip3 install mlxtend

In [None]:
#!pip3 install prefixspan

In [None]:
#!pip3 install lifelines

In [None]:
#!pip3 install pm4py

In [None]:
import dash
import dash_core_components as dcc
import dash_html_components as html
import plotly.express as px
import plotly.graph_objects as go
import dash_bootstrap_components as dbc
import dash_auth
import dash_table
from dash import Input, Output, dcc, html
import pandas as pd
import numpy as np
import sklearn
from sklearn.preprocessing import MinMaxScaler
from sklearn.cluster import DBSCAN
from sklearn.cluster import KMeans
from scipy.cluster.hierarchy import linkage, dendrogram, cut_tree
from scipy.spatial import distance
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import shap
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules
from collections import defaultdict
from collections import Counter
from prefixspan import PrefixSpan
from lifelines import KaplanMeierFitter
import pm4py
from pm4py.algo.discovery.heuristics import algorithm as heuristics_miner
from pm4py.objects.conversion.log import converter as log_converter
from pm4py.visualization.heuristics_net import visualizer as hn_visualizer

In [None]:
# Read the xAPI statement export into a DataFrame. Later cells rely on its
# 'timestamp', 'actor_mbox', 'verb_display_en' and 'obj_id' columns.
df = pd.read_csv('xapi_data.csv')

In [None]:
# Parse the raw timestamp column into datetimes; the .dt accessors and the
# per-learner time differences computed further down need a datetime dtype.
df['timestamp'] = pd.to_datetime(df['timestamp'])

In [None]:
# Extracting Date: the calendar-date part of each timestamp
df['date'] = df['timestamp'].dt.date

# Extracting Time: the time-of-day part of each timestamp (bucketed by get_timeframe later)
df['time'] = df['timestamp'].dt.time

In [None]:
def get_timeframe(t):
    """Return the label of the 8-hour slot that contains the time ``t``.

    Args:
        t: Object exposing an integer ``hour`` attribute (the notebook passes
            the values of the ``time`` column).

    Returns:
        "00:00-08:00" when the hour is in [0, 8), "08:00-16:00" when it is in
        [8, 16), and "16:00-24:00" for every other hour.
    """
    hour = t.hour
    if hour >= 0 and hour < 8:
        return "00:00-08:00"
    if hour >= 8 and hour < 16:
        return "08:00-16:00"
    return "16:00-24:00"

In [None]:
# Label every event with the 8-hour slot its time of day falls into.
df['timeframe'] = df['time'].apply(get_timeframe)

In [None]:
# Display the first few rows of the DataFrame, including the derived
# 'date', 'time' and 'timeframe' columns added above.
df.head()

In [None]:
# Create a dataframe for clustering

# Order every learner's events chronologically so that consecutive rows of the
# same actor_mbox are consecutive actions.
sorted_data_cl = df.sort_values(by=['actor_mbox', 'timestamp'])

# Gap between each event and the same learner's previous event
# (NaT for a learner's first event).
sorted_data_cl['time_diff'] = sorted_data_cl.groupby('actor_mbox')['timestamp'].diff()

# Gaps longer than 30 minutes are blanked out (NaT) so they do not count as
# time spent.
threshold = pd.Timedelta(minutes=30)
sorted_data_cl['time_diff'] = sorted_data_cl['time_diff'].mask(
    sorted_data_cl['time_diff'] > threshold
)

# Activity Count: number of statements per learner
activity_count_cl = (
    df.groupby('actor_mbox')
    .size()
    .reset_index(name='activity_count')
)

# Verb Types: one column per verb holding the learner's count for that verb
# (0 where the learner never used it)
verb_types_cl = (
    df.groupby(['actor_mbox', 'verb_display_en'])
    .size()
    .reset_index(name='verb_count')
    .pivot(index='actor_mbox', columns='verb_display_en', values='verb_count')
    .reset_index()
    .fillna(0)
)
# Time Spent
def compute_total_time(group):
    """Add up the ``time_diff`` values of one group, skipping missing entries.

    Args:
        group: Object indexable by ``'time_diff'`` whose value offers
            ``sum(skipna=...)`` (the notebook passes one learner's rows).

    Returns:
        The sum of the non-missing ``time_diff`` values.
    """
    gaps = group['time_diff']
    return gaps.sum(skipna=True)

# Total of the retained gaps per learner, as a two-column frame
time_spent_cl = sorted_data_cl.groupby('actor_mbox').apply(compute_total_time).reset_index()
time_spent_cl.columns = ['actor_mbox', 'total_time_spent']

# Diversity of Interactions: number of distinct obj_id values per learner
diversity_of_interactions_cl = (
    df.groupby('actor_mbox')['obj_id']
    .nunique()
    .reset_index(name='unique_materials_count')
)

# Merging all the dataframes together for a complete view (outer joins keep
# every learner that appears in any of the inputs)
cl_df = (
    activity_count_cl
    .merge(verb_types_cl, on='actor_mbox', how='outer')
    .merge(time_spent_cl, on='actor_mbox', how='outer')
    .merge(diversity_of_interactions_cl, on='actor_mbox', how='outer')
)

# Express the summed gaps in hours instead of as timedeltas
cl_df['total_time_spent'] = cl_df['total_time_spent'].dt.total_seconds() / 3600

In [None]:
# Show the merged per-learner feature table built in the previous cell.
cl_df

In [None]:
# Names of the three per-learner metric columns of cl_df shown in the basic
# boxplot (despite the "_df" suffix this is a plain list of column names).
basic_metrics_cl_df = ['activity_count', 'total_time_spent', 'unique_materials_count']

In [None]:
# Boxplots of the basic engagement metrics, one Box trace per variable.
fig_boxplot_basic = go.Figure()

# Add boxplots for each variable
for var in basic_metrics_cl_df:
    fig_boxplot_basic.add_trace(go.Box(
        y=cl_df[var],
        name=var,
        boxpoints='all',  # Display all points
        jitter=0.4,  # Spread them out so they don't overlap
        pointpos=-1.8,  # Position points to the side of the boxplot
        marker=dict(size=4, opacity=0.6),  # Slightly larger points with some transparency
        line=dict(width=1)
    ))

# Axis title sizes are set through title.font: the older `titlefont` axis
# attribute is deprecated and is not accepted by recent Plotly releases.
fig_boxplot_basic.update_layout(
    title_text="Boxplots for Key Engagement Variables",  # Title
    title_font_size=20,
    title_x=0.5,  # Center the title
    xaxis=dict(
        title=dict(text='Variables', font=dict(size=16)),  # X-axis label
        tickangle=45,  # Rotate labels for better readability
        tickfont_size=12,
        automargin=True,  # Ensure there's enough margin for the tick labels
    ),
    yaxis=dict(
        title=dict(text='Values', font=dict(size=16)),  # Y-axis label
        tickfont_size=12,
        autorange=True,  # Automatically adjust the range of the y-axis
    ),
    hovermode="closest",  # Show tooltip for the closest point
    showlegend=False,  # Hide legend if not necessary
    plot_bgcolor='white',  # Set background to white for a clean look
    boxmode='group'  # Group boxplots when they have the same x coordinate
)

# Hover shows the value, with the trace (variable) name in the secondary box.
# The trace name is exposed to hover templates as %{fullData.name}; a bare
# %{name} is not a template variable and would not be substituted.
for trace in fig_boxplot_basic.data:
    trace.hovertemplate = '%{y}<extra>%{fullData.name}</extra>'

# Show the figure
fig_boxplot_basic.show()


In [None]:
# Long-form description of the clustering analysis (DBSCAN, K-means and
# hierarchical clustering); not referenced by any cell visible in this part
# of the notebook — presumably displayed by the Dash app, TODO confirm.
info_text_clustering = """
In our clustering analysis of xAPI statement data, we employed three distinct techniques: DBSCAN, K-means, and Hierarchical Clustering. Each method segmented learners based on three key metrics: the count of activities undertaken, the total time spent on materials, and the count of unique materials interacted with. DBSCAN helped us identify outliers and the core groups of learners with similar activity patterns, highlighting those who deviated significantly from the norm. K-means clustering partitioned the learners into distinct groups, optimizing for intra-cluster similarity and inter-cluster differences, revealing common engagement profiles among learners. Hierarchical clustering provided a dendrogram that depicted the relationships between learners, allowing us to visualize the data's natural structure and determine a hierarchy of learner groups. This multi-faceted clustering approach not only categorized learners based on their engagement levels but also offered insights into the diversity of their learning interactions, and the depth of their engagement, thereby informing targeted interventions for enhanced educational outcomes.
"""
# Long-form description of the SHAP explanations and of the Plotly box plots
# used to display them.
info_text_shap="""
In our analysis, SHAP values provided an in-depth look at the impact of each feature on the clustering outcomes derived from the xAPI data. By applying SHAP to the K-means and Hierarchical clustering models, we could quantify the influence of 'activity count', 'total time spent', and 'unique materials count' on the learners' assignment to different clusters. To visualize this impact, we created custom beeswarm plots using Plotly, where each point represents a SHAP value for a feature across all instances. Larger SHAP values indicate a greater influence on the model's output. These plots were constructed as box plots with all data points displayed, providing a clear, interactive visualization of the distribution of SHAP values for each feature. The boxplot’s spread and skewness gave insights into the consistency of each feature's impact across different learners. With this approach, we could discern which features strongly guided the clustering process, thus offering an interpretable overview of the model's decision-making process. These visualizations, embedded in our Dash application, allowed for an interactive exploration of the model's explanations, making the results accessible and understandable to stakeholders.
"""

In [None]:
# Columns to normalize
# The full candidate set would be these three metrics plus
# list(df['verb_display_en'].unique()); only the three metrics are used here.
cols_to_normalize = [
    col
    for col in ('activity_count', 'total_time_spent', 'unique_materials_count')
    if col == col  # self-comparison is False only for NaN, so NaN entries are dropped
]

In [None]:
# Display the list of columns that will be scaled.
cols_to_normalize

In [None]:
# Count the missing values in each of the cl_df columns that will be scaled
nan_counts = cl_df[cols_to_normalize].isna().sum()

# Keep only the columns that actually contain NaN values and display them
nan_columns = nan_counts.loc[nan_counts.gt(0)]
nan_columns

In [None]:
# Distinct verbs in order of first appearance; the self-comparison is False
# only for NaN, so a missing verb is left out of the list.
verbs_list = [verb for verb in df['verb_display_en'].unique() if verb == verb]

In [None]:
# Print the verb names that were kept in verbs_list.
print(verbs_list)

In [None]:
# Boxplots of the per-learner verb counts, one Box trace per verb.
fig_boxplot_verbs = go.Figure()

# Add boxplots for each variable
for verb in verbs_list:
    fig_boxplot_verbs.add_trace(go.Box(y=cl_df[verb], name=verb))

# Update the boxplot graph with improvements for readability and information
fig_boxplot_verbs.update_traces(
    marker=dict(outliercolor='rgba(219, 64, 82, 0.6)', line=dict(outliercolor='rgba(219, 64, 82, 0.6)', outlierwidth=2)),
    boxmean=True,  # Represent the mean of the data
    boxpoints='outliers'  # show only outliers
)

# Axis title sizes are set through title.font: the older `titlefont` axis
# attribute is deprecated and is not accepted by recent Plotly releases.
fig_boxplot_verbs.update_layout(
    title_text="Distribution of Verb Interactions",  # Title
    title_font_size=20,
    title_x=0.5,  # Center the title
    xaxis=dict(
        title=dict(text='Verb Types', font=dict(size=16)),  # X-axis label
        tickangle=45,  # Rotate labels for better readability
        tickfont_size=12,
    ),
    yaxis=dict(
        title=dict(text='Frequency', font=dict(size=16)),  # Y-axis label
        tickfont_size=12,
    ),
    hovermode="closest",  # Show tooltip for the closest point
    showlegend=False,  # Hide legend if not necessary
    plot_bgcolor='white',  # Set background to white for a clean look
)

# Hover shows "<count> <verb>"; the empty <extra> tag hides the secondary box
for trace in fig_boxplot_verbs.data:
    trace.hovertemplate = '%{y} %{x}<extra></extra>'

fig_boxplot_verbs.show()

In [None]:
# Initialize the Min-Max scaler (a StandardScaler would be an alternative,
# but it is not what this cell uses)
scaler = MinMaxScaler()
# Apply Min-Max scaling; the selected cl_df columns are overwritten in place,
# so cl_df no longer holds the raw metric values after this cell
cl_df[cols_to_normalize] = scaler.fit_transform(cl_df[cols_to_normalize])

In [None]:
# Create a DBSCAN instance
dbscan = DBSCAN(eps=0.5, min_samples=5)  # eps and min_samples are key parameters

# Fit on the scaled feature columns and obtain one cluster label per row
clusters = dbscan.fit_predict(cl_df[cols_to_normalize])

# Keep the labels next to the features they were computed from
cl_df['dbscan_cluster'] = clusters

In [None]:
# Extracting numerical columns for PCA (missing values replaced by 0)
data_for_pca = cl_df[cols_to_normalize].fillna(0)

# Apply PCA and reduce dimensions to 2 (the two components are plotted as PC1/PC2)
pca = PCA(n_components=2)
principal_components_cl = pca.fit_transform(data_for_pca)

# Convert the principal components to a DataFrame
pc_cl_df = pd.DataFrame(data=principal_components_cl, columns=['PC1', 'PC2'])

In [None]:
# Add the DBSCAN cluster labels to pc_cl_df so each projected point carries
# its label.
# NOTE(review): assigning a Series matches rows by index label — assumes
# cl_df and pc_cl_df share the same row index; confirm.
pc_cl_df['dbscan_cluster'] = cl_df['dbscan_cluster']

In [None]:
# 2D visualization of the DBSCAN result in PCA space (Plotly graph_objects).
# DBSCAN gives the label -1 to samples it considers noise. Those points are
# not a cluster, so they are plotted as "Noise" and get no enclosing circle.
NOISE_LABEL = -1

# Circle properties (centroid and radius) for every real cluster
circle_centers = []
circle_radii = []

fig_dbscan = go.Figure()

for cluster_num in pc_cl_df['dbscan_cluster'].unique():
    cluster_data = pc_cl_df[pc_cl_df['dbscan_cluster'] == cluster_num]
    is_noise = cluster_num == NOISE_LABEL

    # Scatter trace with the points of this label
    fig_dbscan.add_trace(go.Scatter(
        x=cluster_data['PC1'],
        y=cluster_data['PC2'],
        mode='markers',
        name='Noise' if is_noise else f'Cluster {cluster_num}',
    ))

    if is_noise:
        continue

    # Centroid of the cluster in the PC1/PC2 plane
    centroid = [cluster_data['PC1'].mean(), cluster_data['PC2'].mean()]
    circle_centers.append(centroid)

    # Radius = largest Euclidean distance from the centroid, computed on the
    # whole columns at once instead of row by row
    radii = np.hypot(cluster_data['PC1'] - centroid[0], cluster_data['PC2'] - centroid[1])
    circle_radii.append(radii.max())

# Add circle shapes for each cluster
for center, radius in zip(circle_centers, circle_radii):
    fig_dbscan.add_shape(
        type="circle",
        xref="x", yref="y",
        x0=center[0] - radius, y0=center[1] - radius,
        x1=center[0] + radius, y1=center[1] + radius,
        opacity=0.2,
        fillcolor="blue",
        line_color="blue",
    )

# Add labels and title
fig_dbscan.update_layout(title="DBSCAN Clusters Visualization",
                         xaxis_title="Principal Component 1",
                         yaxis_title="Principal Component 2")

fig_dbscan.show()

In [None]:
# Elbow Method: fit K-means for every candidate cluster count and record the
# inertia of each fit.
possible_clusters = range(1, 15)  # Checking for up to 14 clusters
inertia = []

for k in possible_clusters:
    kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)
    kmeans.fit(data_for_pca)
    inertia.append(kmeans.inertia_)

# Line chart of inertia against the number of clusters
elbow_fig = px.line(
    x=possible_clusters,
    y=inertia,
    title='Elbow Method for Optimal Number of Clusters',
    labels={'x': 'Number of Clusters', 'y': 'Inertia'},
)
elbow_fig.show()

In [None]:
# Assuming an arbitrary number of clusters for demonstration
k = 2  # This can be changed based on the Elbow Method result

# Apply K-means clustering to the same scaled features that were fed to PCA
kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)
clusters_kmeans = kmeans.fit_predict(data_for_pca)

# Add K-means cluster labels to pc_cl_df
pc_cl_df['kmeans_cluster'] = clusters_kmeans

In [None]:
# 2D view of the K-means result in PCA space (Plotly graph_objects).
fig_kmeans = go.Figure()

# Circle properties (centroid and radius) of each K-means cluster
circle_centers_kmeans = []
circle_radii_kmeans = []

# One pass per cluster: draw its points and record the enclosing circle
for cluster_num in pc_cl_df['kmeans_cluster'].unique():
    cluster_data = pc_cl_df[pc_cl_df['kmeans_cluster'] == cluster_num]

    fig_kmeans.add_trace(go.Scatter(
        x=cluster_data['PC1'],
        y=cluster_data['PC2'],
        mode='markers',
        name=f'Cluster {cluster_num}',
    ))

    # Centroid of the cluster in the PC1/PC2 plane
    centroid = [cluster_data['PC1'].mean(), cluster_data['PC2'].mean()]
    circle_centers_kmeans.append(centroid)

    # Radius = distance from the centroid to the farthest member
    radii = cluster_data.apply(
        lambda row: distance.euclidean(centroid, [row['PC1'], row['PC2']]),
        axis=1,
    )
    circle_radii_kmeans.append(radii.max())

# Translucent circle around every cluster
for (center_x, center_y), radius in zip(circle_centers_kmeans, circle_radii_kmeans):
    fig_kmeans.add_shape(
        type="circle",
        xref="x",
        yref="y",
        x0=center_x - radius,
        y0=center_y - radius,
        x1=center_x + radius,
        y1=center_y + radius,
        opacity=0.2,
        fillcolor="blue",
        line_color="blue",
    )

# Title and axis labels
fig_kmeans.update_layout(
    title="K-means Clusters Visualization",
    xaxis_title="Principal Component 1",
    yaxis_title="Principal Component 2",
)

fig_kmeans.show()

In [None]:
# Explaining K-Means
# A classifier is trained to predict the K-means label from the scaled
# features; the SHAP cells that follow explain this classifier.
# Prepare data and labels for K-means clusters
X = data_for_pca
y_kmeans = pc_cl_df['kmeans_cluster']

# Split data into train and test sets for K-means clusters (20% held out, fixed seed)
X_train_kmeans, X_test_kmeans, y_train_kmeans, y_test_kmeans = train_test_split(X, y_kmeans, test_size=0.2, random_state=42)

# Train the XGBoost model for K-means clusters
xgb_model_kmeans = xgb.XGBClassifier()
xgb_model_kmeans.fit(X_train_kmeans, y_train_kmeans)

# Predict on the test set for K-means clusters
y_pred_kmeans = xgb_model_kmeans.predict(X_test_kmeans)

# Compute the accuracy for K-means clusters (displayed as the cell output)
accuracy_kmeans = accuracy_score(y_test_kmeans, y_pred_kmeans)
accuracy_kmeans

In [None]:
# Initialize the JS visualization code (for Jupyter Notebook)
shap.initjs()

# Create a SHAP explainer for the XGBoost model trained on the K-means labels
explainer_kmeans = shap.Explainer(xgb_model_kmeans)

# Compute SHAP values for every row of the K-means test split (no further
# sub-sampling happens here)
shap_values_kmeans = explainer_kmeans.shap_values(X_test_kmeans)

# Visualize the SHAP values for the first test instance
# NOTE(review): the [0,:] indexing assumes a 2-D (samples x features) result;
# confirm this still holds if k is raised above 2.
shap.force_plot(explainer_kmeans.expected_value, shap_values_kmeans[0,:], X_test_kmeans.iloc[0,:])

In [None]:
# SHAP summary plot for the K-means test split.
shap.summary_plot(shap_values_kmeans, X_test_kmeans)

In [None]:
# SHAP values of the K-means test split and the matching feature names
all_shap_values = shap_values_kmeans
features = X_test_kmeans.columns

# Beeswarm-like Plotly view: one horizontal box per feature with every SHAP
# value drawn as an individual point
traces = [
    go.Box(
        y=[feature] * all_shap_values.shape[0],
        x=all_shap_values[:, idx],
        name=feature,
        boxpoints='all',  # represent all points
        jitter=0.5,  # spread out the points for better visibility
        pointpos=-2,  # position of the points
        marker=dict(size=7, opacity=0.6),
        line=dict(width=1),
        showlegend=False,
    )
    for idx, feature in enumerate(features)
]

# Layout and figure
layout = go.Layout(
    title="SHAP Values for All Instances",
    xaxis_title="SHAP Value",
    yaxis_title="Feature",
    boxmode='group',
)
shap_summary_fig_kmeans = go.Figure(data=traces, layout=layout)

shap_summary_fig_kmeans.show()

In [None]:
# Compute the linkage matrix (Ward linkage) on the same scaled features used
# for K-means
Z = linkage(data_for_pca, method='ward')

In [None]:
# Plotting a truncated dendrogram with matplotlib: truncate_mode='lastp' with
# p=30 limits how much of the tree is drawn
plt.figure(figsize=(10, 6))
dendrogram(Z, truncate_mode='lastp', p=30, show_leaf_counts=True)
plt.title("Truncated Dendrogram")
plt.ylabel("Euclidean distances")
plt.xlabel("Sample index or (Cluster Size)")
plt.show()

In [None]:
# Cut the linkage tree into 3 clusters and store the flattened labels next to
# the PCA coordinates.
clusters_hierarchical = cut_tree(Z, n_clusters=3).flatten()
pc_cl_df['hierarchical_cluster'] = clusters_hierarchical

In [None]:
# 2D view of the hierarchical clustering result in PCA space (Plotly graph_objects).
fig_h_cl = go.Figure()

# Circle properties (centroid and radius) of each hierarchical cluster
circle_centers_hierarchical = []
circle_radii_hierarchical = []

# One pass per cluster: draw its points and record the enclosing circle
for cluster_num in pc_cl_df['hierarchical_cluster'].unique():
    cluster_data = pc_cl_df[pc_cl_df['hierarchical_cluster'] == cluster_num]

    fig_h_cl.add_trace(go.Scatter(
        x=cluster_data['PC1'],
        y=cluster_data['PC2'],
        mode='markers',
        name=f'Cluster {cluster_num}',
    ))

    # Centroid of the cluster in the PC1/PC2 plane
    centroid = [cluster_data['PC1'].mean(), cluster_data['PC2'].mean()]
    circle_centers_hierarchical.append(centroid)

    # Radius = distance from the centroid to the farthest member
    radii = cluster_data.apply(
        lambda row: distance.euclidean(centroid, [row['PC1'], row['PC2']]),
        axis=1,
    )
    circle_radii_hierarchical.append(radii.max())

# Translucent circle around every cluster
for (center_x, center_y), radius in zip(circle_centers_hierarchical, circle_radii_hierarchical):
    fig_h_cl.add_shape(
        type="circle",
        xref="x",
        yref="y",
        x0=center_x - radius,
        y0=center_y - radius,
        x1=center_x + radius,
        y1=center_y + radius,
        opacity=0.2,
        fillcolor="blue",
        line_color="blue",
    )

# Title and axis labels
fig_h_cl.update_layout(
    title="Hierarchical Clusters Visualization",
    xaxis_title="Principal Component 1",
    yaxis_title="Principal Component 2",
)

fig_h_cl.show()

In [None]:
# Explaining Hierarchical Cluster
# A classifier is trained to predict the hierarchical-cluster label from the
# scaled features; the SHAP cells that follow explain this classifier.
# Prepare data and labels for hierarchical clusters
X = data_for_pca
y_hierarchical = pc_cl_df['hierarchical_cluster']

# Split data into train and test sets for hierarchical clusters (20% held out, fixed seed)
X_train_hierarchical, X_test_hierarchical, y_train_hierarchical, y_test_hierarchical = train_test_split(X, y_hierarchical, test_size=0.2, random_state=42)

# Train the XGBoost model for hierarchical clusters
xgb_model_hierarchical = xgb.XGBClassifier()
xgb_model_hierarchical.fit(X_train_hierarchical, y_train_hierarchical)

# Predict on the test set for hierarchical clusters
y_pred_hierarchical = xgb_model_hierarchical.predict(X_test_hierarchical)

# Compute the accuracy for hierarchical clusters (displayed as the cell output)
accuracy_hierarchical = accuracy_score(y_test_hierarchical, y_pred_hierarchical)
accuracy_hierarchical


In [None]:
# Create an explainer for the XGBoost model trained on the hierarchical labels
explainer_hierarchical = shap.Explainer(xgb_model_hierarchical)

# Compute SHAP values for the test set
# NOTE(review): the labels come from a 3-cluster cut, so this is presumably a
# multi-class model and the result may carry a per-class dimension — confirm
# its shape before indexing it.
shap_values_hierarchical = explainer_hierarchical.shap_values(X_test_hierarchical)

# Generate the SHAP summary plot
shap.summary_plot(shap_values_hierarchical, X_test_hierarchical)

In [None]:

# Get the SHAP values for all instances
all_shap_values_hierarchical = np.array(shap_values_hierarchical)
features = X_test_hierarchical.columns

# The hierarchical labels come from a 3-cluster cut, so the classifier is
# multi-class and SHAP returns one attribution per class. Depending on the
# SHAP version that is a list of (n_samples, n_features) arrays or a single
# (n_samples, n_features, n_classes) array. Both are reshaped here to a 2-D
# table with one column per feature, pooling the per-class values as rows, so
# that column `idx` always holds the SHAP values of feature `idx`.
if isinstance(shap_values_hierarchical, list):
    shap_rows_hierarchical = np.concatenate(
        [np.asarray(class_values) for class_values in shap_values_hierarchical],
        axis=0,
    )
elif all_shap_values_hierarchical.ndim == 3:
    shap_rows_hierarchical = np.moveaxis(all_shap_values_hierarchical, 1, -1).reshape(
        -1, all_shap_values_hierarchical.shape[1]
    )
else:
    # Already (n_samples, n_features), e.g. for a two-class model
    shap_rows_hierarchical = all_shap_values_hierarchical

# Create a beeswarm plot for all instances using Plotly: one horizontal box
# per feature with every SHAP value drawn as an individual point
traces_hierarchical = []
for idx, feature in enumerate(features):
    traces_hierarchical.append(
        go.Box(
            y=[feature] * shap_rows_hierarchical.shape[0],
            x=shap_rows_hierarchical[:, idx],
            name=feature,
            boxpoints='all',  # represent all points
            jitter=0.5,  # spread out the points for better visibility
            pointpos=-2,  # position of the points
            marker=dict(size=6, opacity=0.6),  # Larger marker size for visibility
            line=dict(width=1),
            showlegend=False
        )
    )

# Create layout and figure for hierarchical clustering
layout_hierarchical = go.Layout(
    title="SHAP Values for All Instances (Hierarchical Clustering)",
    xaxis_title="SHAP Value",
    yaxis_title="Feature",
    boxmode='group'
)
shap_summary_fig_hierarchical = go.Figure(data=traces_hierarchical, layout=layout_hierarchical)

shap_summary_fig_hierarchical.show()

In [None]:
# Long-form description of the association rule mining analysis; not
# referenced by any cell visible in this part of the notebook — presumably
# displayed by the Dash app, TODO confirm.
info_text_arm = """
In this analysis, Association Rule Mining has been used to discover interesting relationships between variables in the dataset. The rules generated by this method have uncovered patterns and associations that are not readily apparent. This method has provided valuable insights into how different actions are related to each other, enabling us to understand the co-occurrence of different events within the dataset.
"""

In [None]:
# Implement Association Rule Mining
# Aggregating the actions (verbs) for each user (actor_mbox)
# We create a dictionary where each key is a user's email and each value is
# the list of verbs they have performed, in row order.

# Initialize an empty dictionary to hold the transactions
transactions_dict = {}

# Walk the two relevant columns in parallel. This reads the same values as a
# row-by-row df.iterrows() loop but avoids building a full Series for every
# row just to look up two fields.
for user, action in zip(df['actor_mbox'], df['verb_display_en']):
    # setdefault creates the user's list on first sight, then the verb is appended
    transactions_dict.setdefault(user, []).append(action)

# Convert the dictionary to a list of transactions (one list of verbs per user)
transactions = list(transactions_dict.values())

# Let's see how many transactions we have and the first few of them
# len(transactions), transactions[:5]

In [None]:
# Convert all actions to strings, leaving out missing verbs. Calling str() on
# a missing value would otherwise add a literal 'nan' item to the
# transactions and let it take part in the itemsets and rules.
transactions = [
    [str(action) for action in user_actions if pd.notna(action)]
    for user_actions in transactions
]

# Instantiate the transaction encoder
te = TransactionEncoder()

# Fit and transform the transactions to a boolean array
te_ary = te.fit(transactions).transform(transactions)

# Convert the array back to a DataFrame with item names
df_transactions = pd.DataFrame(te_ary, columns=te.columns_)

# Apply the Apriori algorithm to find frequent itemsets with a support threshold of 0.01 (1%)
frequent_itemsets = apriori(df_transactions, min_support=0.01, use_colnames=True)

# Display the frequent itemsets
frequent_itemsets.sort_values(by='support', ascending=False).head()  # Show top 5 frequent itemsets

In [None]:
# Generate the association rules, keeping those with confidence of at least 0.5
rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.5)

# Sort rules by the lift metric in descending order
rules = rules.sort_values(by='lift', ascending=False)

# Display the top association rules (first rows after sorting by lift)
rules.head()

In [None]:
# 'rules' is the DataFrame of association rules produced by the previous cell.

# Keep only the rules with confidence >= 0.7 and lift >= 1.5
filtered_rules = rules.loc[rules['confidence'].ge(0.7) & rules['lift'].ge(1.5)]

# Display the filtered rules
filtered_rules

In [None]:
# 'filtered_rules' is the DataFrame of rules kept by the previous cell.

# Support vs confidence scatter: one marker per rule, sized and coloured by lift
fig_a_r_m = go.Figure(data=[
    go.Scatter(
        x=filtered_rules['support'],
        y=filtered_rules['confidence'],
        # "<antecedents> -> <consequents>" label shown in the hover box
        text=filtered_rules['antecedents'].astype(str) + ' -> ' + filtered_rules['consequents'].astype(str),
        mode='markers',
        marker=dict(
            size=filtered_rules['lift'] * 10,  # scale lift up so the markers are visible
            color=filtered_rules['lift'],
            showscale=True,  # Show color scale
            colorbar=dict(title='Lift'),
        ),
        hovertemplate='Rule: %{text}<br>Support: %{x}<br>Confidence: %{y}<br>Lift: %{marker.color:.2f}',
    )
])

# Title, axis labels and hover behaviour
fig_a_r_m.update_layout(
    title='Association Rules Scatter Plot',
    xaxis_title='Support',
    yaxis_title='Confidence',
    hovermode='closest',  # Show closest point on hover
)

# Show the figure
fig_a_r_m.show()

In [None]:
# Long-form description of the sequence analysis (user-facing text).
# A missing space after "transitions." has been restored.
info_text_sequence = """
A sequence diagram tracks the order of learner actions, providing insight into the flow of learning activities. Nodes represent actions, links show progression, and link thickness reflects the frequency of transitions. This helps identify common learning paths and key engagement points in the educational material.
"""

In [None]:
# Sequence Analysis
# Convert the timestamp to datetime to ensure proper sorting
# (an earlier cell already performs this conversion; repeating it keeps this
# section runnable on its own)
df['timestamp'] = pd.to_datetime(df['timestamp'])

# Sort the DataFrame by learner and timestamp
df_sorted = df.sort_values(['actor_mbox', 'timestamp'])

# Create sequences of verbs for each learner: a Series indexed by actor_mbox
# whose values are chronologically ordered lists of verbs
sequences = df_sorted.groupby('actor_mbox')['verb_display_en'].apply(list)

# Display the first few sequences
sequences.head()

In [None]:
# The pattern mining is time-consuming, so it runs on a random subset of the
# learners, and rarely used verbs are removed to shorten the sequences.

# Sample 10% of the per-learner sequences (fixed seed for repeatability)
sampled_sequences = sequences.sample(frac=0.1, random_state=1).tolist()

# Occurrence count of every verb across the whole dataset
verb_frequencies = df['verb_display_en'].value_counts()

# Verbs that occur at least `threshold` times are considered frequent
threshold = 5
frequent_verbs = verb_frequencies[verb_frequencies >= threshold].index.tolist()

# Drop every non-frequent verb from the sampled sequences (membership is
# checked against a set built once from frequent_verbs)
frequent_verb_set = set(frequent_verbs)
filtered_sequences = [
    [verb for verb in seq if verb in frequent_verb_set]
    for seq in sampled_sequences
]

# Mine the sequential patterns that reach the minimum support
ps = PrefixSpan(filtered_sequences)
min_support = 5  # minimum support passed to PrefixSpan
frequent_sequences = ps.frequent(min_support)
# frequent_sequences

In [None]:
def _sequence_label(seq):
    """Return the first three items of ``seq`` joined by ' -> ', followed by '...' when ``seq`` is longer."""
    head = ' -> '.join(seq[:3])
    return head + '...' if len(seq) > 3 else head


# Keep the 10 largest (support, sequence) pairs for the chart
top_sequences = sorted(frequent_sequences, reverse=True)[:10]

# Bar labels (abbreviated sequences) and bar heights (supports)
sequence_strs = [_sequence_label(seq) for _, seq in top_sequences]
supports = [pair[0] for pair in top_sequences]

# Bar chart of support per sequence
fig_prefixscan = go.Figure(
    data=[go.Bar(x=sequence_strs, y=supports)]
)

# Layout tweaks for readability
fig_prefixscan.update_layout(
    title='Top 10 Frequent Sequences',
    xaxis_title='Sequence',
    yaxis_title='Support Count',
    xaxis_tickangle=-45,  # Rotate labels for better visibility
)

# Show the figure
fig_prefixscan.show()

In [None]:
# Sorted distinct verbs without the missing value, and the node index of each
unique_verbs = (
    pd.Series(df_sorted['verb_display_en'].unique())
    .sort_values()
    .dropna()
)
verb_indices = {verb: i for i, verb in enumerate(unique_verbs)}

# (source index, target index) -> number of times that transition occurs
source_target_freq = defaultdict(int)

# Count every pair of consecutive verbs in each learner's sequence; verbs
# without a node index (the missing value) are removed before pairing
for sequence in sequences:
    sequence = [verb for verb in sequence if verb in verb_indices]
    for current_verb, next_verb in zip(sequence, sequence[1:]):
        source_target_freq[(verb_indices[current_verb], verb_indices[next_verb])] += 1

# Split the counted transitions into the parallel lists the Sankey trace expects
sources = [pair[0] for pair in source_target_freq]
targets = [pair[1] for pair in source_target_freq]
weights = list(source_target_freq.values())

# Node labels, in the same order as the indices in verb_indices
labels = list(unique_verbs)

# Now we can proceed to create the Sankey diagram with this data.
# (sources, targets, weights, labels)

# Now we can proceed to create the Sankey diagram with this data.
# (sources, targets, weights, labels)

In [None]:
# Sankey diagram with Plotly

# Node appearance: one node per verb label
sankey_nodes = dict(
    label=labels,
    pad=15,
    thickness=20,
    line=dict(color="black", width=0.5),
)

# Links: verb-to-verb transitions, weighted by how often they occur
sankey_links = dict(source=sources, target=targets, value=weights)

fig_sankey = go.Figure(data=[go.Sankey(node=sankey_nodes, link=sankey_links)])

# Title and font size
fig_sankey.update_layout(
    title_text='Learner Actions Sequence Sankey Diagram',
    font_size=10,
)

# Render the diagram in the notebook
fig_sankey.show()


In [None]:
# Explanatory paragraph shown in the "Time-Based Patterns Analysis" card of the
# dashboard overview page. The text is rendered verbatim.
info_text_time_based_analysis="""
In our Time-Based Patterns Analysis, we delved into the temporal dynamics of learners' activities, focusing on the timing and duration of interactions. By sorting the xAPI data chronologically for each learner, we calculated the time intervals between consecutive actions, allowing us to identify periods of intense activity as well as pauses in engagement. This analysis was instrumental in distinguishing between individual sessions, where we considered any gap longer than one hour as the beginning of a new session. By aggregating these time differences across different types of interactions (verbs), we gained insights into the rhythms of learning behaviors. For instance, certain actions like 'accessed' or 'attempted' may have shorter intervals, suggesting rapid engagement, whereas others, such as 'completed', could indicate longer periods of focus. The visualization of these temporal patterns aided in understanding the ebb and flow of learners' engagement over time, providing a nuanced view of the learning process beyond mere activity counts.
"""

In [None]:
# Time-Based Patterns Analysis
# Order every statement chronologically within each learner.
df_sorted = df.sort_values(['actor_mbox', 'timestamp'])

# Hours elapsed since the same learner's previous statement
# (NaN for a learner's first statement).
gap_seconds = df_sorted.groupby('actor_mbox')['timestamp'].diff().dt.total_seconds()
df_sorted['time_diff'] = gap_seconds / 3600

# A gap of more than one hour starts a new session. The id is a running count
# of such gaps over the whole frame, so it is only meaningful together with
# 'actor_mbox' (which is how it is grouped below).
starts_new_session = df_sorted['time_diff'] > 1
df_sorted['session_id'] = starts_new_session.astype(int).cumsum()

# Hours since the previous statement inside the same (learner, session) group;
# the first statement of each group has no predecessor and gets 0.
within_session_gap = df_sorted.groupby(['actor_mbox', 'session_id'])['timestamp'].diff()
df_sorted['duration_within_session_hours'] = (
    within_session_gap.dt.total_seconds() / 3600
).fillna(0)

# Preview the derived columns
df_sorted[['actor_mbox', 'timestamp', 'time_diff', 'session_id', 'duration_within_session_hours']].head()

In [None]:
# Mean within-session gap per verb. Rows with a gap of 0 (which includes the
# first statement of every session) are left out of the average.
positive_gaps = df_sorted.loc[df_sorted['duration_within_session_hours'] > 0]
avg_time_diff_per_verb_within_session = (
    positive_gaps
    .groupby('verb_display_en')['duration_within_session_hours']
    .mean()
    .reset_index()
)

# Bar chart of the averages, coloured by the value itself
fig_time_based_patterns_within_session = px.bar(
    avg_time_diff_per_verb_within_session,
    x='verb_display_en',
    y='duration_within_session_hours',
    title='Average Time Difference Between Actions for Each Verb Within Sessions',
    labels={'verb_display_en': 'Verb', 'duration_within_session_hours': 'Average Time Difference (hours)'},
    color='duration_within_session_hours',
    color_continuous_scale=px.colors.sequential.Viridis,
)

# Slant the verb names so they do not overlap
fig_time_based_patterns_within_session.update_layout(xaxis_tickangle=-45)

# Render the chart in the notebook
fig_time_based_patterns_within_session.show()

In [None]:
# Survival analysis with lifelines

# The end event is the 'marked-completion' verb.
end_event = 'marked-completion'
df_sorted['event_occurred'] = (df_sorted['verb_display_en'] == end_event).astype(int)

# Chronological statements of identifiable learners only.
events_by_learner = (
    df_sorted
    .dropna(subset=['actor_mbox'])
    .sort_values(by=['actor_mbox', 'timestamp'])
)

# One real row per learner: the first end event if the learner has one,
# otherwise the learner's last statement (a censored observation).
# NOTE: GroupBy.last() is deliberately not used here; it returns the last
# non-null value of every column separately, which can stitch together values
# from different rows and ignores end events that are not the final statement.
completion_rows = (
    events_by_learner[events_by_learner['event_occurred'] == 1]
    .groupby('actor_mbox')
    .head(1)
)
censored_rows = (
    events_by_learner[~events_by_learner['actor_mbox'].isin(completion_rows['actor_mbox'])]
    .groupby('actor_mbox')
    .tail(1)
)
df_survival = (
    pd.concat([completion_rows, censored_rows])
    .sort_values('actor_mbox')
    .reset_index(drop=True)
)

# Follow-up time: hours between the learner's first statement and the selected
# row. (The gap to the previous statement, 'duration_within_session_hours',
# is not a time-to-event and is therefore not used as the duration.)
first_seen = events_by_learner.groupby('actor_mbox')['timestamp'].min()
df_survival['time_to_event_hours'] = (
    df_survival['timestamp'] - df_survival['actor_mbox'].map(first_seen)
).dt.total_seconds() / 3600

# Fit the Kaplan-Meier curve
kmf = KaplanMeierFitter()
kmf.fit(df_survival['time_to_event_hours'], event_observed=df_survival['event_occurred'])

# Extract the survival function data
kmf_data = kmf.survival_function_

# Create a Plotly figure
fig_lifelines = go.Figure()

# Add the survival curve to the figure
fig_lifelines.add_trace(go.Scatter(x=kmf_data.index, y=kmf_data["KM_estimate"],
                         mode='lines', name='Survival curve'))

# Add labels and title
fig_lifelines.update_layout(
    title="Kaplan-Meier Survival Curve (with 'marked-completion' as end event)",
    xaxis_title="Time (Hours)",
    yaxis_title="Survival Probability",
    template="plotly_white"
)

# Show the figure
fig_lifelines.show()

In [None]:
# Heuristics Miner is a process mining algorithm provided by pm4py.
# Rows without a verb cannot be used as activities, so they are dropped.
# .copy() makes df_clean an independent frame, so the assignment below does not
# trigger pandas' SettingWithCopyWarning.
df_clean = df.dropna(subset=['verb_display_en']).copy()

# Make sure every remaining verb is a string.
df_clean['verb_display_en'] = df_clean['verb_display_en'].astype(str)

# Sanity check: number of missing verbs left (expected to be 0)
nan_in_verb_display = df_clean['verb_display_en'].isnull().sum()

# Event-log view with the column names pm4py expects:
# learner -> case id, verb -> activity, timestamp -> event time.
# The events are ordered by case and time because the directly-follows
# relations mined later depend on the order of the rows within a case.
df_heuristics_clean = (
    df_clean[['actor_mbox', 'verb_display_en', 'timestamp']]
    .rename(columns={
        'actor_mbox': 'case:concept:name',
        'verb_display_en': 'concept:name',
        'timestamp': 'time:timestamp'
    })
    .sort_values(['case:concept:name', 'time:timestamp'])
)

# Check the data types to ensure they're correct
data_types = df_heuristics_clean.dtypes

df_heuristics_clean.head(), nan_in_verb_display, data_types

In [None]:
# Convert the prepared DataFrame (pm4py column names) to an event log
log = log_converter.apply(df_heuristics_clean)

# Discover the process model using Heuristics Miner; the resulting net is read
# by the following cells (activities, dependency matrix, directly-follows graph)
heu_net = heuristics_miner.apply_heu(log)

# Exploration aid: list all attributes and methods of the HeuristicsNet object
print(dir(heu_net))

In [None]:
# Pull out the parts of the discovered HeuristicsNet used further down.

# Directly-follows graph; iterated later as (activity_from, activity_to) -> value
dfg = heu_net.dfg

# Dependency matrix of the net
dependencies = heu_net.dependency_matrix

# Activities (the nodes of the net)
activities = heu_net.activities

In [None]:
# Visualize with Plotly
# Layout: staggered columns (not a circle). Even-indexed activities sit on the
# centre line (x = 0), odd-indexed ones alternate between x = -3 and x = +3,
# and every pair of activities is placed one row lower than the previous pair.
radius = 1
node_coords = {}
for i, activity in enumerate(activities):
    x = radius * 3 * (i % 2) * (-1 if i % 4 < 2 else 1)  # alternating x for visibility
    y = radius * (len(activities) / 4 - i // 2)  # stacked y for visibility
    node_coords[activity] = (x, y)

# Collect all edges into ONE line trace. A None entry breaks the line, so each
# edge is drawn as its own segment; this avoids one trace per edge, which gets
# slow for large directly-follows graphs.
edge_x, edge_y = [], []
for activity_from, activity_to in dfg:
    if activity_from in node_coords and activity_to in node_coords:
        (x0, y0), (x1, y1) = node_coords[activity_from], node_coords[activity_to]
        edge_x += [x0, x1, None]
        edge_y += [y0, y1, None]

# Initialize figure
fig_pm4py = go.Figure()

# Edges as blue line segments
fig_pm4py.add_trace(go.Scatter(
    x=edge_x,
    y=edge_y,
    mode='lines',
    line=dict(width=2, color='blue'),
    hoverinfo='none'
))

# Nodes as red markers labelled with the activity name (single trace)
fig_pm4py.add_trace(go.Scatter(
    x=[pos[0] for pos in node_coords.values()],
    y=[pos[1] for pos in node_coords.values()],
    mode='markers+text',
    marker=dict(size=10, color='red'),
    text=list(node_coords),
    textposition="bottom center"
))

# Update layout
fig_pm4py.update_layout(
    title='Heuristics Net Visualization with pm4py',
    height=800,  # or any other value that suits your needs
    showlegend=False,
    xaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
    yaxis=dict(showgrid=False, zeroline=False, showticklabels=False)
)

# Show figure
fig_pm4py.show()

In [None]:
# Number of statements per verb
verb_display_en_counts = df['verb_display_en'].value_counts()

# Tidy two-column frame with explicit column names. The axis titles then do not
# depend on how plotly names array-like arguments (the previous 'x'/'y' label
# mapping is skipped whenever the counts index carries its own name).
verb_counts_frame = verb_display_en_counts.rename_axis('Verbs').reset_index(name='Count')

# Create a bar chart
fig_1 = px.bar(verb_counts_frame, x='Verbs', y='Count', title='Distribution of Verbs')

# Show the plot
fig_1.show()

In [None]:
# Number of statements per object type
obj_type_counts = df['obj_type'].value_counts()

# Pie chart: one slice per object type, sized by its statement count
fig_2 = px.pie(
    obj_type_counts,
    names=obj_type_counts.index,
    values=obj_type_counts.values,
    title='Distribution of Activities',
)

# Render the chart in the notebook
fig_2.show()

In [None]:
# Number of statements per platform
platform_counts = df['platform'].value_counts()

# Pie chart: one slice per platform, sized by its statement count
fig_3 = px.pie(
    platform_counts,
    names=platform_counts.index,
    values=platform_counts.values,
    title='Statements per Platform',
)

# Render the chart in the notebook
fig_3.show()

In [None]:
# Number of statements per language
language_counts = df['language'].value_counts()

# Pie chart: one slice per language, sized by its statement count
fig_4 = px.pie(
    language_counts,
    names=language_counts.index,
    values=language_counts.values,
    title='Languages',
)

# Render the chart in the notebook
fig_4.show()

In [None]:
# Number of statements per calendar day: one row per distinct 'date' value with
# its row count in the 'count' column (days without statements do not appear)
daily_counts = df.groupby('date').size().reset_index(name='count')

In [None]:
# Linear trend of the daily counts: least-squares straight line fitted against
# the row position (0, 1, 2, ...) of each day in daily_counts.
day_positions = np.arange(len(daily_counts['count']))
z = np.polyfit(day_positions, daily_counts['count'], deg=1)

# Callable polynomial built from the fitted coefficients
p = np.poly1d(z)

In [None]:
# Trend value for every row position of daily_counts
trend_values = p(range(len(daily_counts['count'])))

# Daily statement counts (dash-dotted line with markers) plus the red trendline
fig_5 = go.Figure(data=[
    go.Scatter(
        x=daily_counts['date'],
        y=daily_counts['count'],
        mode='lines+markers',
        name='Original Data',
        line=dict(dash='dashdot'),
    ),
    go.Scatter(
        x=daily_counts['date'],
        y=trend_values,
        mode='lines',
        name='Trendline',
        line=dict(color='red'),
    ),
])

# Titles, plus a range slider on the date axis
fig_5.update_layout(
    title='Statements per Day',
    xaxis=dict(
        title='Date',
        rangeslider=dict(visible=True),
        type="date",
    ),
    yaxis=dict(title='Count'),
)
fig_5.show()

In [None]:
# Number of statements per timeframe (part of the day)
timeframe_counts = df['timeframe'].value_counts()

# Pie chart: one slice per timeframe, sized by its statement count.
# The title previously read "Stetements" (typo).
fig_6 = px.pie(timeframe_counts, names=timeframe_counts.index, values=timeframe_counts.values, title='Statements per Timeframe')

# Show the plot
fig_6.show()

In [None]:
# Credentials accepted by the dashboard's HTTP Basic Auth.
import os
import warnings


def load_auth_pairs(environ=None):
    """Return the ``{username: password}`` mapping used for Basic Auth.

    The credentials are read from the ``DASHBOARD_USERNAME`` and
    ``DASHBOARD_PASSWORD`` environment variables so that no real secret has to
    be stored in the notebook.

    Parameters
    ----------
    environ : mapping, optional
        Mapping to read the two variables from; defaults to ``os.environ``.

    Returns
    -------
    dict
        A single-entry mapping. When either variable is missing or empty, the
        historical development pair ``{'admin': 'password'}`` is returned and a
        warning is emitted.
    """
    if environ is None:
        environ = os.environ
    username = environ.get('DASHBOARD_USERNAME')
    password = environ.get('DASHBOARD_PASSWORD')
    if username and password:
        return {username: password}
    warnings.warn(
        "DASHBOARD_USERNAME / DASHBOARD_PASSWORD are not set; falling back to the "
        "insecure default credentials. Set both variables before deploying.",
        stacklevel=2,
    )
    return {'admin': 'password'}


# Define valid username and password pairs
VALID_USERNAME_PASSWORD_PAIRS = load_auth_pairs()

In [None]:
# Dropdown options. Missing values are dropped so that NaN never shows up as a
# selectable (and non-serialisable) dropdown entry.

# Dropdown options from the 'platform' column
platform_options = df['platform'].dropna().unique()

# Dropdown options from the 'obj_def_name_en' column
object_options = df['obj_def_name_en'].dropna().unique()

# Dropdown options from the 'actor_mbox' column
user_options = df['actor_mbox'].dropna().unique()

# Overview totals. nunique() ignores missing values, whereas len(unique())
# counted NaN as one extra user / platform / statement / object.

# Total number of unique users
total_unique_users = df['actor_mbox'].nunique()

# Total number of instances (platforms) data is collected from
total_instances = df['platform'].nunique()

# Total number of distinct statements
total_statements = df['statement_ID'].nunique()

# Total number of distinct objects
total_objects = df['obj_def_name_en'].nunique()

In [None]:
# The page body is rendered by a callback, so most components that other
# callbacks read from or write to (dropdowns, graphs, the download button) are
# not part of the initial layout. suppress_callback_exceptions=True tells Dash
# not to report those callbacks as referring to non-existent ids.
app = dash.Dash(
    external_stylesheets=[dbc.themes.MORPH],
    suppress_callback_exceptions=True,
)

# the style arguments for the sidebar. We use position:fixed and a fixed width
SIDEBAR_STYLE = {
    "position": "fixed",
    "top": 0,
    "left": 0,
    "bottom": 0,
    "width": "16rem",
    "padding": "2rem 1rem",
    "background-color": "#f8f9fa",
}

# the styles for the main content position it to the right of the sidebar and
# add some padding.
CONTENT_STYLE = {
    "margin-left": "18rem",
    "margin-right": "2rem",
    "padding": "2rem 1rem",
}

# Fixed navigation sidebar with one link per dashboard page
sidebar = html.Div(
    [
        html.H2("Learning Analytics", className="display-4 center"),
        html.Hr(),
        html.P(
            "A Dashboard for presenting the data extracted from TRAX LRS", className="lead center"
        ),
        dbc.Nav(
            [
                dbc.NavLink("Home", href="/", active="exact"),
                dbc.NavLink("Metrics per Instance", href="/instance", active="exact"),
                dbc.NavLink("Metrics per Object", href="/object", active="exact"),
                dbc.NavLink("Metrics per User", href="/user", active="exact"),
            ],
            vertical=True,
            pills=True,
        ),
    ],
    style=SIDEBAR_STYLE,
)

# Placeholder that the page-rendering callback fills with the selected page
content = html.Div(id="page-content", style=CONTENT_STYLE)


def _make_stat_card(title, value, color):
    """Return an inverse-coloured card showing ``title`` above ``value``.

    ``value`` is converted with ``str()``; ``color`` is a Bootstrap contextual
    colour name such as ``"success"``.
    """
    return dbc.Card([
        dbc.CardBody([
            html.H4(title, className="card-title"),
            html.H2(str(value), className="card-subtitle"),
        ])
    ], color=color, inverse=True)


# Statistic cards of the overview page
stat_card_unique_users = _make_stat_card("Total Users", total_unique_users, "success")
stat_card_instances = _make_stat_card("Total Instances", total_instances, "info")
stat_card_statements = _make_stat_card("Total Statements", total_statements, "warning")
stat_card_objects = _make_stat_card("Total Objects", total_objects, "danger")

# Use BasicAuth to wrap around the app and handle authentication
auth = dash_auth.BasicAuth(
    app,
    VALID_USERNAME_PASSWORD_PAIRS
)

# Root layout: URL tracker, sidebar and the page placeholder
app.layout = html.Div([dcc.Location(id="url"), sidebar, content])


# Inline style shared by the explanatory text cards on the overview page.
_INFO_CARD_STYLE = {"margin-bottom": "20px", "marginTop": "20px", "backgroundColor": "#f8f9fa"}


def _graph_row(*graphs, width=12):
    """Return an ``mb-4`` row with one ``width``-wide column per graph.

    Each item of ``graphs`` is a ``(graph_id, figure)`` pair. A ``figure`` of
    ``None`` creates an empty graph that is filled in by a callback.
    """
    columns = []
    for graph_id, figure in graphs:
        graph = dcc.Graph(id=graph_id) if figure is None else dcc.Graph(id=graph_id, figure=figure)
        columns.append(dbc.Col(graph, width=width))
    return dbc.Row(columns, className="mb-4")


def _info_card_row(header, *paragraphs):
    """Return a full-width row holding a text card; paragraphs are separated by a line break."""
    body = []
    for text in paragraphs:
        if body:
            body.append(html.Br())
        body.append(html.P(text, className="card-text"))
    return dbc.Row(
        dbc.Col(
            dbc.Card(
                [
                    dbc.CardHeader(header, className="font-weight-bold text-center"),
                    dbc.CardBody(body),
                ],
                style=dict(_INFO_CARD_STYLE),
                className="w-100"
            ),
            width=12
        )
    )


def _placeholder_row(component_ids, width):
    """Return an ``mb-4`` row of empty columns whose children are set by callbacks."""
    return dbc.Row(
        [dbc.Col(id=component_id, width=width) for component_id in component_ids],
        className="mb-4",
    )


def _dropdown_row(label, dropdown_id, options):
    """Return an ``mb-4`` row with a labelled multi-select dropdown.

    The first option is pre-selected; with no options at all the selection
    starts empty instead of raising ``IndexError``.
    """
    return dbc.Row([
        dbc.Col([
            html.Label(label),
            dcc.Dropdown(
                id=dropdown_id,
                options=[{'label': option, 'value': option} for option in options],
                value=list(options[:1]),
                multi=True  # allow multiple selection
            )
        ], width=4)
    ], className="mb-4")


def _overview_page():
    """Build the "/" page: global statistic cards and every precomputed figure."""
    return dbc.Container(
        [
            html.H1("Overview", className="mb-4"),
            html.Hr(),  # This is the horizontal line
            dbc.Row([
                dbc.Col(stat_card_unique_users, width=6),
                dbc.Col(stat_card_instances, width=6),
            ], className="mb-4"),
            dbc.Row([
                dbc.Col(stat_card_statements, width=6),
                dbc.Col(stat_card_objects, width=6),
            ], className="mb-4"),
            _graph_row(('statements-per-day-all', fig_5)),
            _graph_row(('distribution-of-verbs-all', fig_1)),
            _graph_row(('fig-boxplot-verbs', fig_boxplot_verbs)),
            _graph_row(('fig-boxplot-basic', fig_boxplot_basic)),
            _graph_row(
                ('distribution-per-activity-all', fig_2),
                ('statements-per-platform-all', fig_3),
                width=6,
            ),
            _graph_row(
                ('languages-all', fig_4),
                ('statements-per-timeframe-all', fig_6),
                width=6,
            ),
            _info_card_row("Clustering Analysis", info_text_clustering, info_text_shap),
            _graph_row(('fig-dbscan', fig_dbscan)),
            _graph_row(('fig-elbow', elbow_fig)),
            _graph_row(('fig-kmeans', fig_kmeans)),
            _graph_row(('fig-shap-kmeans', shap_summary_fig_kmeans)),
            _graph_row(('fig-hierarchical', fig_h_cl)),
            _graph_row(('fig-shap-hierarchical', shap_summary_fig_hierarchical)),
            _info_card_row("Association Rule Mining Insights", info_text_arm),
            _graph_row(('fig-a-r-m', fig_a_r_m)),
            dbc.Row(
                [
                    dbc.Col(
                        dbc.Button(
                            "Association Rules Mining CSV",
                            id="btn_csv",
                            style={"backgroundColor": "navy", "color": "white"},  # Navy blue button with white text
                            className="mx-auto d-block"  # Center the button
                        ),
                        width=12  # Take up the full width to center the button
                    ),
                ],
                className="mb-4",  # Add some margin below the row
            ),
            dcc.Download(id="download-dataframe-csv"),
            _info_card_row("Sequence Insights", info_text_sequence),
            _graph_row(('fig-prefixscan', fig_prefixscan)),
            _graph_row(('fig-sankey', fig_sankey)),
            _info_card_row("Time-Based Patterns Analysis", info_text_time_based_analysis),
            _graph_row(('fig-time-based-patterns-within-session', fig_time_based_patterns_within_session)),
            _graph_row(('fig-lifelines', fig_lifelines)),
            _graph_row(('fig-pm4py', fig_pm4py)),
        ],
        fluid=True  # Set to True for a full width container
    )


def _instance_page():
    """Build the "/instance" page: platform selector plus empty cards and graphs."""
    return dbc.Container([
        html.H1("Platform Metrics Visualization", className="mb-4"),
        html.Hr(),
        _dropdown_row("Select Platform:", "platform-dropdown", platform_options),
        _placeholder_row(
            ['stat-card-unique-users-platform', 'stat-card-statements-platform', 'stat-card-objects-platform'],
            width=4,
        ),
        _graph_row(('statements-per-day-platform', None)),
        _graph_row(('distribution-of-verbs-platform', None)),
        _graph_row(
            ('languages-platform', None),
            ('statements-per-timeframe-platform', None),
            width=6,
        ),
        _graph_row(('distribution-per-activity-platform', None), width=6),
    ])


def _object_page():
    """Build the "/object" page: object selector plus empty cards and graphs."""
    return dbc.Container([
        html.H1("Object Metrics Visualization", className="mb-4"),
        html.Hr(),  # This is the horizontal line
        _dropdown_row("Select Object:", "object-dropdown", object_options),
        _placeholder_row(
            ['stat-card-unique-users-object', 'stat-card-statements-object'],
            width=6,
        ),
        _graph_row(('statements-per-day-object', None)),
        _graph_row(('distribution-of-verbs-object', None)),
        _graph_row(
            ('languages-object', None),
            ('statements-per-timeframe-object', None),
            width=6,
        ),
        _graph_row(('distribution-per-activity-object', None), width=6),
    ])


def _user_page():
    """Build the "/user" page: user selector plus empty cards, graphs and table slot."""
    return dbc.Container([
        html.H1("User Metrics Visualization", className="mb-4"),
        html.Hr(),  # This is the horizontal line
        _dropdown_row("Select User(with email):", "user-dropdown", user_options),
        _placeholder_row(
            ['stat-card-instances-user', 'stat-card-statements-user', 'stat-card-objects-user'],
            width=4,
        ),
        _graph_row(('statements-per-day-user', None)),
        _graph_row(('distribution-of-verbs-user', None)),
        _graph_row(('verbs-over-time-user', None)),
        _graph_row(('time-spent-user', None)),
        _graph_row(
            ('languages-user', None),
            ('statements-per-timeframe-user', None),
            width=6,
        ),
        _graph_row(
            ('distribution-per-activity-user', None),
            ('statements-per-platform-user', None),
            width=6,
        ),
        dbc.Row([
            dbc.Col(id="user-data-table")
        ], className="mb-4"),
    ])


@app.callback(Output("page-content", "children"), [Input("url", "pathname")])
def render_page_content(pathname):
    """Return the page layout that belongs to the current URL path.

    Parameters
    ----------
    pathname : str or None
        Path reported by the ``url`` location component.

    Returns
    -------
    The container for "/", "/instance", "/object" or "/user"; any other path
    yields a "404: Not found" message.
    """
    page_builders = {
        "/": _overview_page,
        "/instance": _instance_page,
        "/object": _object_page,
        "/user": _user_page,
    }
    build_page = page_builders.get(pathname)
    if build_page is not None:
        # Pages are built on every request so they pick up the current
        # module-level figures and options.
        return build_page()

    # If the user tries to reach a different page, return a 404 message
    return html.Div(
        [
            html.H1("404: Not found", className="text-danger"),
            html.Hr(),
            html.P(f"The pathname {pathname} was not recognised..."),
        ],
        className="p-3 bg-light rounded-3",
    )

# Download callback for the "Association Rules Mining CSV" button
@app.callback(
    Output("download-dataframe-csv", "data"),
    Input("btn_csv", "n_clicks"),
    prevent_initial_call=True
)
def download_csv(n_clicks):
    """Send the filtered association rules to the browser as ``filtered_rules.csv``.

    ``n_clicks`` only triggers the callback; its value is not used.
    """
    csv_writer = filtered_rules.to_csv
    return dcc.send_data_frame(csv_writer, filename="filtered_rules.csv")

def _platform_stat_card(title, value, color):
    """Return an inverse-coloured card showing ``title`` above ``str(value)``."""
    return dbc.Card([
        dbc.CardBody([
            html.H4(title, className="card-title"),
            html.H2(str(value), className="card-subtitle"),
        ])
    ], color=color, inverse=True)


# Callbacks to update the platform plots
@app.callback(
    [Output("distribution-of-verbs-platform", "figure"),
     Output("statements-per-day-platform", "figure"),
     Output("distribution-per-activity-platform", "figure"),
     Output("languages-platform", "figure"),
     Output("statements-per-timeframe-platform", "figure"),
     Output("stat-card-unique-users-platform", "children"),
     Output("stat-card-statements-platform", "children"),
     Output("stat-card-objects-platform", "children")],
    [Input("platform-dropdown", "value")
    ]
)
def update_graphs_platform(selected_platform):
    """Rebuild every figure and statistic card of the platform page.

    Parameters
    ----------
    selected_platform : list, str or None
        Current value of the platform dropdown. ``None`` (cleared selection) is
        treated as an empty selection and a single string as a one-item list.

    Returns
    -------
    tuple
        In the order of the callback outputs: verb distribution bar chart,
        statements-per-day line chart, pie charts per object type, language and
        timeframe, then the cards for users, statements and objects.
    """
    # Series.isin() only accepts list-likes; normalise what the dropdown sends.
    if selected_platform is None:
        selected_platform = []
    elif isinstance(selected_platform, str):
        selected_platform = [selected_platform]

    # Filter data based on selected platform
    filtered_data_platform = df[df['platform'].isin(selected_platform)]

    # Distinct counts for the cards; nunique() does not count missing values
    unique_users_platform_count = filtered_data_platform['actor_mbox'].nunique()
    statements_platform_count = filtered_data_platform['statement_ID'].nunique()
    objects_platform_count = filtered_data_platform['obj_def_name_en'].nunique()

    # Verb distribution: count once and plot from a tidy frame whose column
    # names double as the axis titles
    verb_counts = (
        filtered_data_platform['verb_display_en']
        .value_counts()
        .rename_axis('Verb')
        .reset_index(name='Count')
    )
    distribution_of_verbs_platform_fig = px.bar(
        verb_counts, x='Verb', y='Count', title='Distribution of Verbs'
    )

    # Statements per day
    daily_counts = filtered_data_platform.groupby('date').size().reset_index(name='count')

    statements_per_day_platform_fig = go.Figure()
    statements_per_day_platform_fig.add_trace(go.Scatter(
        x=daily_counts['date'],
        y=daily_counts['count'],
        mode='lines+markers',
        name='Time Series',
        line=dict(dash='dashdot'),
    ))
    statements_per_day_platform_fig.update_layout(
        title='Statements per Day',
        xaxis_title='Date',
        yaxis_title='Count',
    )
    # Add the range slider on the date axis
    statements_per_day_platform_fig.update_layout(
        xaxis=dict(
            rangeslider=dict(
                visible=True
            ),
            type="date"
        )
    )

    # Pie charts (slice size = number of statements)
    distribution_per_activity_platform_fig = px.pie(filtered_data_platform, names='obj_type', title='Distribution per Activity')
    languages_platform_fig = px.pie(filtered_data_platform, names='language', title='Language Distribution')
    statements_per_timeframe_platform_fig = px.pie(filtered_data_platform, names='timeframe', title='Statements per Timeframe')

    # Statistic cards
    stat_card_unique_users_platform = _platform_stat_card("Total Users", unique_users_platform_count, "success")
    stat_card_statements_platform = _platform_stat_card("Total Statements", statements_platform_count, "warning")
    stat_card_objects_platform = _platform_stat_card("Total Objects", objects_platform_count, "danger")

    return (
        distribution_of_verbs_platform_fig,
        statements_per_day_platform_fig,
        distribution_per_activity_platform_fig,
        languages_platform_fig,
        statements_per_timeframe_platform_fig,
        stat_card_unique_users_platform,
        stat_card_statements_platform,
        stat_card_objects_platform,
    )

# Callbacks to update the object plots
@app.callback(
    [Output("distribution-of-verbs-object", "figure"),
     Output("statements-per-day-object", "figure"),
     Output("distribution-per-activity-object", "figure"),
     Output("languages-object", "figure"),
     Output("statements-per-timeframe-object", "figure"),
     Output("stat-card-unique-users-object", "children"),
     Output("stat-card-statements-object", "children")],
    [Input("object-dropdown", "value")
    ]
)
def update_graphs_object(selected_object):
    """Rebuild the object-tab figures and stat cards for the current selection.

    Args:
        selected_object: Value of the ``object-dropdown`` component, matched
            against ``df['obj_def_name_en']``. May be a list of object names,
            a single name, or ``None`` when nothing is selected.

    Returns:
        A 7-tuple in the order of the callback outputs: verb distribution bar
        chart, statements-per-day line chart, activity pie, language pie,
        timeframe pie, "Total Users" card and "Total Statements" card.
    """
    # Series.isin() only accepts list-likes and raises TypeError for None or a
    # bare string, so normalise whatever the dropdown sent into a list first.
    if selected_object is None:
        selected_object = []
    elif not pd.api.types.is_list_like(selected_object):
        selected_object = [selected_object]

    # Filter data based on the selected object(s)
    filtered_data_object = df[df['obj_def_name_en'].isin(selected_object)]
    unique_users_object_count = len(filtered_data_object['actor_mbox'].unique())
    statements_object_count = len(filtered_data_object['statement_ID'].unique())

    # Graphs per object
    # Count each verb once and reuse the result for both axes.
    verb_totals = filtered_data_object['verb_display_en'].value_counts()
    distribution_of_verbs_object_fig = px.bar(filtered_data_object, x=verb_totals.index,
                      y=verb_totals.values,
                      labels={'x': 'Verb', 'y': 'Count'},
                      title='Distribution of Verbs')

    # Number of statements per calendar day
    daily_counts = filtered_data_object.groupby('date').size().reset_index(name='count')

    statements_per_day_object_fig = go.Figure()

    statements_per_day_object_fig.add_trace(go.Scatter(x=daily_counts['date'], y=daily_counts['count'], mode='lines+markers', name='Time Series',
                             line=dict(dash='dashdot')))

    statements_per_day_object_fig.update_layout(title='Statements per Day',
                      xaxis_title='Date',
                      yaxis_title='Count')
    # Update the layout to add the range slider
    statements_per_day_object_fig.update_layout(
        xaxis=dict(
            rangeslider=dict(
                visible=True
            ),
            type="date"
        )
    )

    distribution_per_activity_object_fig = px.pie(filtered_data_object, names='obj_type', title='Distribution per Activity')
    languages_object_fig = px.pie(filtered_data_object, names='language', title='Language Distribution')
    statements_per_timeframe_object_fig = px.pie(filtered_data_object, names='timeframe', title='Statements per Timeframe')

    stat_card_unique_users_object = dbc.Card([
        dbc.CardBody([
            html.H4("Total Users", className="card-title"),
            html.H2(str(unique_users_object_count), className="card-subtitle"),
        ])
    ], color="success", inverse=True)

    stat_card_statements_object = dbc.Card([
        dbc.CardBody([
            html.H4("Total Statements", className="card-title"),
            html.H2(str(statements_object_count), className="card-subtitle"),
        ])
    ], color="warning", inverse=True)

    return distribution_of_verbs_object_fig, statements_per_day_object_fig, distribution_per_activity_object_fig, languages_object_fig, statements_per_timeframe_object_fig, stat_card_unique_users_object, stat_card_statements_object

# Callbacks to update the user plots
@app.callback(
    [Output("distribution-of-verbs-user", "figure"),
     Output("statements-per-day-user", "figure"),
     Output("verbs-over-time-user", "figure"),
     Output("time-spent-user", "figure"),
     Output("distribution-per-activity-user", "figure"),
     Output("languages-user", "figure"),
     Output("statements-per-timeframe-user", "figure"),
     Output("statements-per-platform-user", "figure"),
     Output("stat-card-instances-user", "children"),
     Output("stat-card-statements-user", "children"),
     Output("stat-card-objects-user", "children"),
     Output("user-data-table", "children")
    ],
    [Input("user-dropdown", "value")
    ]
)
def update_graphs_user(selected_user):
    """Rebuild the user-tab figures, stat cards and table for the current selection.

    Args:
        selected_user: Value of the ``user-dropdown`` component, matched
            against ``df['actor_mbox']``. May be a list of users, a single
            user, or ``None`` when nothing is selected.

    Returns:
        A 12-tuple in the order of the callback outputs: verb distribution bar
        chart, statements-per-day chart, verbs-over-time chart, time-spent
        chart, activity pie, language pie, timeframe pie, platform pie,
        "Instances Registered" card, "Total Statements" card, "Total Objects"
        card and the per-object ``DataTable``.
    """
    # Series.isin() only accepts list-likes and raises TypeError for None or a
    # bare string, so normalise whatever the dropdown sent into a list first.
    if selected_user is None:
        selected_user = []
    elif not pd.api.types.is_list_like(selected_user):
        selected_user = [selected_user]

    # Filter data based on selected learner(s)
    filtered_data_user = df[df['actor_mbox'].isin(selected_user)]
    instances_user_count = len(filtered_data_user['platform'].unique())
    statements_user_count = len(filtered_data_user['statement_ID'].unique())
    objects_user_count = len(filtered_data_user['obj_def_name_en'].unique())

    # Group data by date and verb to get the count of each verb for each date
    verb_counts = filtered_data_user.groupby(['date', 'verb_display_en']).size().reset_index(name='count')

    # Gap (in minutes) between two statements after which a new session starts
    threshold_minutes = 30

    # Session detection compares each statement with the previous one, so the
    # rows must be chronological and must not mix users. Sorting returns a new
    # frame, which also keeps the helper column off the slice of ``df``.
    session_data = filtered_data_user.sort_values(['actor_mbox', 'timestamp'])
    time_diff = session_data['timestamp'].diff()
    user_changed = session_data['actor_mbox'].ne(session_data['actor_mbox'].shift())

    # A new session starts on the first row, whenever the user changes, or
    # when the gap to the previous statement exceeds the threshold.
    new_session = (
        time_diff.isnull()
        | user_changed
        | (time_diff > pd.Timedelta(minutes=threshold_minutes))
    )

    # Running count of session starts gives every session a unique number
    session_data = session_data.assign(session_id=new_session.cumsum())

    # Group by day, platform, and session ID to calculate the duration of each session
    sessions = session_data.groupby(
        [session_data['timestamp'].dt.date, 'platform', 'session_id']
    ).agg(start_time=('timestamp', 'min'), end_time=('timestamp', 'max'))
    sessions['duration_minutes'] = (sessions['end_time'] - sessions['start_time']).dt.total_seconds() / 60

    # Drop unnecessary columns
    sessions = sessions.drop(columns=['start_time', 'end_time'])

    # Reset index for better structure; the day level comes back as a column
    # named 'timestamp' because it was derived from that series.
    sessions_reset = sessions.reset_index()

    # Number of statements per object and the platform of each object
    # (the frame is already restricted to the selected users).
    user_table_data = filtered_data_user.groupby('obj_def_name_en').agg(
        num_statements=pd.NamedAgg(column='statement_ID', aggfunc='size'),
        platform=pd.NamedAgg(column='platform', aggfunc='first')
    ).reset_index()

    # Graphs per user
    # Count each verb once and reuse the result for both axes.
    verb_totals = filtered_data_user['verb_display_en'].value_counts()
    distribution_of_verbs_user_fig = px.bar(filtered_data_user, x=verb_totals.index,
                      y=verb_totals.values,
                      labels={'x': 'Verb', 'y': 'Count'},
                      title='Distribution of Verbs')

    # Number of statements per calendar day
    daily_counts = filtered_data_user.groupby('date').size().reset_index(name='count')

    statements_per_day_user_fig = go.Figure()

    statements_per_day_user_fig.add_trace(go.Scatter(x=daily_counts['date'], y=daily_counts['count'], mode='lines+markers', name='Time Series',
                             line=dict(dash='dashdot')))

    statements_per_day_user_fig.update_layout(title='Statements per Day',
                      xaxis_title='Date',
                      yaxis_title='Count')
    # Update the layout to add the range slider
    statements_per_day_user_fig.update_layout(
        xaxis=dict(
            rangeslider=dict(
                visible=True
            ),
            type="date"
        )
    )

    distribution_per_activity_user_fig = px.pie(filtered_data_user, names='obj_type', title='Distribution per Activity')
    languages_user_fig = px.pie(filtered_data_user, names='language', title='Language Distribution')
    statements_per_timeframe_user_fig = px.pie(filtered_data_user, names='timeframe', title='Statements per Timeframe')
    statements_per_platform_user_fig = px.pie(filtered_data_user, names='platform', title='Statements per Platform')

    stat_card_instances_user = dbc.Card([
        dbc.CardBody([
            html.H4("Instances Registered", className="card-title"),
            html.H2(str(instances_user_count), className="card-subtitle"),
        ])
    ], color="info", inverse=True)

    stat_card_statements_user = dbc.Card([
        dbc.CardBody([
            html.H4("Total Statements", className="card-title"),
            html.H2(str(statements_user_count), className="card-subtitle"),
        ])
    ], color="warning", inverse=True)

    stat_card_objects_user = dbc.Card([
        dbc.CardBody([
            html.H4("Total Objects", className="card-title"),
            html.H2(str(objects_user_count), className="card-subtitle"),
        ])
    ], color="danger", inverse=True)

    # Create an interactive table to display the results
    user_data_table = dash_table.DataTable(
        columns=[
            {"name": "Object Name", "id": "obj_def_name_en"},
            {"name": "Number of Statements", "id": "num_statements"},
            {"name": "Platform", "id": "platform"},
        ],
        data=user_table_data.to_dict('records'),
        filter_action="native",
        sort_action="native",
        page_action="native",
        page_size=10,
        style_table={'overflowX': 'auto'},
        style_cell={
            'textAlign': 'center',
            'whiteSpace': 'normal',
            'height': 'auto',
        },
        style_data_conditional=[
            {
                'if': {'row_index': 'odd'},
                'backgroundColor': 'rgb(248, 248, 248)'
            }
        ],
        export_format="csv",
        export_headers="display",
    )

    # Summarize data to get total time spent per day per platform
    daily_time_spent = sessions_reset.groupby(['timestamp', 'platform'])['duration_minutes'].sum().reset_index()

    # Create the figure
    time_spent_user_fig = go.Figure()

    # For each platform, add a trace to the figure
    for platform in daily_time_spent['platform'].unique():
        platform_data = daily_time_spent[daily_time_spent['platform'] == platform]
        time_spent_user_fig.add_trace(go.Scatter(x=platform_data['timestamp'], y=platform_data['duration_minutes'],
                                    mode='lines+markers', name=platform,
                                    line=dict(dash='dashdot')))

    # Update the layout; the interval in the title follows the threshold above
    time_spent_user_fig.update_layout(title=f'Time Spent per Day per Platform (based on xAPI statements with {threshold_minutes} minutes interval)',
                          xaxis_title='Date',
                          yaxis_title='Minutes Spent')

    # Add the range slider to the x-axis
    time_spent_user_fig.update_layout(
        xaxis=dict(
            rangeslider=dict(
                visible=True
            ),
            type="date"
        )
    )

    # Create the figure
    verbs_over_time_user_fig = go.Figure()

    # For each verb, add a trace to the figure
    for verb in verb_counts['verb_display_en'].unique():
        verb_data = verb_counts[verb_counts['verb_display_en'] == verb]
        verbs_over_time_user_fig.add_trace(go.Scatter(x=verb_data['date'], y=verb_data['count'],
                                 mode='lines+markers', name=verb,
                                 line=dict(dash='dashdot')))

    # Update the layout
    verbs_over_time_user_fig.update_layout(title='Verb Counts Over Time',
                      xaxis_title='Date',
                      yaxis_title='Count of Verbs')

    # Add the range slider to the x-axis
    verbs_over_time_user_fig.update_layout(
        xaxis=dict(
            rangeslider=dict(
                visible=True
            ),
            type="date"
        )
    )

    return distribution_of_verbs_user_fig, statements_per_day_user_fig, verbs_over_time_user_fig, time_spent_user_fig, distribution_per_activity_user_fig, languages_user_fig, statements_per_timeframe_user_fig, statements_per_platform_user_fig, stat_card_instances_user, stat_card_statements_user, stat_card_objects_user, user_data_table

if __name__ == "__main__":
    # Dash 3 removed ``app.run_server`` in favour of ``app.run``, while early
    # Dash 2 releases only provide ``run_server``; pick whichever exists so the
    # notebook starts on either.
    # NOTE(review): debug=True turns on the Dash dev tools - confirm this is
    # only used for local development.
    launch_server = app.run if hasattr(app, "run") else app.run_server
    launch_server(port=8050, debug=True, use_reloader=False)