In [1]:
import os
import uuid
import time 
import datetime
import pandas as pd
import numpy as np
from spmf import Spmf
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
import numpy as np
import random
from graphviz import Digraph
import networkx as nx
from igraph import Graph, EdgeSeq
from collections import defaultdict
from networkx.drawing.nx_agraph import graphviz_layout

Matplotlib is building the font cache; this may take a moment.


# Pré-processamento

In [None]:
dir = r'caminho para o local dos dados'
def list_files(dir, turma="turma a ser escolhido, por exemplo: 2023-01-Programacao"):
    """Recursively collect the CSV files of one class ("turma") found under ``dir``.

    Args:
        dir: Root directory to walk. The name shadows the builtin but is kept
            so existing callers keep working.
        turma: Substring that must occur in a file's path for it to be
            selected. The default is the placeholder the notebook shipped
            with; pass the real class folder name, e.g. "2023-01-Programacao".

    Returns:
        Alphabetically sorted list of paths that end in ".csv" and contain
        ``turma``.
    """
    encontrados = []
    for root, _dirs, files in os.walk(dir):
        for name in files:
            filepath = os.path.join(root, name)
            if filepath.endswith(".csv") and turma in filepath:
                encontrados.append(filepath)
    return sorted(encontrados)
#end list_files

In [None]:
df_list = list_files(dir)

In [None]:
df_list

In [None]:
df = pd.read_csv(df_list[1], sep=',')

In [None]:
df.head()

In [None]:
def modulos_id(modulos):
    """Extract the id embedded in each description string.

    For descriptions that mention "VPL" the result is everything after the
    last ``"id "``. For all other descriptions it is the text between the end
    of the last ``"id '"`` (or the start of the string when absent) and the
    last single quote. When there is no quote at all the final character is
    dropped, because the end index is -1.

    Args:
        modulos: Iterable of description strings.

    Returns:
        A new list with one extracted string per input element.
    """
    extraidos = []
    for frase in modulos:
        if frase.find("VPL") != -1:
            # VPL descriptions carry the id unquoted at the end of the text.
            extraidos.append(frase.rpartition("id ")[2])
            continue

        # Index just past the last "id '"; 0 when the marker never occurs.
        inicio = len(frase) - len(frase.rpartition("id '")[2])
        # Index of the closing quote; rfind's -1 reproduces the "no quote" case.
        fim = frase.rfind("'")
        extraidos.append(frase[inicio:fim])

    return extraidos

In [None]:
def process_logs(df_name, weeks_to_keep, truncate_to_week):
    """Load one log CSV and keep the events of the first ``weeks_to_keep`` weeks.

    Args:
        df_name: Path of the log CSV. It must contain the columns "Hora",
            "Nome completo", "Contexto do Evento", "Nome do evento" and
            "Descrição".
        weeks_to_keep: Number of weeks, counted from the earliest event date,
            to keep. Events on or after that limit are discarded.
        truncate_to_week: Currently ignored. The week truncation it used to
            control is disabled; the parameter is kept so callers keep working.

    Returns:
        DataFrame with the columns "date" (timestamp truncated to midnight),
        "key" (copy of "Nome completo"), "Contexto do Evento",
        "Nome do evento", "Semana" (1-based week number relative to the
        earliest date) and "id do modulo" (copy of "Descrição").
    """
    df = pd.read_csv(df_name, sep=",")

    # zfill(16) left-pads timestamps whose day has a single digit so that every
    # value has the same width before parsing (day comes first).
    momento = pd.to_datetime(df["Hora"].str.zfill(16), dayfirst=True)
    # Drop hours, minutes and seconds but stay in datetime64.
    df["date"] = momento.dt.normalize()
    df["key"] = df["Nome completo"]

    inicio_curso = df["date"].min()
    limite = inicio_curso + datetime.timedelta(weeks=weeks_to_keep)
    # .copy() makes the filtered frame independent, so the column assignments
    # below do not write into a slice of the original frame.
    df = df.loc[df["date"] < limite].copy()

    df["Semana"] = (df["date"] - inicio_curso).dt.days // 7 + 1
    df["id do modulo"] = df["Descrição"]

    return df[["date", "key", "Contexto do Evento", "Nome do evento", "Semana", "id do modulo"]]
#end process_logs

In [None]:
def process_grades(df_name):
    """Load one grades CSV and derive the pass/fail target per student.

    Args:
        df_name: Path of the grades CSV. It must contain the columns "Nome",
            "Sobrenome" and "Total do curso (Real)".

    Returns:
        DataFrame with "key" ("Nome" + space + "Sobrenome"), "target" (final
        grade as float, with "-" counted as 0.0) and "target_cat" ('Aprovado'
        when target >= 6.0, otherwise 'Reprovado').
    """
    notas = pd.read_csv(df_name, sep=",")

    # Prints the frame summary (column names, dtypes, non-null counts).
    notas.info()

    nome_completo = notas["Nome"] + " " + notas["Sobrenome"].astype(str)
    nota_final = notas["Total do curso (Real)"].replace("-", 0.0).astype(float)
    situacao = np.where(nota_final >= 6.0, 'Aprovado', 'Reprovado')

    resultado = notas.assign(key=nome_completo, target=nota_final, target_cat=situacao)
    return resultado[["key", "target", "target_cat"]]
#end process_grades

In [None]:
def process_dfs(df_list, weeks_to_keep=16, truncate_to_week=False):
    """Merge each (logs, grades) pair of CSV files and attach random identifiers.

    ``df_list`` is consumed two paths at a time. Within a pair, the path that
    contains "logs" is read with ``process_logs`` and the other one with
    ``process_grades``; when the first path does not contain "logs" it is
    treated as the grades file.

    Args:
        df_list: List of CSV paths, organised as consecutive pairs.
        weeks_to_keep: Forwarded to ``process_logs``.
        truncate_to_week: Forwarded to ``process_logs``.

    Returns:
        One DataFrame with the rows of every pair, with columns "id_group",
        "id_subject", "key", the log columns, "target" and "target_cat".
        "id_group" is one random UUID per pair and "id_subject" one random
        UUID per distinct "key". Note that "key" (the student name) is still
        part of the result.
    """
    to_concat = []
    for i in range(0, len(df_list), 2):
        primeiro, segundo = df_list[i], df_list[i + 1]
        if "logs" in primeiro:
            df_logs = process_logs(primeiro, weeks_to_keep, truncate_to_week)
            df_grades = process_grades(segundo)
        else:
            df_grades = process_grades(primeiro)
            df_logs = process_logs(segundo, weeks_to_keep, truncate_to_week)
        #end if
        data_cols = [col for col in df_logs.columns if col != "key"]

        # Only sizes are reported; student names are deliberately not printed.
        print("Logs size = {}".format(len(df_logs)))
        print("Grades size = {}".format(len(df_grades)))

        # Inner join: events of students without a grade row (and vice versa)
        # are dropped.
        final_df = df_logs.merge(df_grades, on="key", how="inner")

        # One identifier for the whole pair and one per student. They are
        # assigned directly as strings, so no integer placeholder column has
        # to be overwritten with text afterwards.
        final_df["id_group"] = str(uuid.uuid4())
        id_por_aluno = {aluno: str(uuid.uuid4()) for aluno in final_df["key"].unique()}
        final_df["id_subject"] = final_df["key"].map(id_por_aluno)

        to_keep = ["id_group", "id_subject", "key"] + data_cols + ["target", "target_cat"]
        to_concat.append(final_df[to_keep])
    #end for
    # ignore_index avoids repeated index labels when several pairs are stacked.
    return pd.concat(to_concat, ignore_index=True)
#end process_dfs

In [None]:
df_final = process_dfs(df_list, weeks_to_keep=16)

# Remoção das atividades

In [None]:
# Drop every event whose context mentions one of these activity types.
modulos_para_remover = ['Laboratório Virtual de Programação', 'Curso', 'Tarefa', 'Exercícios', 'Questionário', 'Enquete']
padrao_remocao = '|'.join(modulos_para_remover)

# A boolean mask removes exactly the matching rows. Dropping by index label
# would also remove unrelated rows that happen to share a label. na=False
# keeps rows without a context instead of producing an invalid NaN mask.
e_atividade = df_final['Contexto do Evento'].str.contains(padrao_remocao, na=False)
df_final = df_final[~e_atividade].copy()

In [None]:
df_final.head()

In [None]:
# Add the placeholder column for the numeric module id (filled in below).
# The guard keeps the cell re-runnable: without it a second execution would
# silently add a duplicate "Modulo" column.
if "Modulo" not in df_final.columns:
    df_final.insert(8, "Modulo", 0)
df_final = df_final.reset_index(drop=True)

In [None]:
df_final.head()

#### Dicionário de Módulos

In [None]:
Modulos_acessados = df_final['Contexto do Evento'].unique()
Modulos_acessados

In [None]:
# Numeric id -> event context, numbered in order of first appearance.
dicionario_modulos = dict(enumerate(Modulos_acessados))

print(dicionario_modulos)

In [None]:
# Translate each event context into its numeric id with a single vectorised
# lookup. This does not depend on the frame having a 0..n-1 index and avoids
# one .loc write per row.
modulo_para_id = {modulo: key for key, modulo in dicionario_modulos.items()}
# Contexts without a match (e.g. missing values) keep the placeholder id 0.
df_final['Modulo'] = df_final['Contexto do Evento'].map(modulo_para_id).fillna(0).astype(int)
df_final.head()

In [None]:
df_final.info()

# Verificar Ocorrência das atividades

In [None]:
Qtn_acessos = df_final.groupby(['Contexto do Evento'], as_index=False).agg({"key": 'count'}).sort_values(by='key', ascending=False)

# Render the access counts held in Qtn_acessos as a two-column table.
cabecalho_tabela = dict(
    values=['Ação', 'Quantidade de Acessos'],
    line_color='darkslategray',
    align=['left', 'center'],
    fill_color='royalblue',
    font=dict(color='white', size=12),
    height=40,
)
celulas_tabela = dict(
    values=[Qtn_acessos['Contexto do Evento'], Qtn_acessos['key']],
    line_color='darkslategray',
    fill=dict(color=['paleturquoise', 'lightgray']),
    align=['left', 'center'],
    font_size=12,
    height=30,
)

tabela = go.Table(columnwidth=[400, 100], header=cabecalho_tabela, cells=celulas_tabela)
fig = go.Figure(data=[tabela])

fig.show()

# Pré-processamento para aplicar o algoritmo

In [None]:
# Pre-processing before running the mining algorithm: keep only the columns
# the miner needs and replace each student name by a small integer.
colunasSelecionadas = ['key', 'date', 'Modulo', 'target_cat']
eventos = df_final.filter(items=colunasSelecionadas)

# Students are numbered 0..n-1 in order of first appearance.
alunos = eventos['key'].unique()
key_to_number = dict(zip(alunos, range(len(alunos))))
eventos['key'] = eventos['key'].map(key_to_number)

num_students = len(eventos['key'].unique())
num_students

In [None]:
eventos.head()

In [None]:
# Split each student's events into sequences: a new sequence starts at the
# student's first event and whenever the gap to the previous event exceeds
# `restricao`.
eventos['date'] = pd.to_datetime(eventos['date'])

# Sort by student first and then by time, so each student's events are
# contiguous. With a date-only sort the running counter below would give rows
# of different students the same sequence id.
eventos = eventos.sort_values(by=['key', 'date'])
eventos['gap'] = eventos.groupby('key')['date'].diff()

# NOTE(review): process_logs truncates 'date' to midnight, so a gap is either
# zero or at least one day. The 30-minute threshold therefore acts as "one
# sequence per student per day". Keep the time of day upstream if real
# 30-minute sessions are wanted.
restricao = pd.Timedelta(minutes=30)
eventos['inicio'] = (eventos['gap'] > restricao) | (eventos['gap'].isnull())

# Running count of sequence starts gives a global, 1-based sequence id.
eventos['sequencia'] = eventos['inicio'].cumsum()
eventos = eventos.drop(['gap', 'inicio'], axis=1)
eventos = eventos.reset_index(drop=True)
eventos.head(50)

#### Análise dos Dados

In [None]:
# .copy() makes the cleaned frame independent, so adding a column below does
# not write into a slice of the previous frame.
df_final = df_final.dropna().copy()

# Simplified event type: first word of the context with one trailing ':'
# removed. The .str accessor leaves missing first words as NaN instead of
# raising.
df_final['evento_simplificado'] = (
    df_final['Contexto do Evento']
    .str.split()
    .str.get(0)
    .str.replace(r':$', '', regex=True)
)

df_final.head()

In [None]:
df_final.info()

In [None]:
# Events per simplified event type (rows) and final outcome (columns).
count_df = (
    df_final
    .groupby(['evento_simplificado', 'target_cat'])
    .size()
    .unstack(fill_value=0)
)

# One bar series per outcome, drawn side by side.
barras = [
    go.Bar(x=count_df.index, y=count_df[situacao], name=situacao)
    for situacao in count_df.columns
]
fig = go.Figure(data=barras)

fig.update_layout(
    title='Proporção de Clique no Ambiente de Ensino Alunos Aprovados e Reprovados',
    xaxis_title='Módulos',
    yaxis_title='Cliques',
    barmode='group',
)
fig.show()

In [None]:
count_df

In [None]:
eventos.info()

In [None]:
# Build one list per sequence id: [module ids in row order..., key, target_cat].
# Grouping by 'sequencia' emits every sequence, including the last one, and
# takes key/target from the sequence's own rows, so a single-event sequence
# cannot inherit them from the previous sequence.
sequencias_eventos = []
for _, grupo in eventos.groupby('sequencia', sort=True):
    # .tolist() yields plain Python values rather than numpy scalars.
    modulos_da_sequencia = grupo['Modulo'].tolist()
    chave = grupo['key'].tolist()[-1]
    situacao = grupo['target_cat'].tolist()[-1]
    sequencias_eventos.append(modulos_da_sequencia + [chave, situacao])

# Show a sample instead of dumping every sequence into the output.
sequencias_eventos[:5]

#### Separando as sequências

In [None]:
# One slot per student, indexed by the numeric key. Each slot collects that
# student's sequences of module ids.
sequencia_aprovados = [[] for _ in range(num_students)]
sequencia_reprovados = [[] for _ in range(num_students)]

for sequencia in sequencias_eventos:
    # Each entry is [module ids..., key, target_cat]. Unpacking leaves
    # sequencias_eventos untouched, so this cell can be re-run safely.
    *modulos, chave, situacao = sequencia
    destino = sequencia_aprovados if situacao == 'Aprovado' else sequencia_reprovados
    destino[int(chave)].append(modulos)

# (students with approved sequences, students with failed sequences)
sum(bool(s) for s in sequencia_aprovados), sum(bool(s) for s in sequencia_reprovados)

In [None]:
def remover_vazios(lista):
    """Remove, in place, every falsy element (e.g. an empty list) from ``lista``.

    Args:
        lista: List to clean. The same list object is modified.

    Returns:
        None.
    """
    # Slice assignment keeps the caller's list object and replaces its contents.
    lista[:] = [elemento for elemento in lista if elemento]



In [None]:
remover_vazios(sequencia_aprovados)
len(sequencia_aprovados)

In [None]:

remover_vazios(sequencia_reprovados)
len(sequencia_reprovados)

In [None]:
sequencia_reprovados

In [None]:
sequencia_aprovados

### Teste Aprovados

In [None]:
# Run PrefixSpan on the approved students' sequences; the result is written to
# "resultadoAprovados.txt" and then loaded into a DataFrame.
# NOTE(review): `arguments` is presumably [minimum support, maximum pattern
# length, show sequence ids] — confirm against the SPMF PrefixSpan docs.
spmf = Spmf("PrefixSpan", input_direct=sequencia_aprovados, output_filename="resultadoAprovados.txt", arguments=[0.4, 5, True])
spmf.run()
df_aprovados = spmf.to_pandas_dataframe()
df_aprovados

In [None]:
def get_support(df, pattern):
    """Return the 'sup' value of the row whose pattern equals ``pattern``.

    Args:
        df: DataFrame with a 'pattern' column (lists of strings) and a 'sup'
            column.
        pattern: List of strings to look up.

    Returns:
        The 'sup' of the first matching row, or 0 when the pattern is absent.
    """
    pattern_str = ' '.join(pattern)
    match = df[df['pattern'].apply(lambda x: ' '.join(x) == pattern_str)]
    if not match.empty:
        return match['sup'].values[0]
    return 0

# NOTE(review): 'sup' is assumed to be an absolute count of supporting
# sequences (SPMF "#SUP"). Lift compares relative frequencies, so it needs the
# number of sequences that were mined.
total_sequencias = len(sequencia_aprovados)

rules = []
for _, row in df_aprovados.iterrows():
    pattern = row['pattern']
    support_AB = row['sup']
    # Every split point of the pattern yields one candidate rule A -> B.
    for i in range(1, len(pattern)):
        A = pattern[:i]
        B = pattern[i:]
        support_A = get_support(df_aprovados, A)
        support_B = get_support(df_aprovados, B)

        confidence = support_AB / support_A if support_A > 0 else 0
        # lift = P(AB) / (P(A) * P(B)) = sup_AB * N / (sup_A * sup_B).
        # Dividing raw counts without N can never exceed 1.
        lift = (
            support_AB * total_sequencias / (support_A * support_B)
            if support_A > 0 and support_B > 0
            else 0
        )

        rules.append({
            'rule': (A, B),
            'support': support_AB,
            'confidence': confidence,
            'lift': lift
        })

# Display only the rules whose antecedent and consequent are positively
# associated (lift above 1).
for rule in rules:
    if rule['lift'] > 1:
        print(f"Rule: {rule['rule']}, Support: {rule['support']:.2f}, Confidence: {rule['confidence']:.2f}, Lift: {rule['lift']:.2f}")

In [None]:
# Print every mined rule, including those with lift <= 1 that the filtered
# listing in the previous cell leaves out.
for rule in rules:    
    print(f"Rule: {rule['rule']}, Support: {rule['support']:.2f}, Confidence: {rule['confidence']:.2f}, Lift: {rule['lift']:.2f}")

In [None]:
# Read the pattern file written by the PrefixSpan cell. The same relative name
# as its output_filename is used, so the cell does not depend on an absolute
# path that exists on one machine only.
resultado = "resultadoAprovados.txt"

# item -> item that follows it in a pattern. A later pattern overwrites an
# earlier entry for the same item.
mapeamento_de_modulos_aprovados = {}
legendas = []
with open(resultado, 'r', encoding='utf-8') as file:
    for linha in file:
        if not linha.strip():
            continue

        item_anterior = None
        # split() without arguments also discards the trailing newline.
        for item in linha.split():
            if item == "-1":
                # Itemset separator: skipped without resetting the previous
                # item, so items of consecutive itemsets are linked too.
                continue
            if item == "-2" or item.startswith('#'):
                # End of the pattern; the rest of the line is metadata.
                break
            if item_anterior is not None:
                mapeamento_de_modulos_aprovados[item_anterior] = item
            item_anterior = item

# Invert the mapping: item -> list of items that precede it.
reversed_dict = defaultdict(list)

for key, value in mapeamento_de_modulos_aprovados.items():
    reversed_dict[value].append(key)

print(mapeamento_de_modulos_aprovados)
print(reversed_dict)

In [None]:
# Draw the module relations of the approved students as a layered graph.
mapeamento_de_modulos = arvore_dict = reversed_dict

# Each dictionary key gets one edge to every item in its list.
G = nx.DiGraph()
G.add_edges_from(
    (origem, destino)
    for origem, destinos in mapeamento_de_modulos.items()
    for destino in destinos
)

# Node coordinates computed by Graphviz "dot".
pos = graphviz_layout(G, prog='dot')

# One grey line trace per edge, with an intermediate point placed `radius`
# units before the target node.
radius = 25
tracos_arestas = []
for origem, destino in G.edges():
    x0, y0 = pos[origem]
    x1, y1 = pos[destino]
    dx, dy = x1 - x0, y1 - y0
    length = (dx**2 + dy**2)**0.5
    offset = radius / (length + 0.01)  # +0.01 avoids a division by zero
    tracos_arestas.append(go.Scatter(
        x=[x0, x0 + (dx - offset * dx), x1],
        y=[y0, y0 + (dy - offset * dy), y1],
        mode='lines',
        line=dict(color='rgb(210,210,210)', width=1),
        hoverinfo='none',
        showlegend=False,
    ))

# One marker trace per node; the legend entry reads "<id>: <module name>".
estilo_marcador = dict(
    symbol='circle-dot',
    size=25,
    color='#6175c1',
    line=dict(color='rgb(50,50,50)', width=1),
)
tracos_nos = []
for node in G.nodes():
    x, y = pos[node]
    value = node + ': '+ dicionario_modulos[int(node)]
    print(value)
    tracos_nos.append(go.Scatter(
        x=[x],
        y=[y],
        mode='markers+text',
        name=value,
        marker=estilo_marcador,
        text=[node],
        hoverinfo='name',
        textposition='middle center',
        opacity=0.8,
        textfont=dict(color='white', size=12),
    ))

fig = go.Figure(data=tracos_arestas + tracos_nos)

# Hide both axes: only the graph itself is meaningful.
axis = dict(showline=False, zeroline=False, showgrid=False, showticklabels=False)

fig.update_layout(
    title='Sequência de Módulos Acessados por Alunos Aprovados',
    font_size=12,
    showlegend=True,
    xaxis=axis,
    yaxis=axis,
    margin=dict(l=40, r=40, b=85, t=100),
    hovermode='closest',
    plot_bgcolor='rgb(248,248,248)',
    hoverlabel=dict(
        bgcolor="white",
        font_size=16,
        font_family="Arial",
        namelength=-1,
    ),
)

fig.update_traces(textposition='middle center')

fig.show()


### Teste Reprovados

In [None]:
# Run PrefixSpan on the failed students' sequences; the result is written to
# "resultadoReprovados.txt" and then loaded into a DataFrame.
# NOTE(review): `arguments` is presumably [minimum support, maximum pattern
# length, show sequence ids] — confirm against the SPMF PrefixSpan docs.
spmf = Spmf("PrefixSpan", input_direct=sequencia_reprovados, output_filename="resultadoReprovados.txt", arguments=[0.4, 5, True])
spmf.run()

df_reprovados = spmf.to_pandas_dataframe()

In [None]:
# get_support(df, pattern) is defined in the approved-students rule cell and
# reused here instead of being defined a second time.

# NOTE(review): 'sup' is assumed to be an absolute count of supporting
# sequences (SPMF "#SUP"). Lift compares relative frequencies, so it needs the
# number of sequences that were mined.
total_sequencias = len(sequencia_reprovados)

rules = []
for _, row in df_reprovados.iterrows():
    pattern = row['pattern']
    support_AB = row['sup']
    # Every split point of the pattern yields one candidate rule A -> B.
    for i in range(1, len(pattern)):
        A = pattern[:i]
        B = pattern[i:]
        support_A = get_support(df_reprovados, A)
        support_B = get_support(df_reprovados, B)

        confidence = support_AB / support_A if support_A > 0 else 0
        # lift = P(AB) / (P(A) * P(B)) = sup_AB * N / (sup_A * sup_B).
        # Dividing raw counts without N can never exceed 1.
        lift = (
            support_AB * total_sequencias / (support_A * support_B)
            if support_A > 0 and support_B > 0
            else 0
        )

        rules.append({
            'rule': (A, B),
            'support': support_AB,
            'confidence': confidence,
            'lift': lift
        })

# Display only the rules whose antecedent and consequent are positively
# associated (lift above 1).
for rule in rules:
    if rule['lift'] > 1:
        print(f"Rule: {rule['rule']}, Support: {rule['support']:.2f}, Confidence: {rule['confidence']:.2f}, Lift: {rule['lift']:.2f}")

In [None]:
# Print every mined rule regardless of lift. Confidence is included so the
# output has the same fields as the approved-students listing.
for rule in rules:
    print(f"Rule: {rule['rule']}, Support: {rule['support']:.2f}, Confidence: {rule['confidence']:.2f}, Lift: {rule['lift']:.2f}")

In [None]:
# Read the pattern file written by the PrefixSpan cell. The same relative name
# as its output_filename is used, so the cell does not depend on an absolute
# path that exists on one machine only.
resultado = "resultadoReprovados.txt"

# item -> item that follows it in a pattern. A later pattern overwrites an
# earlier entry for the same item.
mapeamento_de_modulos_reprovados = {}

with open(resultado, 'r', encoding='utf-8') as file:
    for linha in file:
        if not linha.strip():
            continue

        item_anterior = None
        # split() without arguments also discards the trailing newline.
        for item in linha.split():
            if item == "-1":
                # Itemset separator: skipped without resetting the previous
                # item, so items of consecutive itemsets are linked too.
                continue
            if item == "-2" or item.startswith('#'):
                # End of the pattern; the rest of the line is metadata.
                break
            if item_anterior is not None:
                mapeamento_de_modulos_reprovados[item_anterior] = item
            item_anterior = item

# Invert the mapping: item -> list of items that precede it.
reversed_dict = defaultdict(list)

for key, value in mapeamento_de_modulos_reprovados.items():
    reversed_dict[value].append(key)

print(mapeamento_de_modulos_reprovados)
print(reversed_dict)

In [None]:
# Draw the module relations of the failed students as a layered graph.
mapeamento_de_modulos = arvore_dict = reversed_dict

# Each dictionary key gets one edge to every item in its list.
G = nx.DiGraph()
G.add_edges_from(
    (origem, destino)
    for origem, destinos in mapeamento_de_modulos.items()
    for destino in destinos
)

# Node coordinates computed by Graphviz "dot".
pos = graphviz_layout(G, prog='dot')

# One grey line trace per edge, with an intermediate point placed `radius`
# units before the target node.
radius = 25
tracos_arestas = []
for origem, destino in G.edges():
    x0, y0 = pos[origem]
    x1, y1 = pos[destino]
    dx, dy = x1 - x0, y1 - y0
    length = (dx**2 + dy**2)**0.5
    offset = radius / (length + 0.01)  # +0.01 avoids a division by zero
    tracos_arestas.append(go.Scatter(
        x=[x0, x0 + (dx - offset * dx), x1],
        y=[y0, y0 + (dy - offset * dy), y1],
        mode='lines',
        line=dict(color='rgb(210,210,210)', width=1),
        hoverinfo='none',
        showlegend=False,
    ))

# One marker trace per node; the legend entry reads "<id>: <module name>".
estilo_marcador = dict(
    symbol='circle-dot',
    size=25,
    color='#6175c1',
    line=dict(color='rgb(50,50,50)', width=1),
)
tracos_nos = []
for node in G.nodes():
    x, y = pos[node]
    value = node + ': '+ dicionario_modulos[int(node)]
    print(value)
    tracos_nos.append(go.Scatter(
        x=[x],
        y=[y],
        mode='markers+text',
        name=value,
        marker=estilo_marcador,
        text=[node],
        hoverinfo='name',
        textposition='middle center',
        opacity=0.8,
        textfont=dict(color='white', size=12),
    ))

fig = go.Figure(data=tracos_arestas + tracos_nos)

# Hide both axes: only the graph itself is meaningful.
axis = dict(showline=False, zeroline=False, showgrid=False, showticklabels=False)

fig.update_layout(
    title='Sequência de Módulos Acessados por Alunos Reprovados',
    font_size=12,
    showlegend=True,
    xaxis=axis,
    yaxis=axis,
    margin=dict(l=40, r=40, b=85, t=100),
    hovermode='closest',
    plot_bgcolor='rgb(248,248,248)',
    hoverlabel=dict(
        bgcolor="white",
        font_size=16,
        font_family="Arial",
        namelength=-1,
    ),
)

fig.update_traces(textposition='middle center')

fig.show()
