In [1]:
#import packages
import pandas as pd
import numpy as np
import random
import re
import string
#import requests
#import plotnine 
#from plotnine import *
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats

#may need to pip install first
#!pip install netwulf
from netwulf import visualize
import netwulf as nw
import networkx as nx


# Data Prep

## Load in Data

In [2]:
# Locations of the four survey exports, relative to the notebook
pre_file_path = '../code/data/Med student pre assessment 2.8.23.csv'
foundational_file_path = '../code/data/Foundational Post Assessment 2.8.23.csv'
intermediate_file_path = '../code/data/intermediate post assessment 2.8.23.csv'
advanced_file_path = '../code/data/Advanced post assessment 2.8.23.csv'


def _is_extra_header_row(row_number):
    """Return True for file rows 1 and 2, which read_csv is asked to skip."""
    return row_number in [1, 2]


# Read every export into its own DataFrame.
# NOTE(review): only the three post-assessment files skip rows 1-2; the
# pre-assessment file is read as-is. Presumably its layout differs -- confirm.
pre_data = pd.read_csv(pre_file_path)
post_data_foundational = pd.read_csv(foundational_file_path, skiprows=_is_extra_header_row)
post_data_intermediate = pd.read_csv(intermediate_file_path, skiprows=_is_extra_header_row)
post_data_advanced = pd.read_csv(advanced_file_path, skiprows=_is_extra_header_row)


## Merge Data

In [3]:
# Left-join each post-assessment frame to the pre-assessment frame on IPAddress.
# The post frame is the left operand, so columns present in both frames come
# out as <name>_x (post) and <name>_y (pre); e.g. SC0_x = post score, SC0_y = pre score.
# NOTE(review): IPAddress is assumed to identify one respondent per file; repeated
# addresses in pre_data would duplicate post rows -- confirm against the data.
joined_data_foundational, joined_data_intermediate, joined_data_advanced = (
    pd.merge(post_frame, pre_data, on="IPAddress", how="left")
    for post_frame in (post_data_foundational, post_data_intermediate, post_data_advanced)
)


## Rename columns

In [4]:
# After the merge, SC0_y holds the pre score and SC0_x the post score;
# give them readable names and keep only rows where both scores are present.
pre_post_names = {'SC0_y': 'Pre', 'SC0_x': 'Post'}

foundational_data = (
    joined_data_foundational
    .rename(columns=pre_post_names)
    .dropna(subset=['Pre', 'Post'])
)

# Same treatment for the other two cohorts
intermediate_data = (
    joined_data_intermediate
    .rename(columns=pre_post_names)
    .dropna(subset=['Pre', 'Post'])
)

advanced_data = (
    joined_data_advanced
    .rename(columns=pre_post_names)
    .dropna(subset=['Pre', 'Post'])
)

## Data prep for Network Visualization:

In [5]:
# Tag each post-assessment frame with an indicator column named after its
# cohort (value 1), so the cohort can be recovered once the frames are stacked.
for cohort_frame, cohort_flag in (
    (post_data_advanced, 'advanced'),
    (post_data_intermediate, 'intermediate'),
    (post_data_foundational, 'foundational'),
):
    cohort_frame[cohort_flag] = 1

In [6]:
# Stack the three cohort frames into one, renumbering the rows from 0
combined_df = pd.concat(
    [post_data_advanced, post_data_intermediate, post_data_foundational],
    ignore_index=True,
)

# Each indicator column only exists in its own cohort's frame, so after the
# concat it is NaN for rows from the other two cohorts; turn those NaNs into 0.
for indicator in ('advanced', 'intermediate', 'foundational'):
    combined_df[indicator] = combined_df[indicator].fillna(0)

In [7]:
# Split the SC0 scores into four equal-frequency bins; labels=False stores the
# bin index rather than an interval label.
combined_df['quartile'] = pd.qcut(combined_df['SC0'], q=4, labels=False)

# Cohort names, listed in the order np.select checks them (first match wins)
choices = ['advanced', 'intermediate', 'foundational']

# One boolean mask per cohort: True where that cohort's indicator column is 1
conditions = [combined_df[cohort_name] == 1 for cohort_name in choices]

# Label every row with its cohort; rows matching no mask get 'unknown'
combined_df['cohort'] = np.select(conditions, choices, default='unknown')


## Code for Network Visualization

In [8]:
# Creating the graph
G = nx.Graph()

# Pass 1: one node per row of combined_df. The node id is the row index followed
# by "Q" and the quartile value; quartile and cohort are kept as node attributes.
for idx, row in combined_df.iterrows():
    G.add_node(str(idx) + "Q" + str(row['quartile']), quartile=row['quartile'], cohort=row['cohort'])

# Pass 2: bucket node ids by quartile. Nodes with a missing (NaN) quartile are
# left unconnected, which is what a plain equality test gives (NaN != NaN).
nodes_by_quartile = {}
for node_id, attrs in G.nodes(data=True):
    if pd.isna(attrs['quartile']):
        continue
    nodes_by_quartile.setdefault(attrs['quartile'], []).append(node_id)

# Pass 3: connect every pair of nodes that share a quartile. Doing this once,
# per bucket, replaces re-scanning all node pairs after every inserted row
# (cubic in the number of rows) while producing the same set of edges.
for members in nodes_by_quartile.values():
    for position, first_node in enumerate(members):
        for second_node in members[position + 1:]:
            G.add_edge(first_node, second_node)

# Hex colour used for each cohort
color_map = dict(
    advanced='#8ac926',
    intermediate='#ffca3a',
    foundational='#ff595e',
)

# Look up every node's cohort colour and store it as the node's 'color' attribute.
# A cohort that is missing from color_map raises KeyError here.
node_colors = {}
for node_id, attrs in G.nodes(data=True):
    node_colors[node_id] = color_map[attrs['cohort']]
nx.set_node_attributes(G, node_colors, 'color')

#visualize the network
# Force-directed layout; the fixed seed makes node positions reproducible
# from run to run (without it every execution draws a different picture).
pos = nx.spring_layout(G, k=0.25, iterations=20, seed=42)
plt.figure(figsize=(10, 10))
nx.draw_networkx(G, pos, node_color=[data['color'] for _, data in G.nodes(data=True)], with_labels=False, node_size=50, edge_color="grey")

#create a legend for the cohorts
# The network drawing carries no labelled artists, so empty scatter plots are
# used as legend entries: they draw nothing but register a label and colour.
legend_labels = {"Advanced": "#8ac926", "Intermediate": "#ffca3a", "Foundational": "#ff595e"}
for label, color in legend_labels.items():
    plt.scatter([], [], color=color, label=label)

#add labels and titles
plt.title('Network Visualization')
plt.legend(title="Cohort", loc="upper left")
# Render the finished figure. Closing it at this point (without showing or
# saving) would throw away everything drawn above.
plt.show()


In [9]:
# Launch the interactive Netwulf view of G on port 2000; the call hands back the
# styled network and the configuration that was in effect.
# NOTE(review): confirm that 'node_color' is a configuration key Netwulf
# recognises and that it makes nodes use their 'color' attribute -- TODO verify
# against the Netwulf documentation.
netwulf_config = {'node_color': 'color'}
stylized_network, config = visualize(G, config=netwulf_config, port=2000)

In [None]:
#code to save plot
# Force-directed layout; the fixed seed keeps the saved image reproducible.
pos = nx.spring_layout(G, k=0.25, iterations=20, seed=42)
plt.figure(figsize=(10, 10))
nx.draw_networkx(G, pos, node_color=[data['color'] for _, data in G.nodes(data=True)], with_labels=False, node_size=50, edge_color="grey")

# This is a brand-new figure, so the cohort legend entries have to be created
# again: without labelled artists plt.legend() would produce an empty legend.
# Empty scatter plots draw nothing but register a label and colour.
legend_labels = {"Advanced": "#8ac926", "Intermediate": "#ffca3a", "Foundational": "#ff595e"}
for label, color in legend_labels.items():
    plt.scatter([], [], color=color, label=label)

plt.title('Network Visualization')
plt.legend(title="Cohort", loc="upper left")
plt.savefig('network_visualization.png')
# Release the figure once it is on disk so it is not also rendered inline
plt.close()