# UMBC Academic Graph — Synthetic Data EDA (from local generator)

This notebook walks **cell-by-cell** through:
1. Importing the local generator `generate_synthetic_data.py`
2. Generating data (dict **and** CSVs)
3. Loading CSVs into pandas
4. Basic analysis / quick visuals (students, courses, grades)
5. A tiny NetworkX projection (courses + prerequisites) for graph stats

Run the cells in order. Charts use plain **matplotlib**.

In [None]:
# --- Setup (run once) ---
%pip install -q pandas numpy matplotlib networkx
import os, sys, json, importlib.util, pathlib
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import networkx as nx
plt.rcParams['figure.figsize'] = (9,6)
pd.set_option('display.max_columns', 120)

# Point to the generator script (adjust if you moved it)
GEN_PATH = pathlib.Path('../generate_synthetic_data.py')
assert GEN_PATH.exists(), f"Generator not found at {GEN_PATH}"

spec = importlib.util.spec_from_file_location("synthgen", str(GEN_PATH))
synthgen = importlib.util.module_from_spec(spec)
spec.loader.exec_module(synthgen)
print("Loaded:", synthgen.__name__)

## 1) Generate synthetic data
We call the generator's functions to build the in-memory dataset, then export CSVs for easy loading/EDA.

In [None]:
# Run each generator step from the script. The call order is kept exactly as
# written: later steps consume earlier outputs, and the generator may share
# random state between calls (assumption — confirm in the script).
terms = synthgen.generate_terms()
students = synthgen.generate_students()
faculty, faculty_by_dept = synthgen.generate_faculty()
courses, dept_courses = synthgen.generate_courses(faculty_by_dept)
degrees, requirement_groups = synthgen.generate_degrees(dept_courses)
prereqs = synthgen.generate_prerequisites(courses)
leads_to = synthgen.generate_leads_to_relationships(courses, prereqs)
sim_content, sim_difficulty = synthgen.generate_course_similarity(courses)
student_degree = synthgen.generate_student_degree_relationships(students, degrees)
teaching = synthgen.generate_teaching_relationships(faculty_by_dept, courses)
completed, enrolled = synthgen.generate_student_course_history(students, courses, terms, prereqs)
textbooks, course_textbooks = synthgen.generate_textbooks(courses)
interactions, page_views = synthgen.generate_textbook_interactions(
    students, courses, textbooks, course_textbooks, terms, completed
)
learning_style_sim, performance_sim = synthgen.generate_student_similarity(students, completed)

# Bundle the tables under the key names the notebook validates below
# (intended to match what export_to_csv looks up).
data = dict(
    terms=terms,
    students=students,
    faculty=faculty,
    courses=courses,
    degrees=degrees,
    requirement_groups=requirement_groups,
    prerequisites=prereqs,
    leads_to=leads_to,
    similarity_content=sim_content,
    similarity_difficulty=sim_difficulty,
    student_degree=student_degree,
    teaching=teaching,
    completed_courses=completed,
    enrolled_courses=enrolled,
    textbooks=textbooks,
    course_textbooks=course_textbooks,
    textbook_interactions=interactions,  # local name differs from the key
    page_views=page_views,
    learning_style_similarity=learning_style_sim,
    performance_similarity=performance_sim,
)

# Early sanity check: fail here, with a readable message, if a key is absent.
required_keys = set(
    "terms students faculty courses degrees requirement_groups "
    "prerequisites leads_to similarity_content similarity_difficulty "
    "student_degree teaching completed_courses enrolled_courses "
    "textbooks course_textbooks textbook_interactions page_views "
    "learning_style_similarity performance_similarity".split()
)
missing = sorted(key for key in required_keys if key not in data)
assert not missing, f"Missing keys for export_to_csv: {missing}"


In [None]:
# Hand the bundled tables to the generator's exporter, then show the
# directory listing (sorted) as the cell output.
OUT_DIR = "umbc_data"
synthgen.export_to_csv(data, OUT_DIR)
sorted(entry for entry in os.listdir(OUT_DIR))

## 2) Load CSVs to pandas
This mirrors how you might inspect data before pushing into Neo4j or ML feature pipelines.

In [None]:
# --- Load the exported CSVs ---
# pandas and os are installed/imported once in the setup cell, so this cell
# does not repeat `%pip install` or the imports.
OUT_DIR = "umbc_data"  # same directory the export cell wrote to

def load_df(name, required=True):
    """Read the CSV file `name` from OUT_DIR into a DataFrame.

    Parameters
    ----------
    name : str
        File name inside OUT_DIR, e.g. 'students.csv'.
    required : bool, default True
        When False and the file does not exist, return None instead of
        letting `pd.read_csv` raise.

    Returns
    -------
    pandas.DataFrame or None
    """
    path = os.path.join(OUT_DIR, name)
    if not required and not os.path.exists(path):
        return None
    return pd.read_csv(path)

students_df = load_df('students.csv')
courses_df = load_df('courses.csv')
degrees_df = load_df('degrees.csv')
terms_df = load_df('terms.csv')
faculty_df = load_df('faculty.csv')
req_df = load_df('requirement_groups.csv')
prereq_df = load_df('prerequisites.csv')
leads_df = load_df('leads_to.csv')
simc_df = load_df('course_similarity_content.csv')
simd_df = load_df('course_similarity_difficulty.csv')
stu_deg_df = load_df('student_degree.csv')
# teaching.csv is treated as optional: teach_df is None when it was not exported.
teach_df = load_df('teaching.csv', required=False)
completed_df = load_df('completed_courses.csv')
enrolled_df = load_df('enrolled_courses.csv')

print(students_df.shape, courses_df.shape, completed_df.shape)  # sanity check
students_df.head(3)

## 3) Quick distributions & sanity plots
* Students by learning style and instruction preference
* Course level/difficulty/time
* Grade distribution (from `completed_courses.csv`)

In [None]:
def _bar_of_counts(series, title):
    """Draw a bar chart of value counts for `series` with the given title."""
    axes = series.value_counts().plot.bar()
    axes.set_title(title)
    axes.set_xlabel('')
    plt.show()

# Student attributes: learning style, then instruction mode preference
_bar_of_counts(students_df['learningStyle'], 'Learning Style (count)')
_bar_of_counts(students_df['preferredInstructionMode'], 'Preferred Instruction Mode')

# Course level histogram (column may carry a ':int' type suffix)
level_col = 'level:int' if 'level:int' in courses_df.columns else 'level'
level_axes = courses_df[level_col].plot.hist(bins=[100,200,300,400,500])
level_axes.set_title('Course Levels')
plt.show()

# Difficulty against time commitment, one point per course
xcol = 'avgDifficulty:float' if 'avgDifficulty:float' in courses_df.columns else 'avgDifficulty'
ycol = 'avgTimeCommitment:int' if 'avgTimeCommitment:int' in courses_df.columns else 'avgTimeCommitment'
scatter_axes = courses_df.plot.scatter(x=xcol, y=ycol)
scatter_axes.set_title('Difficulty vs Time Commitment')
plt.show()

# Letter-grade counts from the completed-course records
_bar_of_counts(completed_df['grade'], 'Grade Distribution')

## 4) Simple derived metrics
Map letter grades to grade points, compute each student's mean GPA, and join with learning style to spot patterns.

In [None]:
# --- Per-student GPA and simple comparisons ---
# Grade points on a 4.0 scale. "W" (withdrawal) is deliberately not mapped:
# a withdrawal carries no grade points, so it must not lower GPA like an "F".
grade_map = {"A":4.0,"A-":3.7,"B+":3.3,"B":3.0,"B-":2.7,"C+":2.3,"C":2.0,"C-":1.7,
             "D+":1.3,"D":1.0,"D-":0.7,"F":0.0}

# Unmapped grades (W or anything unexpected) become NaN, which mean() skips,
# instead of being silently counted as 0.0.
completed_df['gpa'] = completed_df['grade'].map(grade_map)
stu_gpa = (
    completed_df.groupby('studentId')
    .agg(gpa_mean=('gpa','mean'), n_courses=('courseId','count'))
    .reset_index()
)

sid_col = 'id:ID(Student)' if 'id:ID(Student)' in students_df.columns else 'id'
merged = students_df.merge(stu_gpa, left_on=sid_col, right_on='studentId', how='left')
# Students with no completed-course rows have taken 0 courses; their gpa_mean
# stays NaN (unknown) rather than 0, so they do not bias the group means below.
merged['n_courses'] = merged['n_courses'].fillna(0).astype(int)

# Mean GPA per learning style (NaN GPAs are ignored by mean()).
merged[['learningStyle','gpa_mean']].groupby('learningStyle').mean().plot(kind='bar')
plt.title('Mean GPA by Learning Style'); plt.xlabel(''); plt.show()

# GPA against preferred course load, only for students with a known GPA.
load_col = 'preferredCourseLoad:int' if 'preferredCourseLoad:int' in merged.columns else 'preferredCourseLoad'
merged.dropna(subset=['gpa_mean']).plot.scatter(x=load_col, y='gpa_mean')
plt.title('GPA vs Preferred Course Load'); plt.show()

## 5) Tiny course graph (NetworkX)
Use prerequisites to make a directed course graph, compute centralities, and peek at a subgraph plot.

In [None]:
# Build directed graph from prerequisites: one node per course, one edge per
# prerequisite row (start id -> end id), tagged with type='PREREQUISITE_FOR'.
G = nx.DiGraph()
cid_col = 'id:ID(Course)' if 'id:ID(Course)' in courses_df.columns else 'id'
G.add_nodes_from(courses_df[cid_col].tolist())
scol = ':START_ID(Course)'
tcol = ':END_ID(Course)'
# Bulk-add the edges; .tolist() yields plain Python values, matching how the
# node ids were added above.
G.add_edges_from(
    zip(prereq_df[scol].tolist(), prereq_df[tcol].tolist()),
    type='PREREQUISITE_FOR',
)

# nx.info() was removed in NetworkX 3.0, so print the summary directly.
print(f"{type(G).__name__} with {G.number_of_nodes()} nodes and {G.number_of_edges()} edges")

# Centrality (basic)
deg_c = nx.degree_centrality(G)
# PageRank is skipped on an edgeless graph; every node then gets 0.
pr_c = nx.pagerank(G) if G.number_of_edges() > 0 else {n:0 for n in G.nodes}
cen_df = pd.DataFrame({
    'courseId': list(deg_c.keys()),
    'deg': list(deg_c.values()),
    'pr': [pr_c.get(n,0) for n in deg_c.keys()],
}).sort_values('pr', ascending=False)
cen_df.head(10)

In [None]:
# Plot the subgraph induced by the 20 courses that rank highest in cen_df
# (cen_df is already sorted by PageRank, descending).
top_nodes = cen_df['courseId'].head(20).tolist()
H = G.subgraph(top_nodes).copy()
# Fixed seed keeps the spring layout reproducible across runs.
pos = nx.spring_layout(H, seed=0)
draw_options = {'with_labels': False, 'node_size': 120}
nx.draw(H, pos, **draw_options)
plt.title('Top-PR Course Subgraph')
plt.show()

## 6) Course outcomes by difficulty/time
Join course metadata to completions and see how pass rate varies with difficulty/time.

In [None]:
# --- Pass rate by course difficulty and time-commitment bin ---
# Column names may or may not carry a Neo4j-style type suffix; pick whichever
# spelling is present in courses_df.
courses_df_columns = courses_df.columns
avgdiff_col = 'avgDifficulty:float' if 'avgDifficulty:float' in courses_df_columns else 'avgDifficulty'
avgtime_col = 'avgTimeCommitment:int' if 'avgTimeCommitment:int' in courses_df_columns else 'avgTimeCommitment'
cid = 'id:ID(Course)' if 'id:ID(Course)' in courses_df_columns else 'id'

# Attach each completion's course difficulty and time commitment.
tmp = completed_df.merge(courses_df[[cid, avgdiff_col, avgtime_col]], left_on='courseId', right_on=cid, how='left')

# Any letter grade from A down to D- counts as a pass; everything else
# (F, W, missing, unexpected values) counts as not passed.
passing_grades = {'A','A-','B+','B','B-','C+','C','C-','D+','D','D-'}
tmp['passed'] = tmp['grade'].isin(passing_grades).astype(int)

# Bin the time commitment; the binned Series keeps the name avgtime_col, so
# that is the column name after reset_index().
time_bin = pd.cut(tmp[avgtime_col], bins=[0,5,10,15,20,100])
# observed=False is spelled out: it keeps every time bin in the result on all
# pandas versions and avoids the FutureWarning about the changing default.
agg = tmp.groupby([avgdiff_col, time_bin], observed=False)['passed'].mean().reset_index()
agg_pivot = agg.pivot(index=avgdiff_col, columns=avgtime_col, values='passed')

# Combinations with no completions stay NaN (no bar) rather than being drawn
# as a misleading 0% pass rate.
agg_pivot.plot(kind='bar'); plt.title('Pass Rate by Difficulty and Time Bin'); plt.xlabel('avgDifficulty'); plt.ylabel('Pass rate'); plt.show()

## 7) Where to take this next
- Export this feature table for ML (e.g., pass prediction)
- Build Node2Vec/GraphSAGE embeddings on the course graph and join back
- Add fairness slices (aid status, instruction mode)
- Link prediction sketch using pairs from `enrolled_courses.csv` vs historical completions