### 새로운 핵심키워드_v2 기존 데이터와 합치기

In [2]:
import pandas as pd

# 성취기준 데이터 로드
achievement_data = pd.read_excel('merged_final_data.xlsx')  # 성취기준 데이터 파일
keywords_data = pd.read_excel('성취기준.xlsx')  # 핵심키워드_v2 파일

# 성취기준코드를 기준으로 병합
merged_data = pd.merge(achievement_data, keywords_data, on='성취기준코드', how='left')


# 결과 저장
merged_data.to_excel('merged_final_data_new.xlsx', index=False)


### 엑셀파일 => csv 변환

In [7]:
import pandas as pd

# 엑셀 파일 경로
excel_file = 'merged_final_data_new - 복사본.xlsx'  # 엑셀 파일 이름 또는 경로
csv_file = 'merged_final_data_new - 복사본.csv'  # 변환할 CSV 파일 이름

# 엑셀 파일 읽기
df = pd.read_excel(excel_file)

# CSV 파일로 저장
df.to_csv(csv_file, index=False, encoding='utf-8-sig')

print(f"엑셀 파일이 '{csv_file}'로 성공적으로 변환되었습니다!")


엑셀 파일이 'merged_final_data_new - 복사본.csv'로 성공적으로 변환되었습니다!


In [48]:
import pandas as pd
import networkx as nx
from pyvis.network import Network
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import TfidfVectorizer

In [50]:

# 1. 데이터 로드
file_name = "merged_final_data_new - 복사본.csv"  
data = pd.read_csv(file_name)

# 필수 컬럼 확인
required_columns = [
    "f_mchapter_nm", "f_mchapter_id", "성취기준 내용", "성취기준코드", "핵심키워드_v2",
    "성취수준 A", "성취수준 B", "성취수준 C", "f_schapter_id", "f_schapter_nm"
]
for col in required_columns:
    if col not in data.columns:
        raise ValueError(f"필수 컬럼 '{col}'이(가) 데이터에 없습니다.")

# 중복된 텍스트 제거
data = data.drop_duplicates(subset=["f_mchapter_nm", "성취기준 내용", "핵심키워드_v2"])

# 2. 데이터 결합 및 전처리
# 텍스트 정제
data["combined_text"] = (
    data["f_mchapter_nm"].str.lower().str.replace(r"[^a-z가-힣0-9\s]", "", regex=True) +
    " " +
    data["성취기준 내용"].str.lower().str.replace(r"[^a-z가-힣0-9\s]", "", regex=True) +
    " " +
    data["핵심키워드_v2"].str.lower().str.replace(r"[^a-z가-힣0-9\s]", "", regex=True)
)
data["tooltip"] = (
    "소단원명: " + data["f_schapter_nm"].fillna("") +
    "<br>성취수준 A: " + data["성취수준 A"].fillna("") +
    "<br>성취수준 B: " + data["성취수준 B"].fillna("") +
    "<br>성취수준 C: " + data["성취수준 C"].fillna("")
)

# 3. 유사도 계산
# Sentence-BERT 유사도
model = SentenceTransformer('all-MiniLM-L6-v2')
bert_embeddings = model.encode(data["combined_text"].tolist(), convert_to_tensor=True)
bert_sim = cosine_similarity(bert_embeddings.cpu(), bert_embeddings.cpu())

# TF-IDF 유사도
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(data["combined_text"])
tfidf_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

# 최종 유사도
final_sim = 0.8 * bert_sim + 0.2 * tfidf_sim

# 4. 그래프 생성
threshold = 0.8
graph = nx.DiGraph()

# 노드 추가
for _, row in data.iterrows():
    level_color = "red" if pd.notna(row["성취수준 A"]) else "orange" if pd.notna(row["성취수준 B"]) else "yellow"
    graph.add_node(
        str(row["f_mchapter_id"]),
        label=row["f_mchapter_nm"],
        tooltip=row["tooltip"],
        color=level_color,
        size=20 if pd.notna(row["성취수준 A"]) else 15 if pd.notna(row["성취수준 B"]) else 10
    )

# 소단원 노드 추가
for _, row in data.iterrows():
    if pd.notna(row["f_schapter_nm"]):
        graph.add_node(
            str(row["f_schapter_id"]),
            label=row["f_schapter_nm"],
            tooltip="소단원",
            color="lightblue",
            size=10
        )
        # 상위 단원과 연결
        graph.add_edge(
            str(row["f_mchapter_id"]),
            str(row["f_schapter_id"]),
            weight=1,
            title="소단원 연결"
        )

# 엣지 추가 (자기 자신 비교 제외)
for i in range(len(data)):
    for j in range(len(data)):
        if i != j and final_sim[i, j] >= threshold:
            graph.add_edge(
                str(data.iloc[i]["f_mchapter_id"]),
                str(data.iloc[j]["f_mchapter_id"]),
                weight=final_sim[i, j],
                title=f"유사도: {final_sim[i, j]:.2f}"
            )

# 5. Pyvis 시각화
net = Network(height="100vh", width="100vw", directed=True, bgcolor="#ffffff", font_color="#000000")
net.from_nx(graph)
net.set_options("""
var options = {
  "physics": {
    "barnesHut": {
      "gravitationalConstant": -3000,
      "centralGravity": 0.3,
      "springLength": 200,
      "springConstant": 0.05
    },
    "minVelocity": 0.1
  }
}
""")

# HTML 저장
output_file = "new_knowledge_graph_v2.html"
net.write_html(output_file)
print(f"HTML 파일이 '{output_file}'로 저장되었습니다.")


HTML 파일이 'new_knowledge_graph_v2.html'로 저장되었습니다.


### 유사도 기반 + 성취기준 기반 선후행 관계 포함

In [60]:
import pandas as pd
import networkx as nx
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
from pyvis.network import Network

# Load your data
file_path = "merged_final_data_new - 복사본.csv"  # Replace with your file path
data = pd.read_csv(file_path)

# Ensure required columns are present
required_columns = [
    "f_mchapter_nm", "f_mchapter_id", "성취기준코드", "성취기준 내용", 
    "성취수준 A", "성취수준 B", "성취수준 C", "f_schapter_id", 
    "f_schapter_nm", "핵심키워드_v2"
]
for col in required_columns:
    if col not in data.columns:
        raise ValueError(f"Required column '{col}' is missing from the data")

# Preprocess data
data = data.drop_duplicates(subset=["f_mchapter_nm", "성취기준 내용", "핵심키워드_v2"])
data["combined_text"] = (
    data["f_mchapter_nm"].str.lower().str.replace(r"[^a-z가-힣0-9\s]", "", regex=True) +
    " " +
    data["성취기준 내용"].str.lower().str.replace(r"[^a-z가-힣0-9\s]", "", regex=True) +
    " " +
    data["핵심키워드_v2"].str.lower().str.replace(r"[^a-z가-힣0-9\s]", "", regex=True)
)
data["tooltip"] = (
    "소단원명: " + data["f_schapter_nm"].fillna("") +
    "<br>성취수준 A: " + data["성취수준 A"].fillna("") +
    "<br>성취수준 B: " + data["성취수준 B"].fillna("") +
    "<br>성취수준 C: " + data["성취수준 C"].fillna("")
)

# Calculate similarities
model = SentenceTransformer('all-MiniLM-L6-v2')
embeddings = model.encode(data["combined_text"].tolist(), convert_to_tensor=True)
bert_sim = cosine_similarity(embeddings.cpu(), embeddings.cpu())

# Calculate TF-IDF similarity
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(data["combined_text"])
tfidf_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

# Combine similarities
final_sim = 0.8 * bert_sim + 0.2 * tfidf_sim
threshold = 0.75

# Create the graph
graph = nx.DiGraph()

# Add nodes for 성취기준코드 and 단원
for _, row in data.iterrows():
    graph.add_node(
        row["성취기준코드"],
        label=row["f_mchapter_nm"] + " - " + row["성취기준코드"],
        tooltip=row["tooltip"],
        color="red",
        size=20
    )

# Add predefined edges based on 성취기준 선후행 관계
predefined_edges = [
    ("2수01-01", "2수01-02"),
    ("2수01-02", "2수01-03"),
    ("2수01-03", "2수01-04"),
    ("2수01-05", "2수01-06"),
    ("2수01-06", "2수01-07"),
    ("2수01-07", "2수01-08"),
    ("2수01-08", "2수01-09"),
    ("2수01-10", "2수01-11"),
    ("2수02-01", "2수02-02"),
    ("2수03-01", "2수03-02"),
    ("2수03-03", "2수03-04"),
    ("2수03-04", "2수03-05"),
    ("2수03-06", "2수03-07"),
    ("2수03-07", "2수03-08"),
    ("2수03-08", "2수03-09"),
    ("2수03-10", "2수03-11"),
    ("2수03-11", "2수03-12"),
    ("2수03-12", "2수03-13"),
    ("2수04-01", "2수04-02"),
    ("2수04-02", "2수04-03")
]
for edge in predefined_edges:
    graph.add_edge(edge[0], edge[1], weight=1.0, title="선후행 관계")

# Add similarity-based edges (유사도 기반 연결)
for i in range(len(data)):
    for j in range(len(data)):
        if i != j and final_sim[i, j] >= threshold:
            source = data.iloc[i]["성취기준코드"]
            target = data.iloc[j]["성취기준코드"]
            if not graph.has_edge(source, target):
                graph.add_edge(
                    source,
                    target,
                    weight=final_sim[i, j],
                    title=f"유사도: {final_sim[i, j]:.2f}",
                    color="blue",
                    dash=True
                )

# Visualize with PyVis
net = Network(height="100vh", width="100vw", directed=True, bgcolor="#ffffff", font_color="#000000")
net.from_nx(graph)

# Customize physics for better layout
net.set_options("""
var options = {
  "physics": {
    "barnesHut": {
      "gravitationalConstant": -2000,
      "centralGravity": 0.4,
      "springLength": 200,
      "springConstant": 0.1
    },
    "minVelocity": 0.5
  }
}
""")

# Save visualization
output_file = "new_knowledge_graph_v3.html"
net.write_html(output_file)
print(f"Integrated graph saved to {output_file}")


Integrated graph saved to new_knowledge_graph_v3.html


In [None]:
import pandas as pd
import networkx as nx
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
from pyvis.network import Network

# Load your data
file_path = "merged_final_data_new - 복사본.csv"  # Replace with your file path
data = pd.read_csv(file_path)

# Ensure required columns are present
required_columns = [
    "f_mchapter_nm", "f_mchapter_id", "성취기준코드", "성취기준 내용", 
    "성취수준 A", "성취수준 B", "성취수준 C", "f_schapter_id", 
    "f_schapter_nm", "핵심키워드_v2"
]
for col in required_columns:
    if col not in data.columns:
        raise ValueError(f"Required column '{col}' is missing from the data")

# Preprocess data
data = data.drop_duplicates(subset=["f_mchapter_nm", "성취기준 내용", "핵심키워드_v2"])
data["combined_text"] = (
    data["f_mchapter_nm"].str.lower().str.replace(r"[^a-z가-힣0-9\s]", "", regex=True) +
    " " +
    data["성취기준 내용"].str.lower().str.replace(r"[^a-z가-힣0-9\s]", "", regex=True) +
    " " +
    data["핵심키워드_v2"].str.lower().str.replace(r"[^a-z가-힣0-9\s]", "", regex=True)
)
data["tooltip"] = (
    "소단원명: " + data["f_schapter_nm"].fillna("") +
    "<br>성취수준 A: " + data["성취수준 A"].fillna("") +
    "<br>성취수준 B: " + data["성취수준 B"].fillna("") +
    "<br>성취수준 C: " + data["성취수준 C"].fillna("")
)

# Calculate similarities
model = SentenceTransformer('all-MiniLM-L6-v2')
embeddings = model.encode(data["combined_text"].tolist(), convert_to_tensor=True)
bert_sim = cosine_similarity(embeddings.cpu(), embeddings.cpu())

# Calculate TF-IDF similarity
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(data["combined_text"])
tfidf_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

# Combine similarities
final_sim = 0.8 * bert_sim + 0.2 * tfidf_sim
threshold = 0.7

# Create the graph
graph = nx.DiGraph()

# Add nodes for 성취기준코드 and 단원
for _, row in data.iterrows():
    graph.add_node(
        row["성취기준코드"],
        label=row["f_mchapter_nm"] + " - " + row["성취기준코드"],
        tooltip=row["tooltip"],
        color="red",
        size=20
    )

# Add predefined edges based on 성취기준 선후행 관계
predefined_edges = [
    ("2수01-01", "2수01-02"),
    ("2수01-02", "2수01-03"),
    ("2수01-03", "2수01-04"),
    ("2수01-05", "2수01-06"),
    ("2수01-06", "2수01-07"),
    ("2수01-07", "2수01-08"),
    ("2수01-08", "2수01-09"),
    ("2수01-10", "2수01-11"),
    ("2수02-01", "2수02-02"),
    ("2수03-01", "2수03-02"),
    ("2수03-03", "2수03-04"),
    ("2수03-04", "2수03-05"),
    ("2수03-06", "2수03-07"),
    ("2수03-07", "2수03-08"),
    ("2수03-08", "2수03-09"),
    ("2수03-10", "2수03-11"),
    ("2수03-11", "2수03-12"),
    ("2수03-12", "2수03-13"),
    ("2수04-01", "2수04-02"),
    ("2수04-02", "2수04-03")
]
for edge in predefined_edges:
    graph.add_edge(edge[0], edge[1], weight=1.0, title="선후행 관계")

# Add similarity-based edges (유사도 기반 연결)
for i in range(len(data)):
    for j in range(len(data)):
        if i != j and final_sim[i, j] >= threshold:
            source = data.iloc[i]["성취기준코드"]
            target = data.iloc[j]["성취기준코드"]
            if not graph.has_edge(source, target):
                graph.add_edge(
                    source,
                    target,
                    weight=final_sim[i, j],
                    title=f"유사도: {final_sim[i, j]:.2f}",
                    color="blue",
                    dash=True
                )

# Visualize with PyVis
net = Network(height="100vh", width="100vw", directed=True, bgcolor="#ffffff", font_color="#000000")
net.from_nx(graph)

# Customize physics for better layout
net.set_options("""
var options = {
  "physics": {
    "barnesHut": {
      "gravitationalConstant": -2000,
      "centralGravity": 0.4,
      "springLength": 200,
      "springConstant": 0.1
    },
    "minVelocity": 0.5
  }
}
""")

# Save visualization
output_file = "new_knowledge_graph_v4.html"
net.write_html(output_file)
print(f"Integrated graph saved to {output_file}")


Integrated graph saved to new_knowledge_graph_v4.html


### v2+v4

In [30]:
import pandas as pd
import networkx as nx
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
from pyvis.network import Network

# Load your data
file_path = "merged_final_data_new - 복사본.csv"  # Replace with your file path
data = pd.read_csv(file_path)

# Ensure required columns are present
required_columns = [
    "f_mchapter_nm", "f_mchapter_id", "성취기준코드", "성취기준 내용", 
    "성취수준 A", "성취수준 B", "성취수준 C", "f_schapter_id", 
    "f_schapter_nm", "핵심키워드_v2"
]
for col in required_columns:
    if col not in data.columns:
        raise ValueError(f"Required column '{col}' is missing from the data")

# Preprocess data
data = data.drop_duplicates(subset=["f_mchapter_nm", "성취기준 내용", "핵심키워드_v2"])
data["combined_text"] = (
    data["f_mchapter_nm"].str.lower().str.replace(r"[^a-z가-힣0-9\s]", "", regex=True) +
    " " +
    data["성취기준 내용"].str.lower().str.replace(r"[^a-z가-힣0-9\s]", "", regex=True) +
    " " +
    data["핵심키워드_v2"].str.lower().str.replace(r"[^a-z가-힣0-9\s]", "", regex=True)
)
data["tooltip"] = (
    "소단원명: " + data["f_schapter_nm"].fillna("") +
    "<br>성취수준 A: " + data["성취수준 A"].fillna("") +
    "<br>성취수준 B: " + data["성취수준 B"].fillna("") +
    "<br>성취수준 C: " + data["성취수준 C"].fillna("")
)

# Calculate similarities
model = SentenceTransformer('all-MiniLM-L6-v2')
embeddings = model.encode(data["combined_text"].tolist(), convert_to_tensor=True)
bert_sim = cosine_similarity(embeddings.cpu(), embeddings.cpu()).astype('float64')  # Ensure float64

# Combine similarities
threshold = 0.85  # Threshold for similarity
graph = nx.DiGraph()

# Add nodes for 중단원
for _, row in data.iterrows():
    graph.add_node(
        row["성취기준코드"],
        label=row["f_mchapter_nm"] + " - " + row["성취기준코드"],
        tooltip=row["tooltip"],
        color="green",
        size=20
    )

# Add predefined edges based on 성취기준 선후행 관계
predefined_edges = [
    ("2수01-01", "2수01-02"),
    ("2수01-02", "2수01-03"),
    ("2수01-03", "2수01-04"),
    ("2수01-05", "2수01-06"),
    ("2수01-06", "2수01-07"),
    ("2수01-07", "2수01-08"),
    ("2수01-08", "2수01-09"),
    ("2수01-10", "2수01-11"),
    ("2수02-01", "2수02-02"),
    ("2수03-01", "2수03-02"),
    ("2수03-03", "2수03-04"),
    ("2수03-04", "2수03-05"),
    ("2수03-06", "2수03-07"),
    ("2수03-07", "2수03-08"),
    ("2수03-08", "2수03-09"),
    ("2수03-10", "2수03-11"),
    ("2수03-11", "2수03-12"),
    ("2수03-12", "2수03-13"),
    ("2수04-01", "2수04-02"),
    ("2수04-02", "2수04-03")
]
for edge in predefined_edges:
    graph.add_edge(edge[0], edge[1], weight=1.0, title="선후행 관계", color="blue")

# Add similarity-based edges
for i in range(len(data)):
    for j in range(i + 1, len(data)):  # Only forward connections allowed
        if bert_sim[i, j] >= threshold:
            source = data.iloc[i]["성취기준코드"]
            target = data.iloc[j]["성취기준코드"]
            graph.add_edge(
                source,
                target,
                weight=float(bert_sim[i, j]),  # Ensure float is JSON serializable
                title=f"유사도: {bert_sim[i, j]:.2f}",
                color="red"
            )

# Remove self-loops and duplicated edges
edges_to_remove = []
for u, v in graph.edges():
    if u == v:  # Remove self-loops
        edges_to_remove.append((u, v))
for edge in edges_to_remove:
    graph.remove_edge(*edge)

# Visualize with PyVis
net = Network(height="100vh", width="100vw", directed=True, bgcolor="#ffffff", font_color="#000000")
net.from_nx(graph)

# Customize physics for better layout
net.set_options("""
var options = {
  "physics": {
    "barnesHut": {
      "gravitationalConstant": -3000,
      "centralGravity": 0.3,
      "springLength": 150,
      "springConstant": 0.05
    },
    "minVelocity": 0.5
  },
  "nodes": {
    "shape": "dot",
    "size": 20
  },
  "edges": {
    "smooth": {
      "type": "continuous"
    }
  }
}
""")

# Save visualization
output_file = "new_knowledge_graph_v5.html"
net.write_html(output_file)
print(f"Integrated graph saved to {output_file}")


Integrated graph saved to new_knowledge_graph_v5.html


In [31]:
import pandas as pd
import networkx as nx
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
from pyvis.network import Network

# Load your data
file_path = "merged_final_data_new - 복사본.csv"  # Replace with your file path
data = pd.read_csv(file_path)

# Ensure required columns are present
required_columns = ["f_mchapter_nm", "성취기준코드", "성취기준 내용", "핵심키워드_v2"]
for col in required_columns:
    if col not in data.columns:
        raise ValueError(f"Required column '{col}' is missing from the data")

# Combine text for similarity calculation
data["combined_text"] = (
    data["성취기준 내용"].fillna("") + " " +
    data["핵심키워드_v2"].fillna("")
)

# Sort data by 성취기준코드
data = data.sort_values(by="성취기준코드").reset_index(drop=True)

# Perform similarity-based analysis
model = SentenceTransformer('all-MiniLM-L6-v2')
embeddings = model.encode(data["combined_text"].tolist(), convert_to_tensor=True)
similarity_matrix = cosine_similarity(embeddings.cpu(), embeddings.cpu())

# Generate 성취기준 흐름 기반 edges
flow_edges = []
for i in range(len(data) - 1):
    flow_edges.append((data.iloc[i]["성취기준코드"], data.iloc[i + 1]["성취기준코드"], "선후행 관계"))

# Generate 유사도 기반 edges
threshold = 0.75  # Set a high threshold for similarity
similarity_edges = []
for i in range(len(data)):
    for j in range(i + 1, len(data)):  # Only forward connections allowed
        if similarity_matrix[i, j] >= threshold:
            similarity_edges.append((data.iloc[i]["성취기준코드"], data.iloc[j]["성취기준코드"], "유사도 기반 연결"))

# Combine edges
all_edges = flow_edges + similarity_edges

# Map 성취기준코드 to 중단원명
code_to_chapter = data.set_index("성취기준코드")["f_mchapter_nm"].to_dict()

# Create the graph
graph = nx.DiGraph()

# Add nodes for 중단원
unique_mchapters = data["f_mchapter_nm"].unique()
for chapter in unique_mchapters:
    graph.add_node(
        chapter,
        label=chapter,
        color="green",
        size=30
    )

# Add edges to the graph
for source, target, edge_type in all_edges:
    source_chapter = code_to_chapter[source]
    target_chapter = code_to_chapter[target]
    if source_chapter != target_chapter:  # Avoid self-loops
        graph.add_edge(
            source_chapter, target_chapter,
            color="blue" if edge_type == "선후행 관계" else "red",
            title=edge_type
        )

# Visualize with PyVis
net = Network(height="100vh", width="100vw", directed=True, bgcolor="#ffffff", font_color="#000000")
net.from_nx(graph)

# Customize physics for better layout
net.set_options("""
var options = {
  "physics": {
    "barnesHut": {
      "gravitationalConstant": -2000,
      "centralGravity": 0.4,
      "springLength": 150,
      "springConstant": 0.05
    },
    "minVelocity": 0.5
  }
}
""")

# Save visualization
output_file = "new_knowledge_graph_v6.html"
net.write_html(output_file)
print(f"Optimized graph saved to {output_file}")


Optimized graph saved to new_knowledge_graph_v6.html


### 성취기준 코드 흐름으로만

In [32]:
import pandas as pd
import networkx as nx
from pyvis.network import Network

# Load your data
file_path = "merged_final_data_new - 복사본.csv" 
data = pd.read_csv(file_path)

# Ensure required columns are present
required_columns = ["f_mchapter_nm", "성취기준코드"]
for col in required_columns:
    if col not in data.columns:
        raise ValueError(f"Required column '{col}' is missing from the data")

# Preprocess 중단원명
data["f_mchapter_nm"] = (
    data["f_mchapter_nm"]
    .str.strip()  # 공백 제거
    .str.replace(r"[^가-힣a-zA-Z0-9\\s]", "", regex=True)  # 특수문자 제거
)

# Sort data by 성취기준코드 to establish flow
data = data.sort_values(by="성취기준코드").reset_index(drop=True)

# Map 성취기준코드 to 중단원명
code_to_chapter = data.set_index("성취기준코드")["f_mchapter_nm"].to_dict()

# Generate edges based on 성취기준코드 flow
edges = []
previous_chapter = None
for _, row in data.iterrows():
    current_chapter = row["f_mchapter_nm"]
    if previous_chapter and previous_chapter != current_chapter:
        edges.append((previous_chapter, current_chapter))
    previous_chapter = current_chapter

# Extract unique 중단원명
unique_mchapters = data["f_mchapter_nm"].drop_duplicates().tolist()

# Create the graph
graph = nx.DiGraph()

# Add nodes for 중단원명
for chapter in unique_mchapters:
    graph.add_node(
        chapter,
        label=chapter,
        color="green",
        size=30
    )

# Add edges for 중단원 선후행 관계
for edge in edges:
    graph.add_edge(
        edge[0], edge[1],
        weight=1.0,
        color="blue",
        title="성취기준코드 흐름 기반 중단원 선후행 관계"
    )

# Add edges to ensure all nodes are connected if needed
connected_components = list(nx.weakly_connected_components(graph))
if len(connected_components) > 1:
    for i in range(len(connected_components) - 1):
        source = list(connected_components[i])[0]
        target = list(connected_components[i + 1])[0]
        graph.add_edge(
            source, target,
            weight=0.1,
            color="gray",
            title="추가 연결"
        )

# Visualize with PyVis
net = Network(height="100vh", width="100vw", directed=True, bgcolor="#ffffff", font_color="#000000")
net.from_nx(graph)

# Customize physics for better layout
net.set_options("""
var options = {
  "physics": {
    "barnesHut": {
      "gravitationalConstant": -2000,
      "centralGravity": 0.4,
      "springLength": 200,
      "springConstant": 0.1
    },
    "minVelocity": 0.5
  }
}
""")

# Save visualization
output_file = "chapter_by_code_flow_graph.html"
net.write_html(output_file)
print(f"Graph with chapter connections by 성취기준코드 flow saved to {output_file}")


Graph with chapter connections by 성취기준코드 flow saved to chapter_by_code_flow_graph.html


In [33]:
import pandas as pd
import networkx as nx
from pyvis.network import Network

# Load your data
file_path = "merged_final_data_new - 복사본.csv" 
data = pd.read_csv(file_path)

# Ensure required columns are present
required_columns = ["f_mchapter_id", "f_mchapter_nm", "성취기준코드"]
for col in required_columns:
    if col not in data.columns:
        raise ValueError(f"Required column '{col}' is missing from the data")

# Preprocess 중단원명
data["f_mchapter_nm"] = (
    data["f_mchapter_nm"]
    .str.strip()  # 공백 제거
    .str.replace(r"[^가-힣a-zA-Z0-9\\s]", "", regex=True)  # 특수문자 제거
)

# Sort data by 성취기준코드 to establish flow
data = data.sort_values(by="성취기준코드").reset_index(drop=True)

# Map 성취기준코드 to 중단원 ID and Name
code_to_id = data.set_index("성취기준코드")["f_mchapter_id"].to_dict()
id_to_name = data.set_index("f_mchapter_id")["f_mchapter_nm"].to_dict()

# Generate edges based on 성취기준코드 flow
edges = []
previous_id = None
for _, row in data.iterrows():
    current_id = row["f_mchapter_id"]
    if previous_id and previous_id != current_id:
        edges.append((previous_id, current_id))
    previous_id = current_id

# Extract unique 중단원 IDs
unique_ids = data["f_mchapter_id"].drop_duplicates().tolist()

# Create the graph
graph = nx.DiGraph()

# Add nodes for 중단원 ID with labels as 중단원명
for node_id in unique_ids:
    graph.add_node(
        node_id,
        label=id_to_name[node_id],
        color="green",
        size=30
    )

# Add edges for 중단원 선후행 관계
for edge in edges:
    graph.add_edge(
        edge[0], edge[1],
        weight=1.0,
        color="blue",
        title="성취기준코드 흐름 기반 중단원 선후행 관계"
    )

# Add edges to ensure all nodes are connected if needed
connected_components = list(nx.weakly_connected_components(graph))
if len(connected_components) > 1:
    for i in range(len(connected_components) - 1):
        source = list(connected_components[i])[0]
        target = list(connected_components[i + 1])[0]
        graph.add_edge(
            source, target,
            weight=0.1,
            color="gray",
            title="추가 연결"
        )

# Visualize with PyVis
net = Network(height="100vh", width="100vw", directed=True, bgcolor="#ffffff", font_color="#000000")
net.from_nx(graph)

# Customize physics for better layout
net.set_options("""
var options = {
  "physics": {
    "barnesHut": {
      "gravitationalConstant": -2000,
      "centralGravity": 0.4,
      "springLength": 200,
      "springConstant": 0.1
    },
    "minVelocity": 0.5
  }
}
""")

# Save visualization
output_file = "chapter_by_code_flow_graph_with_id.html"
net.write_html(output_file)
print(f"Graph with chapter connections by f_mchapter_id saved to {output_file}")


Graph with chapter connections by f_mchapter_id saved to chapter_by_code_flow_graph_with_id.html


### 위의 코드로 돌린 후, 중단원명이 다 표현되는지 확인하는 코드

In [53]:
# 그래프의 노드 이름 확인
graph_nodes = [node for node in graph.nodes]
print("그래프에 포함된 중단원 개수:", len(graph_nodes))
print("그래프에 포함된 중단원:", graph_nodes)

# 데이터에서 고유한 중단원 확인
unique_mchapters = data["f_mchapter_nm"].unique()
print("데이터에서 고유한 중단원 개수:", len(unique_mchapters))
print("데이터에서 고유한 중단원:", unique_mchapters)

# 누락된 중단원 확인
missing_chapters = set(unique_mchapters) - set(graph_nodes)
if missing_chapters:
    print("누락된 중단원:", missing_chapters)
else:
    print("모든 중단원이 그래프에 포함되어 있습니다.")


그래프에 포함된 중단원 개수: 94
그래프에 포함된 중단원: ['14201779', '14201780', '14201781', '14201782', '14201783', '14201784', '14201785', '14201786', '14201787', '14201788', '14201789', '14201790', '14201791', '14201792', '14201793', '14201794', '14201795', '14201796', '14201797', '14201798', '14201799', '14201800', '14201801', '14201802', '14201803', '14201804', '14201805', '14201806', '14201807', '14201808', '14201809', '14201810', '14201811', '14201812', '14201813', '14201814', '14201815', '14201816', '14201817', '14201818', '14201819', '14201820', '14201821', '14201857', '14201858', '14201859', '14201860', '14201861', '14201862', '14201863', '14201864', '14201865', '14201866', '14201867', '14201868', '14201869', '14201870', '14201871', '14201872', '14201873', '14201874', '14201875', '14201876', '14201877', '14201878', '14201879', '14201880', '14201881', '14201882', '14201883', '14201884', '14201885', '14201886', '14201887', '14201888', '14201889', '14201890', '14201891', '14201892', '14201893', '1420

### Sentence-BERT와 TF-IDF 입력 데이터 분리 + 유사도 가장 높은 거 하나만 연결

In [47]:
import pandas as pd
import networkx as nx
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
from pyvis.network import Network

# Load your data
file_path = "merged_final_data_new - 복사본.csv" 
data = pd.read_csv(file_path)

# Ensure required columns are present
required_columns = [
    "f_mchapter_id", "f_mchapter_nm", "성취기준코드", "성취기준 내용",
    "핵심키워드_v2", "성취수준 A", "성취수준 B", "성취수준 C"
]
for col in required_columns:
    if col not in data.columns:
        raise ValueError(f"Required column '{col}' is missing from the data")

# Preprocess data
data["f_mchapter_nm"] = data["f_mchapter_nm"].str.strip().str.replace(r"[^가-힣a-zA-Z0-9\\s]", "", regex=True)
data["bert_text"] = (
    data["성취기준 내용"].fillna("") + " " +
    data["성취수준 A"].fillna("") + " " +
    data["성취수준 B"].fillna("") + " " +
    data["성취수준 C"].fillna("")
)
data["tfidf_text"] = data["핵심키워드_v2"].fillna("")

# Sentence-BERT similarity
model = SentenceTransformer('all-MiniLM-L6-v2')
bert_embeddings = model.encode(data["bert_text"].tolist(), convert_to_tensor=True)
bert_sim = cosine_similarity(bert_embeddings.cpu(), bert_embeddings.cpu())

# TF-IDF similarity
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(data["tfidf_text"])
tfidf_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

# Combine similarities
final_sim = 0.8 * bert_sim + 0.2 * tfidf_sim

# 성취기준 흐름 기반 관계 생성
flow_edges = []
previous_id = None
for _, row in data.iterrows():
    current_id = row["f_mchapter_id"]
    if previous_id and previous_id != current_id:
        flow_edges.append((previous_id, current_id))
    previous_id = current_id

# 유사도 기반 관계 생성 (가장 높은 유사도 하나만 유지)
threshold = 0.95
similarity_edges = []
node_max_connections = {}
for i in range(len(data)):
    for j in range(i + 1, len(data)):
        if final_sim[i, j] >= threshold:
            source_id = data.iloc[i]["f_mchapter_id"]
            target_id = data.iloc[j]["f_mchapter_id"]
            if source_id != target_id:
                if source_id not in node_max_connections or node_max_connections[source_id][1] < final_sim[i, j]:
                    node_max_connections[source_id] = (target_id, final_sim[i, j])

for source_id, (target_id, weight) in node_max_connections.items():
    similarity_edges.append((source_id, target_id))

# Combine edges
final_edges = list(set(flow_edges + similarity_edges))

# 그래프 생성
graph = nx.DiGraph()

# 노드 추가
for chapter_id, chapter_name in zip(data["f_mchapter_id"], data["f_mchapter_nm"]):
    graph.add_node(
        str(chapter_id),  # ID는 문자열로 변환
        label=chapter_name,
        color="green",
        size=30
    )

# 엣지 추가
for source, target in final_edges:
    graph.add_edge(
        str(source), str(target),
        color="blue",
        title="선후행 관계 및 유사도 기반 연결"
    )

# PyVis로 시각화
net = Network(height="100vh", width="100vw", directed=True, bgcolor="#ffffff", font_color="#000000")
net.from_nx(graph)

# 물리적 레이아웃 설정
net.set_options('''
{
  "physics": {
    "enabled": true, 
    "stabilization": {
      "enabled": true, 
      "iterations": 1000, 
      "fit": true
    },
    "barnesHut": {
      "gravitationalConstant": -8000, 
      "centralGravity": 0.3, 
      "springLength": 500, 
      "springConstant": 0.05
    },
    "minVelocity": 0.1
  },
  "nodes": {
    "shape": "dot",
    "size": 20
  },
  "edges": {
    "smooth": {
      "type": "continuous"
    }
  }
}
''')

# Save visualization
output_file = "final_chapter_graph.html"
net.write_html(output_file)
print(f"Graph with integrated flow and similarity saved to {output_file}")


Graph with integrated flow and similarity saved to final_chapter_graph.html


### Sentence-BERT와 TF-IDF 입력 데이터 분리 + 유사도 가장 높은 거 하나만 연결 + 학년학기별 색상 부여

In [None]:
import pandas as pd
import networkx as nx
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
from pyvis.network import Network

# Load your data
file_path = "merged_final_data_new - 복사본.csv" 
data = pd.read_csv(file_path)

# Ensure required columns are present
required_columns = [
    "f_mchapter_id", "f_mchapter_nm", "f_subject_id", "성취기준코드", "성취기준 내용",
    "핵심키워드_v2", "성취수준 A", "성취수준 B", "성취수준 C"
]
for col in required_columns:
    if col not in data.columns:
        raise ValueError(f"Required column '{col}' is missing from the data")

# Preprocess data
data["f_mchapter_nm"] = data["f_mchapter_nm"].str.strip().str.replace(r"[^가-힣a-zA-Z0-9\\s]", "", regex=True)
data["bert_text"] = (
    data["성취기준 내용"].fillna("") + " " +
    data["성취수준 A"].fillna("") + " " +
    data["성취수준 B"].fillna("") + " " +
    data["성취수준 C"].fillna("")
)
data["tfidf_text"] = data["핵심키워드_v2"].fillna("")

# Sentence-BERT similarity
model = SentenceTransformer('all-MiniLM-L6-v2')
bert_embeddings = model.encode(data["bert_text"].tolist(), convert_to_tensor=True)
bert_sim = cosine_similarity(bert_embeddings.cpu(), bert_embeddings.cpu())

# TF-IDF similarity
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(data["tfidf_text"])
tfidf_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

# Combine similarities
final_sim = 0.8 * bert_sim + 0.2 * tfidf_sim

# 성취기준 흐름 기반 관계 생성
flow_edges = []
previous_id = None
for _, row in data.iterrows():
    current_id = row["f_mchapter_id"]
    if previous_id and previous_id != current_id:
        flow_edges.append((previous_id, current_id))
    previous_id = current_id

# 유사도 기반 관계 생성 (가장 높은 유사도 하나만 유지)
threshold = 0.95
similarity_edges = []
node_max_connections = {}
for i in range(len(data)):
    for j in range(i + 1, len(data)):
        if final_sim[i, j] >= threshold:
            source_id = data.iloc[i]["f_mchapter_id"]
            target_id = data.iloc[j]["f_mchapter_id"]
            if source_id != target_id:
                if source_id not in node_max_connections or node_max_connections[source_id][1] < final_sim[i, j]:
                    node_max_connections[source_id] = (target_id, final_sim[i, j])

for source_id, (target_id, weight) in node_max_connections.items():
    similarity_edges.append((source_id, target_id))

# Combine edges
final_edges = list(set(flow_edges + similarity_edges))

# 그래프 생성
graph = nx.DiGraph()

# 노드 색상 매핑
def get_node_color(subject_id):
    color_map = {
        2212: "red",
        2213: "blue",
        2214: "green",
        2215: "yellow"
    }
    return color_map.get(subject_id, "gray")

# 노드 추가
for chapter_id, chapter_name, subject_id in zip(data["f_mchapter_id"], data["f_mchapter_nm"], data["f_subject_id"]):
    graph.add_node(
        str(chapter_id),  # ID는 문자열로 변환
        label=chapter_name,
        color=get_node_color(subject_id),
        size=30
    )

# 엣지 추가
for source, target in final_edges:
    graph.add_edge(
        str(source), str(target),
        color="blue",
        title="선후행 관계 및 유사도 기반 연결"
    )

# PyVis로 시각화
net = Network(height="100vh", width="100vw", directed=True, bgcolor="#ffffff", font_color="#000000")
net.from_nx(graph)

# 물리적 레이아웃 설정
net.set_options('''
{
  "physics": {
    "enabled": true, 
    "stabilization": {
      "enabled": true, 
      "iterations": 1000, 
      "fit": true
    },
    "barnesHut": {
      "gravitationalConstant": -8000, 
      "centralGravity": 0.3, 
      "springLength": 500, 
      "springConstant": 0.05
    },
    "minVelocity": 0.1
  },
  "nodes": {
    "shape": "dot",
    "size": 20
  },
  "edges": {
    "smooth": {
      "type": "continuous"
    }
  }
}
''')

# Save visualization
output_file = "final_chapter_graph2.html"
net.write_html(output_file)
print(f"Graph with integrated flow and similarity saved to {output_file}")


Graph with integrated flow and similarity saved to final_chapter_graph2.html


### final_chapter_graph2 + 색상별 즉 학년 학기별 군집화 = final_chapter_graph3

In [76]:
import pandas as pd
import networkx as nx
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
from pyvis.network import Network

# Load your data
file_path = "merged_final_data_new - 복사본.csv" 
data = pd.read_csv(file_path)

# Ensure required columns are present
required_columns = [
    "f_mchapter_id", "f_mchapter_nm", "f_subject_id", "성취기준코드", "성취기준 내용",
    "핵심키워드_v2", "성취수준 A", "성취수준 B", "성취수준 C"
]
for col in required_columns:
    if col not in data.columns:
        raise ValueError(f"Required column '{col}' is missing from the data")

# Preprocess data
data["f_mchapter_nm"] = data["f_mchapter_nm"].str.strip().str.replace(r"[^가-힣a-zA-Z0-9\\s]", "", regex=True)
data["bert_text"] = (
    data["성취기준 내용"].fillna("") + " " +
    data["성취수준 A"].fillna("") + " " +
    data["성취수준 B"].fillna("") + " " +
    data["성취수준 C"].fillna("")
)
data["tfidf_text"] = data["핵심키워드_v2"].fillna("")

# Sentence-BERT similarity
model = SentenceTransformer('all-MiniLM-L6-v2')
bert_embeddings = model.encode(data["bert_text"].tolist(), convert_to_tensor=True)
bert_sim = cosine_similarity(bert_embeddings.cpu(), bert_embeddings.cpu())

# TF-IDF similarity
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(data["tfidf_text"])
tfidf_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

# Combine similarities
final_sim = 0.8 * bert_sim + 0.2 * tfidf_sim

# 성취기준 흐름 기반 관계 생성
flow_edges = []
previous_id = None
for _, row in data.iterrows():
    current_id = row["f_mchapter_id"]
    if previous_id and previous_id != current_id:
        flow_edges.append((previous_id, current_id))
    previous_id = current_id

# 유사도 기반 관계 생성
threshold = 0.95
similarity_edges = []
node_max_connections = {}
for i in range(len(data)):
    for j in range(i + 1, len(data)):
        if final_sim[i, j] >= threshold:
            source_id = data.iloc[i]["f_mchapter_id"]
            target_id = data.iloc[j]["f_mchapter_id"]
            if source_id != target_id:
                if source_id not in node_max_connections or node_max_connections[source_id][1] < final_sim[i, j]:
                    node_max_connections[source_id] = (target_id, final_sim[i, j])

for source_id, (target_id, weight) in node_max_connections.items():
    similarity_edges.append((source_id, target_id))

# Combine edges
final_edges = list(set(flow_edges + similarity_edges))

# 그래프 생성
graph = nx.DiGraph()

# 색상 및 그룹 설정
def get_node_color(subject_id):
    color_map = {
        2212: "#FF4444",  # 더 선명한 빨강
        2213: "#4444FF",  # 더 선명한 파랑
        2214: "#44FF44",  # 더 선명한 초록
        2215: "#FFFF44"   # 더 선명한 노랑
    }
    return color_map.get(subject_id, "#GRAY")

# 노드를 subject_id별로 그룹화
grouped_data = data.groupby("f_subject_id")
y_positions = {sid: idx * 100 for idx, sid in enumerate(data["f_subject_id"].unique())}

# 노드 추가 (위치 정보 포함)
for chapter_id, chapter_name, subject_id in zip(data["f_mchapter_id"], data["f_mchapter_nm"], data["f_subject_id"]):
    graph.add_node(
        str(chapter_id),
        label=chapter_name,
        color=get_node_color(subject_id),
        size=30,
        group=str(subject_id)  # 그룹 설정
    )

# 엣지 추가
for source, target in final_edges:
    graph.add_edge(
        str(source), str(target),
        color="#aaaaff",  # 연한 파란색으로 변경
        width=1,
        title="선후행 관계 및 유사도 기반 연결"
    )

# PyVis로 시각화
net = Network(height="100vh", width="100vw", directed=True, bgcolor="#ffffff", font_color="#000000")
net.from_nx(graph)

# 물리적 레이아웃 설정
net.set_options('''
{
  "physics": {
    "enabled": true,
    "forceAtlas2Based": {
      "gravitationalConstant": -100,
      "centralGravity": 0.01,
      "springLength": 200,
      "springConstant": 0.08,
      "damping": 0.4,
      "avoidOverlap": 1
    },
    "solver": "forceAtlas2Based",
    "stabilization": {
      "enabled": true,
      "iterations": 2000,
      "updateInterval": 25
    }
  },
  "nodes": {
    "shape": "dot",
    "size": 25,
    "font": {
      "size": 14
    }
  },
  "edges": {
    "smooth": {
      "type": "continuous",
      "forceDirection": "none"
    },
    "arrows": {
      "to": {
        "enabled": true,
        "scaleFactor": 0.5
      }
    }
  },
  "groups": {
    "2212": {"color": "#FF4444"},
    "2213": {"color": "#4444FF"},
    "2214": {"color": "#44FF44"},
    "2215": {"color": "#FFFF44"}
  }
}
''')

# Save visualization
output_file = "final_chapter_graph3.html"
net.write_html(output_file)
print(f"Graph with integrated flow and similarity saved to {output_file}")

Graph with integrated flow and similarity saved to final_chapter_graph3.html


### bert embedding으로 군집화

In [86]:
import pandas as pd
import networkx as nx
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
from pyvis.network import Network
from sklearn.cluster import KMeans
import numpy as np
from sklearn.manifold import TSNE

# Load your data
file_path = "merged_final_data_new - 복사본.csv" 
data = pd.read_csv(file_path)

# Preprocess data
data["f_mchapter_nm"] = data["f_mchapter_nm"].str.strip().str.replace(r"[^가-힣a-zA-Z0-9\\s]", "", regex=True)
data["bert_text"] = (
    data["성취기준 내용"].fillna("") + " " +
    data["성취수준 A"].fillna("") + " " +
    data["성취수준 B"].fillna("") + " " +
    data["성취수준 C"].fillna("")
)

# BERT 임베딩 생성
model = SentenceTransformer('all-MiniLM-L6-v2')
bert_embeddings = model.encode(data["bert_text"].tolist(), convert_to_tensor=True)
embeddings_np = bert_embeddings.cpu().numpy()

# K-means 군집화
n_clusters = 4  # subject_id 개수와 동일하게 설정
kmeans = KMeans(n_clusters=n_clusters, random_state=42)
cluster_labels = kmeans.fit_predict(embeddings_np)

# t-SNE로 차원 축소 (전체 데이터에 대해 한 번만 실행)
tsne = TSNE(
    n_components=2,
    perplexity=min(30, len(embeddings_np) - 1),  # perplexity 값 조정
    random_state=42,
    n_iter=1000
)
embeddings_2d = tsne.fit_transform(embeddings_np)

# 성취기준 흐름 기반 관계 생성
flow_edges = []
previous_id = None
for _, row in data.iterrows():
    current_id = row["f_mchapter_id"]
    if previous_id and previous_id != current_id:
        flow_edges.append((previous_id, current_id))
    previous_id = current_id

# 유사도 기반 관계 생성
cosine_sim = cosine_similarity(embeddings_np)
threshold = 0.95
similarity_edges = []
for i in range(len(data)):
    for j in range(i + 1, len(data)):
        if cosine_sim[i, j] >= threshold:
            source_id = data.iloc[i]["f_mchapter_id"]
            target_id = data.iloc[j]["f_mchapter_id"]
            if source_id != target_id:
                similarity_edges.append((source_id, target_id))

# Combine edges
final_edges = list(set(flow_edges + similarity_edges))

# 그래프 생성
graph = nx.DiGraph()

# 군집별 색상 매핑
cluster_colors = ['#FF4444', '#4444FF', '#44FF44', '#FFFF44']

# 노드 추가 (임베딩 기반 위치 정보 포함)
for idx, (chapter_id, chapter_name) in enumerate(zip(data["f_mchapter_id"], data["f_mchapter_nm"])):
    # t-SNE 좌표를 기반으로 위치 설정
    x, y = embeddings_2d[idx]
    
    # 스케일 조정 (더 넓은 공간에 분포하도록)
    x = x * 10
    y = y * 10
    
    graph.add_node(
        str(chapter_id),
        label=chapter_name,
        color=cluster_colors[cluster_labels[idx]],
        size=30,
        x=float(x),
        y=float(y),
        group=str(cluster_labels[idx])
    )

# 엣지 추가
for source, target in final_edges:
    graph.add_edge(
        str(source), str(target),
        color="#aaaaff",
        width=1,
        title="선후행 관계 및 유사도 기반 연결"
    )

# PyVis로 시각화
net = Network(height="100vh", width="100vw", directed=True, bgcolor="#ffffff", font_color="#000000")
net.from_nx(graph)

# 물리적 레이아웃 설정
net.set_options('''
{
  "physics": {
    "enabled": true,
    "forceAtlas2Based": {
      "gravitationalConstant": -50,
      "centralGravity": 0.005,
      "springLength": 100,
      "springConstant": 0.04,
      "damping": 0.4,
      "avoidOverlap": 1
    },
    "solver": "forceAtlas2Based",
    "stabilization": {
      "enabled": true,
      "iterations": 2000,
      "updateInterval": 25
    }
  },
  "nodes": {
    "shape": "dot",
    "size": 25,
    "font": {
      "size": 14
    }
  },
  "edges": {
    "smooth": {
      "type": "continuous",
      "forceDirection": "none"
    },
    "arrows": {
      "to": {
        "enabled": true,
        "scaleFactor": 0.5
      }
    }
  },
  "groups": {
    "0": {"color": "#FF4444"},
    "1": {"color": "#4444FF"},
    "2": {"color": "#44FF44"},
    "3": {"color": "#FFFF44"}
  }
}
''')

# 군집화 결과 데이터프레임 생성
cluster_info = pd.DataFrame({
    'chapter_id': data['f_mchapter_id'],
    'chapter_name': data['f_mchapter_nm'],
    'subject_id': data['f_subject_id'],
    'cluster': cluster_labels
})

# 중복 제거 (chapter_name 기준으로 고유값만 유지)
cluster_info_unique = cluster_info.drop_duplicates(subset=['chapter_name'])

# 군집별 데이터 요약 출력
print("\n군집화 결과 분석:")
for cluster in range(n_clusters):
    cluster_data = cluster_info_unique[cluster_info_unique['cluster'] == cluster]
    print(f"\n클러스터 {cluster} ({len(cluster_data)} 개 노드):")
    print(cluster_data[['chapter_name', 'subject_id']].to_string(index=False))

# 저장할 경우 (옵션)
output_file = "clustered_results.csv"
cluster_info_unique.to_csv(output_file, index=False)
print(f"\n정리된 군집화 결과를 {output_file}에 저장했습니다.")


# Save visualization
output_file_html = "final_chapter_graph_clustered.html"
net.write_html(output_file_html)
print(f"\nGraph with embedding-based clustering saved to {output_file_html}")





군집화 결과 분석:

클러스터 0 (11 개 노드):
chapter_name  subject_id
     두수의크기비교        2212
       덧셈과뺄셈        2212
    덧셈과뺄셈의관계        2214
       의값구하기        2214
       세수의계산        2214
   받아올림이있는덧셈        2213
   받아내림이있는뺄셈        2213
       덧셈하기2        2213
       덧셈하기3        2213
       뺄셈하기2        2213
       뺄셈하기3        2213

클러스터 1 (31 개 노드):
       chapter_name  subject_id
          12345알아보기        2212
           6789알아보기        2212
1만큼더큰수와1만큼더작은수0알아보기        2212
        덧셈과뺄셈이야기만들기        2212
                 덧셈        2212
                 뺄셈        2212
             10알아보기        2212
             십몇알아보기        2212
               묶어세기        2214
           곱셈식을활용하기        2214
             몇십알아보기        2213
         99까지의수알아보기        2213
           한자리세수의계산        2213
             두수를더하기        2213
           10이되는더하기        2213
             10에서빼기        2213
          10을만들어더하기        2213
          몇시30분알아보기        2213
         몇시몇시30분의응용        2213
          

### 원래 유사도 검사 + k-means 고정 & n_init 수 증가

In [91]:
import pandas as pd
import networkx as nx
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
from pyvis.network import Network
from sklearn.cluster import KMeans
import numpy as np
from sklearn.manifold import TSNE

# Load your data
file_path = "merged_final_data_new - 복사본.csv" 
data = pd.read_csv(file_path)

# Ensure required columns are present
required_columns = [
    "f_mchapter_id", "f_mchapter_nm", "f_subject_id", "성취기준코드", "성취기준 내용",
    "핵심키워드_v2", "성취수준 A", "성취수준 B", "성취수준 C"
]
for col in required_columns:
    if col not in data.columns:
        raise ValueError(f"Required column '{col}' is missing from the data")

# Preprocess data
data["f_mchapter_nm"] = data["f_mchapter_nm"].str.strip().str.replace(r"[^가-힣a-zA-Z0-9\\s]", "", regex=True)
data["bert_text"] = (
    data["성취기준 내용"].fillna("") + " " +
    data["성취수준 A"].fillna("") + " " +
    data["성취수준 B"].fillna("") + " " +
    data["성취수준 C"].fillna("")
)
data["tfidf_text"] = data["핵심키워드_v2"].fillna("")

# Sentence-BERT similarity
model = SentenceTransformer('all-MiniLM-L6-v2')
bert_embeddings = model.encode(data["bert_text"].tolist(), convert_to_tensor=True)
bert_sim = cosine_similarity(bert_embeddings.cpu(), bert_embeddings.cpu())

# TF-IDF similarity
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(data["tfidf_text"])
tfidf_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

# Combine similarities
final_sim = 0.8 * bert_sim + 0.2 * tfidf_sim

# 성취기준 흐름 기반 관계 생성
flow_edges = []
previous_id = None
for _, row in data.iterrows():
    current_id = row["f_mchapter_id"]
    if previous_id and previous_id != current_id:
        flow_edges.append((previous_id, current_id))
    previous_id = current_id

# 유사도 기반 관계 생성 (가장 높은 유사도 하나만 유지)
threshold = 0.95
similarity_edges = []
node_max_connections = {}
for i in range(len(data)):
    for j in range(i + 1, len(data)):
        if final_sim[i, j] >= threshold:
            source_id = data.iloc[i]["f_mchapter_id"]
            target_id = data.iloc[j]["f_mchapter_id"]
            if source_id != target_id:
                if source_id not in node_max_connections or node_max_connections[source_id][1] < final_sim[i, j]:
                    node_max_connections[source_id] = (target_id, final_sim[i, j])

for source_id, (target_id, weight) in node_max_connections.items():
    similarity_edges.append((source_id, target_id))

# Combine edges
final_edges = list(set(flow_edges + similarity_edges))

# 그래프 생성
graph = nx.DiGraph()

# 군집별 색상 매핑
cluster_colors = ['#FF4444', '#4444FF', '#44FF44', '#FFFF44']

# K-means 군집화
n_clusters = 4  # subject_id 개수와 동일하게 설정
kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=20)
cluster_labels = kmeans.fit_predict(bert_embeddings.cpu().numpy())

# t-SNE로 차원 축소 (전체 데이터에 대해 한 번만 실행)
tsne = TSNE(
    n_components=2,
    perplexity=min(30, len(bert_embeddings) - 1),  # perplexity 값 조정
    random_state=42,
    n_iter=1000
)
embeddings_2d = tsne.fit_transform(bert_embeddings.cpu().numpy())

# 노드 추가 (임베딩 기반 위치 정보 포함)
for idx, (chapter_id, chapter_name) in enumerate(zip(data["f_mchapter_id"], data["f_mchapter_nm"])):
    # t-SNE 좌표를 기반으로 위치 설정
    x, y = embeddings_2d[idx]
    
    # 스케일 조정 (더 넓은 공간에 분포하도록)
    x = x * 10
    y = y * 10
    
    graph.add_node(
        str(chapter_id),
        label=chapter_name,
        color=cluster_colors[cluster_labels[idx]],
        size=30,
        x=float(x),
        y=float(y),
        group=str(cluster_labels[idx])
    )

# 엣지 추가
for source, target in final_edges:
    graph.add_edge(
        str(source), str(target),
        color="#aaaaff",
        width=1,
        title="선후행 관계 및 유사도 기반 연결"
    )

# PyVis로 시각화
net = Network(height="100vh", width="100vw", directed=True, bgcolor="#ffffff", font_color="#000000")
net.from_nx(graph)

# 물리적 레이아웃 설정
net.set_options('''
{
  "physics": {
    "enabled": true,
    "forceAtlas2Based": {
      "gravitationalConstant": -50,
      "centralGravity": 0.005,
      "springLength": 100,
      "springConstant": 0.04,
      "damping": 0.4,
      "avoidOverlap": 1
    },
    "solver": "forceAtlas2Based",
    "stabilization": {
      "enabled": true,
      "iterations": 2000,
      "updateInterval": 25
    }
  },
  "nodes": {
    "shape": "dot",
    "size": 25,
    "font": {
      "size": 14
    }
  },
  "edges": {
    "smooth": {
      "type": "continuous",
      "forceDirection": "none"
    },
    "arrows": {
      "to": {
        "enabled": true,
        "scaleFactor": 0.5
      }
    }
  },
  "groups": {
    "0": {"color": "#FF4444"},
    "1": {"color": "#4444FF"},
    "2": {"color": "#44FF44"},
    "3": {"color": "#FFFF44"}
  }
}
''')

# 군집화 결과 데이터프레임 생성
cluster_info = pd.DataFrame({
    'chapter_id': data['f_mchapter_id'],
    'chapter_name': data['f_mchapter_nm'],
    'subject_id': data['f_subject_id'],
    'cluster': cluster_labels
})

# 중복 제거 (chapter_name 기준으로 고유값만 유지)
cluster_info_unique = cluster_info.drop_duplicates(subset=['chapter_name'])

# 군집별 데이터 요약 출력
print("\n군집화 결과 분석:")
for cluster in range(n_clusters):
    cluster_data = cluster_info_unique[cluster_info_unique['cluster'] == cluster]
    print(f"\n클러스터 {cluster} ({len(cluster_data)} 개 노드):")
    print(cluster_data[['chapter_name', 'subject_id']].to_string(index=False))

# 저장할 경우 (옵션)
output_file = "clustered_results.csv"
cluster_info_unique.to_csv(output_file, index=False)
print(f"\n정리된 군집화 결과를 {output_file}에 저장했습니다.")

# Save visualization
output_file_html = "final_chapter_graph_clustered2.html"
net.write_html(output_file_html)
print(f"\nGraph with embedding-based clustering saved to {output_file_html}")





군집화 결과 분석:

클러스터 0 (61 개 노드):
       chapter_name  subject_id
          12345알아보기        2212
           6789알아보기        2212
            9까지수의순서        2212
1만큼더큰수와1만큼더작은수0알아보기        2212
         여러가지모양찾아보기        2212
         여러가지모양알아보기        2212
        덧셈과뺄셈이야기만들기        2212
                 덧셈        2212
                 뺄셈        2212
             길이비교하기        2212
             무게비교하기        2212
             넓이비교하기        2212
         담을수있는양비교하기        2212
             높이비교하기        2212
              키비교하기        2212
             10알아보기        2212
             십몇알아보기        2212
          50까지의수의순서        2212
               뛰어세기        2214
                 도형        2214
               쌓기나무        2214
        여러가지단위길이로재기        2214
             자로길이재기        2214
       길이를어림해보고재어보기        2214
               분류하기        2214
               묶어세기        2214
            곱셈식알아보기        2214
           곱셈식을활용하기        2214
             몇십알아보기        2213
         

### 화살표 방향 : 시간의 흐름, 즉 학년 학기 순을 거스르지 않아야 함!

In [95]:
import pandas as pd
import networkx as nx
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
from pyvis.network import Network
from sklearn.cluster import KMeans
import numpy as np
from sklearn.manifold import TSNE

# Load your data
file_path = "merged_final_data_new - 복사본.csv" 
data = pd.read_csv(file_path)

# Ensure required columns are present
required_columns = [
    "f_mchapter_id", "f_mchapter_nm", "f_subject_id", "성취기준코드", "성취기준 내용",
    "핵심키워드_v2", "성취수준 A", "성취수준 B", "성취수준 C"
]
for col in required_columns:
    if col not in data.columns:
        raise ValueError(f"Required column '{col}' is missing from the data")

# Preprocess data
data["f_mchapter_nm"] = data["f_mchapter_nm"].str.strip().str.replace(r"[^가-힣a-zA-Z0-9\\s]", "", regex=True)
data["bert_text"] = (
    data["성취기준 내용"].fillna("") + " " +
    data["성취수준 A"].fillna("") + " " +
    data["성취수준 B"].fillna("") + " " +
    data["성취수준 C"].fillna("")
)
data["tfidf_text"] = data["핵심키워드_v2"].fillna("")

# Sentence-BERT similarity
model = SentenceTransformer('all-MiniLM-L6-v2')
bert_embeddings = model.encode(data["bert_text"].tolist(), convert_to_tensor=True)
bert_sim = cosine_similarity(bert_embeddings.cpu(), bert_embeddings.cpu())

# TF-IDF similarity
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(data["tfidf_text"])
tfidf_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

# Combine similarities
final_sim = 0.8 * bert_sim + 0.2 * tfidf_sim

# 성취기준 흐름 기반 관계 생성 (f_subject_id를 고려한 방향 설정)
flow_edges = []
for i in range(len(data) - 1):
    current_row = data.iloc[i]
    next_row = data.iloc[i + 1]
    
    # f_subject_id 값 기준으로 방향 설정
    if current_row["f_subject_id"] < next_row["f_subject_id"]:
        flow_edges.append((current_row["f_mchapter_id"], next_row["f_mchapter_id"]))

# 유사도 기반 관계 생성 (가장 높은 유사도 하나만 유지)
threshold = 0.95
similarity_edges = []
node_max_connections = {}
for i in range(len(data)):
    for j in range(i + 1, len(data)):
        if final_sim[i, j] >= threshold:
            source_id = data.iloc[i]["f_mchapter_id"]
            target_id = data.iloc[j]["f_mchapter_id"]
            if source_id != target_id:
                if source_id not in node_max_connections or node_max_connections[source_id][1] < final_sim[i, j]:
                    node_max_connections[source_id] = (target_id, final_sim[i, j])

for source_id, (target_id, weight) in node_max_connections.items():
    similarity_edges.append((source_id, target_id))

# Combine edges
final_edges = list(set(flow_edges + similarity_edges))

# 그래프 생성
graph = nx.DiGraph()

# 군집별 색상 매핑
cluster_colors = ['#FF4444', '#4444FF', '#44FF44', '#FFFF44']

# K-means 군집화
n_clusters = 4  # subject_id 개수와 동일하게 설정
kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=20)
cluster_labels = kmeans.fit_predict(bert_embeddings.cpu().numpy())

# t-SNE로 차원 축소 (전체 데이터에 대해 한 번만 실행)
tsne = TSNE(
    n_components=2,
    perplexity=min(30, len(bert_embeddings) - 1),  # perplexity 값 조정
    random_state=42,
    n_iter=1000
)
embeddings_2d = tsne.fit_transform(bert_embeddings.cpu().numpy())

# 노드 추가 (임베딩 기반 위치 정보 포함)
for idx, (chapter_id, chapter_name) in enumerate(zip(data["f_mchapter_id"], data["f_mchapter_nm"])):
    # t-SNE 좌표를 기반으로 위치 설정
    x, y = embeddings_2d[idx]
    
    # 스케일 조정 (더 넓은 공간에 분포하도록)
    x = x * 10
    y = y * 10
    
    graph.add_node(
        str(chapter_id),
        label=chapter_name,
        color=cluster_colors[cluster_labels[idx]],
        size=30,
        x=float(x),
        y=float(y),
        group=str(cluster_labels[idx])
    )

# 엣지 추가
for source, target in final_edges:
    graph.add_edge(
        str(source), str(target),
        color="#aaaaff",
        width=1,
        title="선후행 관계 및 유사도 기반 연결"
    )

# PyVis로 시각화
net = Network(height="100vh", width="100vw", directed=True, bgcolor="#ffffff", font_color="#000000")
net.from_nx(graph)

# 물리적 레이아웃 설정
net.set_options('''
{
  "physics": {
    "enabled": true,
    "forceAtlas2Based": {
      "gravitationalConstant": -50,
      "centralGravity": 0.005,
      "springLength": 100,
      "springConstant": 0.04,
      "damping": 0.4,
      "avoidOverlap": 1
    },
    "solver": "forceAtlas2Based",
    "stabilization": {
      "enabled": true,
      "iterations": 2000,
      "updateInterval": 25
    }
  },
  "nodes": {
    "shape": "dot",
    "size": 25,
    "font": {
      "size": 14
    }
  },
  "edges": {
    "smooth": {
      "type": "continuous",
      "forceDirection": "none"
    },
    "arrows": {
      "to": {
        "enabled": true,
        "scaleFactor": 0.5
      }
    }
  },
  "groups": {
    "0": {"color": "#FF4444"},
    "1": {"color": "#4444FF"},
    "2": {"color": "#44FF44"},
    "3": {"color": "#FFFF44"}
  }
}
''')

# 군집화 결과 데이터프레임 생성
cluster_info = pd.DataFrame({
    'chapter_id': data['f_mchapter_id'],
    'chapter_name': data['f_mchapter_nm'],
    'subject_id': data['f_subject_id'],
    'cluster': cluster_labels
})

# 중복 제거 (chapter_name 기준으로 고유값만 유지)
cluster_info_unique = cluster_info.drop_duplicates(subset=['chapter_name'])

# 군집별 데이터 요약 출력
print("\n군집화 결과 분석:")
for cluster in range(n_clusters):
    cluster_data = cluster_info_unique[cluster_info_unique['cluster'] == cluster]
    print(f"\n클러스터 {cluster} ({len(cluster_data)} 개 노드):")
    print(cluster_data[['chapter_name', 'subject_id']].to_string(index=False))

# 저장할 경우 (옵션)
output_file = "clustered_results3.csv"
cluster_info_unique.to_csv(output_file, index=False)
print(f"\n정리된 군집화 결과를 {output_file}에 저장했습니다.")

# Save visualization
output_file_html = "final_chapter_graph_clustered3.html"
net.write_html(output_file_html)
print(f"\nGraph with embedding-based clustering saved to {output_file_html}")





군집화 결과 분석:

클러스터 0 (61 개 노드):
       chapter_name  subject_id
          12345알아보기        2212
           6789알아보기        2212
            9까지수의순서        2212
1만큼더큰수와1만큼더작은수0알아보기        2212
         여러가지모양찾아보기        2212
         여러가지모양알아보기        2212
        덧셈과뺄셈이야기만들기        2212
                 덧셈        2212
                 뺄셈        2212
             길이비교하기        2212
             무게비교하기        2212
             넓이비교하기        2212
         담을수있는양비교하기        2212
             높이비교하기        2212
              키비교하기        2212
             10알아보기        2212
             십몇알아보기        2212
          50까지의수의순서        2212
               뛰어세기        2214
                 도형        2214
               쌓기나무        2214
        여러가지단위길이로재기        2214
             자로길이재기        2214
       길이를어림해보고재어보기        2214
               분류하기        2214
               묶어세기        2214
            곱셈식알아보기        2214
           곱셈식을활용하기        2214
             몇십알아보기        2213
         

### 화살표 방향(역순흐름X) & 2022년 개정 수학 교육부 성취기준 4개 영역(수와연산,변화와관계,도형과측정,자료와가능성)

In [62]:
import pandas as pd
import networkx as nx
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
from pyvis.network import Network
import json

# Load your data
file_path = "merged_final_data_new - 복사본.csv" 
data = pd.read_csv(file_path)

# Ensure required columns are present
required_columns = [
    "f_mchapter_id", "f_mchapter_nm", "f_subject_id", "성취기준코드", "성취기준 내용",
    "핵심키워드_v2", "성취수준 A", "성취수준 B", "성취수준 C"
]
for col in required_columns:
    if col not in data.columns:
        raise ValueError(f"Required column '{col}' is missing from the data")

# Preprocess data
data["f_mchapter_nm"] = data["f_mchapter_nm"].str.strip().str.replace(r"[^가-힣a-zA-Z0-9\\s]", "", regex=True)
data["bert_text"] = (
    data["성취기준 내용"].fillna("") + " " +
    data["성취수준 A"].fillna("") + " " +
    data["성취수준 B"].fillna("") + " " +
    data["성취수준 C"].fillna("")
)
data["tfidf_text"] = data["핵심키워드_v2"].fillna("")

# 군집화 및 학년 학기 색상 매핑
area_colors = {
    "수와 연산": "#FFCCCC",
    "변화와 관계": "#CCCCFF",
    "도형과 측정": "#CCFFCC",
    "자료와 가능성": "#FFFFCC",
    "기타": "#CCCCCC"
}
subject_colors = {
    2212: "#FF0000",  # 1학년 1학기
    2213: "#0000FF",  # 1학년 2학기
    2214: "#00FF00",  # 2학년 1학기
    2215: "#FFFF00"   # 2학년 2학기
}

def get_area_by_code(code):
    if code.startswith("2수01"):
        return "수와 연산"
    elif code.startswith("2수02"):
        return "변화와 관계"
    elif code.startswith("2수03"):
        return "도형과 측정"
    elif code.startswith("2수04"):
        return "자료와 가능성"
    else:
        return "기타"

data["area"] = data["성취기준코드"].apply(get_area_by_code)
data["color"] = data["area"].map(area_colors)
data["border_color"] = data["f_subject_id"].map(subject_colors)

# Sentence-BERT similarity
model = SentenceTransformer('all-MiniLM-L6-v2')
bert_embeddings = model.encode(data["bert_text"].tolist(), convert_to_tensor=True)
bert_sim = cosine_similarity(bert_embeddings.cpu(), bert_embeddings.cpu())

# TF-IDF similarity
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(data["tfidf_text"])
tfidf_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

# Combine similarities
final_sim = 0.8 * bert_sim + 0.2 * tfidf_sim

# 흐름 기반 엣지 생성
flow_edges = []
for i in range(len(data) - 1):
    current_row = data.iloc[i]
    next_row = data.iloc[i + 1]
    if (current_row["f_subject_id"] < next_row["f_subject_id"] or
        (current_row["f_subject_id"] == next_row["f_subject_id"] and
         current_row["성취기준코드"] < next_row["성취기준코드"])):
        flow_edges.append((current_row["f_mchapter_id"], next_row["f_mchapter_id"]))

# 유사도 기반 관계 생성
threshold = 0.85
similarity_edges = []
node_max_connections = {}
for i in range(len(data)):
    for j in range(i + 1, len(data)):
        if final_sim[i, j] >= threshold:
            source_id = data.iloc[i]["f_mchapter_id"]
            target_id = data.iloc[j]["f_mchapter_id"]
            if source_id != target_id:
                if source_id not in node_max_connections or node_max_connections[source_id][1] < final_sim[i, j]:
                    node_max_connections[source_id] = (target_id, final_sim[i, j])

for source_id, (target_id, weight) in node_max_connections.items():
    similarity_edges.append((source_id, target_id))

# Combine edges
final_edges = list(set(flow_edges + similarity_edges))

# 그래프 생성
graph = nx.DiGraph()

# 노드 추가
nodes_json = []
for idx, (chapter_id, chapter_name, color, border_color) in enumerate(zip(
    data["f_mchapter_id"], data["f_mchapter_nm"], data["color"], data["border_color"]
)):
    graph.add_node(
        str(chapter_id),
        label=chapter_name,
        color={
            "background": color,
            "border": border_color
        },
        size=30,
        borderWidth=2
    )
    nodes_json.append({
        "id": chapter_id,
        "name": chapter_name,
        "area": data.loc[idx, "area"],
        "subject_id": data.loc[idx, "f_subject_id"],
        "color": color,
        "border_color": border_color
    })

# 엣지 추가
edges_json = []
for source, target in final_edges:
    if str(source) in graph.nodes and str(target) in graph.nodes and source != target:
        graph.add_edge(
            str(source), str(target),
            color="#aaaaff",
            width=1,
            title="선후행 관계 및 유사도 기반 연결"
        )
        edges_json.append({
            "source": source,
            "target": target
        })

# PyVis 시각화
net = Network(height="100vh", width="100vw", directed=True, bgcolor="#ffffff", font_color="#000000")
net.from_nx(graph)

# 물리적 레이아웃 설정
net.set_options('''
{
  "physics": {
    "enabled": true,
    "forceAtlas2Based": {
      "gravitationalConstant": -50,
      "centralGravity": 0.005,
      "springLength": 100,
      "springConstant": 0.04,
      "damping": 0.4,
      "avoidOverlap": 1
    },
    "solver": "forceAtlas2Based",
    "stabilization": {
      "enabled": true,
      "iterations": 2000,
      "updateInterval": 25
    }
  },
  "nodes": {
    "shape": "dot",
    "size": 25,
    "font": {
      "size": 14
    }
  },
  "edges": {
    "smooth": {
      "type": "continuous",
      "forceDirection": "none"
    },
    "arrows": {
      "to": {
        "enabled": true,
        "scaleFactor": 0.5
      }
    }
  }
}
''')

# 그래프 저장
output_file_html = "final_chapter_graph_clustered4.html"
net.write_html(output_file_html)
print(f"\nGraph with embedding-based clustering saved to {output_file_html}")




Graph with embedding-based clustering saved to final_chapter_graph_clustered4.html


In [1]:
import pandas as pd
import networkx as nx
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
from pyvis.network import Network
import json

# Load your data
file_path = "merged_final_data_new.csv"
data = pd.read_csv(file_path)

# Ensure required columns are present
required_columns = [
    "f_mchapter_id", "f_mchapter_nm", "f_subject_id", "성취기준코드", "성취기준 내용",
    "핵심키워드_v2", "성취수준 A", "성취수준 B", "성취수준 C"
]
for col in required_columns:
    if col not in data.columns:
        raise ValueError(f"Required column '{col}' is missing from the data")

# Preprocess data
data["f_mchapter_nm"] = data["f_mchapter_nm"].str.strip().str.replace(r"[^가-힣a-zA-Z0-9\\s]", "", regex=True)
data["bert_text"] = (
    data["성취기준 내용"].fillna("") + " " +
    data["성취수준 A"].fillna("") + " " +
    data["성취수준 B"].fillna("") + " " +
    data["성취수준 C"].fillna("")
)
data["tfidf_text"] = data["핵심키워드_v2"].fillna("")

# 군집화 및 학년 학기 색상 매핑
area_colors = {
    "수와 연산": "#FFCCCC",
    "변화와 관계": "#CCCCFF",
    "도형과 측정": "#CCFFCC",
    "자료와 가능성": "#FFFFCC",
    "기타": "#CCCCCC"
}
subject_colors = {
    2212: "#FF0000",  # 1학년 1학기
    2213: "#0000FF",  # 1학년 2학기
    2214: "#00FF00",  # 2학년 1학기
    2215: "#FFFF00"   # 2학년 2학기
}

def get_area_by_code(code):
    if code.startswith("2수01"):
        return "수와 연산"
    elif code.startswith("2수02"):
        return "변화와 관계"
    elif code.startswith("2수03"):
        return "도형과 측정"
    elif code.startswith("2수04"):
        return "자료와 가능성"
    else:
        return "기타"

data["area"] = data["성취기준코드"].apply(get_area_by_code)
data["color"] = data["area"].map(area_colors)
data["border_color"] = data["f_subject_id"].map(subject_colors)

# Sentence-BERT similarity
model = SentenceTransformer('all-MiniLM-L6-v2')
bert_embeddings = model.encode(data["bert_text"].tolist(), convert_to_tensor=True)
bert_sim = cosine_similarity(bert_embeddings.cpu(), bert_embeddings.cpu())

# TF-IDF similarity
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(data["tfidf_text"])
tfidf_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

# Combine similarities
final_sim = 0.8 * bert_sim + 0.2 * tfidf_sim

# 흐름 기반 엣지 생성
flow_edges = []
for i in range(len(data) - 1):
    current_row = data.iloc[i]
    next_row = data.iloc[i + 1]
    if (current_row["f_subject_id"] < next_row["f_subject_id"] or
        (current_row["f_subject_id"] == next_row["f_subject_id"] and
         current_row["성취기준코드"] < next_row["성취기준코드"])): 
        flow_edges.append((current_row["f_mchapter_id"], next_row["f_mchapter_id"]))

# 유사도 기반 관계 생성
threshold = 0.85
similarity_edges = []
node_max_connections = {}
for i in range(len(data)):
    for j in range(i + 1, len(data)):
        if final_sim[i, j] >= threshold:
            source_id = data.iloc[i]["f_mchapter_id"]
            target_id = data.iloc[j]["f_mchapter_id"]
            if source_id != target_id:
                if source_id not in node_max_connections or node_max_connections[source_id][1] < final_sim[i, j]:
                    node_max_connections[source_id] = (target_id, final_sim[i, j])

for source_id, (target_id, weight) in node_max_connections.items():
    similarity_edges.append((source_id, target_id))

# Combine edges
final_edges = list(set(flow_edges + similarity_edges))

# 그래프 생성
graph = nx.DiGraph()

# 노드 추가
nodes_json = []
for idx, (chapter_id, chapter_name, color, border_color) in enumerate(zip(
    data["f_mchapter_id"], data["f_mchapter_nm"], data["color"], data["border_color"]
)):
    graph.add_node(
        str(chapter_id),
        label=chapter_name,
        color={
            "background": color,
            "border": border_color
        },
        size=30,
        borderWidth=2
    )
    nodes_json.append({
        "id": chapter_id,
        "name": chapter_name,
        "area": data.loc[idx, "area"],
        "subject_id": data.loc[idx, "f_subject_id"],
        "color": color,
        "border_color": border_color
    })

# 엣지 추가
edges_json = []
for source, target in flow_edges:
    graph.add_edge(
        str(source), str(target),
        color="#aaaaaa",  # 흐름 기반 엣지 색상
        width=1,
        title="선후행 관계"
    )
    edges_json.append({
        "source": source,
        "target": target
    })

for source, target in similarity_edges:
    graph.add_edge(
        str(source), str(target),
        color="#ffa500",  # 유사도 기반 엣지 색상
        width=2,
        title="유사도 기반 연결"
    )
    edges_json.append({
        "source": source,
        "target": target
    })

# PyVis 시각화
net = Network(height="100vh", width="100vw", directed=True, bgcolor="#ffffff", font_color="#000000")
net.from_nx(graph)

# 범례 추가
legend_html = '''
<div style="position:absolute;top:10px;left:10px;background-color:white;padding:10px;border-radius:5px;box-shadow:0 0 10px rgba(0,0,0,0.5);z-index:1000;font-size:14px;">
  <strong>범례</strong>
  <ul>
    <li style="color:#FFCCCC;">수와 연산</li>
    <li style="color:#CCCCFF;">변화와 관계</li>
    <li style="color:#CCFFCC;">도형과 측정</li>
    <li style="color:#FFFFCC;">자료와 가능성</li>
  </ul>
  <ul>
    <li style="color:#FF0000;">1학년 1학기</li>
    <li style="color:#0000FF;">1학년 2학기</li>
    <li style="color:#00FF00;">2학년 1학기</li>
    <li style="color:#FFFF00;">2학년 2학기</li>
  </ul>
</div>
'''
net.html = legend_html + net.html

# 물리적 레이아웃 설정
net.set_options('''
{
  "physics": {
    "enabled": true,
    "forceAtlas2Based": {
      "gravitationalConstant": -50,
      "centralGravity": 0.005,
      "springLength": 100,
      "springConstant": 0.04,
      "damping": 0.4,
      "avoidOverlap": 1
    },
    "solver": "forceAtlas2Based",
    "stabilization": {
      "enabled": true,
      "iterations": 2000,
      "updateInterval": 25
    }
  },
  "nodes": {
    "shape": "dot",
    "size": 25,
    "font": {
      "size": 14
    }
  },
  "edges": {
    "smooth": {
      "type": "continuous",
      "forceDirection": "none"
    },
    "arrows": {
      "to": {
        "enabled": true,
        "scaleFactor": 0.5
      }
    }
  }
}
''')

# 그래프 저장
output_file_html = "final_chapter_graph_with_legend.html"
net.write_html(output_file_html)
print(f"\nGraph with legend and clustering saved to {output_file_html}")


  from tqdm.autonotebook import tqdm, trange



Graph with legend and clustering saved to final_chapter_graph_with_legend.html


### 학년학기랑 영역구분 색상 없애고, json파일 저장
### json 파일에 소단원, 토픽단원까지 포함

In [10]:
import pandas as pd
import networkx as nx
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
from pyvis.network import Network
import json
import numpy as np

# Load your data
file_path = "merged_final_data_new.csv" 
data = pd.read_csv(file_path)

# Ensure required columns are present
required_columns = [
    "f_mchapter_id", "f_mchapter_nm", "f_subject_id", "성취기준코드", "성취기준 내용",
    "핵심키워드_v2", "성취수준 A", "성취수준 B", "성취수준 C", "f_schapter_id", "f_schapter_nm", "f_tchapter_id", "f_tchapter_nm"
]
for col in required_columns:
    if col not in data.columns:
        raise ValueError(f"Required column '{col}' is missing from the data")

# Preprocess data
data["f_mchapter_nm"] = data["f_mchapter_nm"].str.strip().str.replace(r"[^가-힣a-zA-Z0-9\\s]", "", regex=True)
data["bert_text"] = (
    data["성취기준 내용"].fillna("") + " " +
    data["성취수준 A"].fillna("") + " " +
    data["성취수준 B"].fillna("") + " " +
    data["성취수준 C"].fillna("")
)
data["tfidf_text"] = data["핵심키워드_v2"].fillna("")

# Sentence-BERT similarity
model = SentenceTransformer('all-MiniLM-L6-v2')
bert_embeddings = model.encode(data["bert_text"].tolist(), convert_to_tensor=True)
bert_sim = cosine_similarity(bert_embeddings.cpu(), bert_embeddings.cpu())

# TF-IDF similarity
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(data["tfidf_text"])
tfidf_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

# Combine similarities
final_sim = 0.8 * bert_sim + 0.2 * tfidf_sim

# 흐름 기반 엣지 생성
flow_edges = []
for i in range(len(data) - 1):
    current_row = data.iloc[i]
    next_row = data.iloc[i + 1]
    if (current_row["f_subject_id"] < next_row["f_subject_id"] or
        (current_row["f_subject_id"] == next_row["f_subject_id"] and
         current_row["성취기준코드"] < next_row["성취기준코드"])):
        flow_edges.append((current_row["f_mchapter_id"], next_row["f_mchapter_id"]))

# 유사도 기반 관계 생성
threshold = 0.85
similarity_edges = []
node_max_connections = {}
for i in range(len(data)):
    for j in range(i + 1, len(data)):
        if final_sim[i, j] >= threshold:
            source_id = data.iloc[i]["f_mchapter_id"]
            target_id = data.iloc[j]["f_mchapter_id"]
            if source_id != target_id:
                if source_id not in node_max_connections or node_max_connections[source_id][1] < final_sim[i, j]:
                    node_max_connections[source_id] = (target_id, final_sim[i, j])

for source_id, (target_id, weight) in node_max_connections.items():
    similarity_edges.append((source_id, target_id))

# Combine edges
final_edges = list(set(flow_edges + similarity_edges))

# 그래프 생성
graph = nx.DiGraph()

# 노드 추가
nodes_json = []
for idx, (chapter_id, chapter_name, subject_id, code, schapter_id, schapter_name, tchapter_id, tchapter_name) in enumerate(zip(
    data["f_mchapter_id"], data["f_mchapter_nm"], data["f_subject_id"], data["성취기준코드"],
    data["f_schapter_id"], data["f_schapter_nm"], data["f_tchapter_id"], data["f_tchapter_nm"]
)):
    graph.add_node(
        str(chapter_id),
        label=chapter_name,
        size=30,
        borderWidth=2
    )
    nodes_json.append({
        "id": chapter_id,
        "name": chapter_name,
        "f_subject_id": subject_id,
        "성취기준코드": code,
        "f_schapter_id": schapter_id,
        "f_schapter_name": schapter_name,
        "f_tchapter_id": tchapter_id,
        "f_tchapter_name": tchapter_name
    })

# 엣지 추가
edges_json = []
for source, target in final_edges:
    if str(source) in graph.nodes and str(target) in graph.nodes and source != target:
        graph.add_edge(
            str(source), str(target),
            color="#aaaaff",
            width=1,
            title="선후행 관계 및 유사도 기반 연결"
        )
        edges_json.append({
            "source": source,
            "target": target
        })

# PyVis 시각화
net = Network(height="100vh", width="100vw", directed=True, bgcolor="#ffffff", font_color="#000000")
net.from_nx(graph)

# 물리적 레이아웃 설정
net.set_options('''
{
  "physics": {
    "enabled": true,
    "forceAtlas2Based": {
      "gravitationalConstant": -50,
      "centralGravity": 0.005,
      "springLength": 100,
      "springConstant": 0.04,
      "damping": 0.4,
      "avoidOverlap": 1
    },
    "solver": "forceAtlas2Based",
    "stabilization": {
      "enabled": true,
      "iterations": 2000,
      "updateInterval": 25
    }
  },
  "nodes": {
    "shape": "dot",
    "size": 25,
    "font": {
      "size": 14
    }
  },
  "edges": {
    "smooth": {
      "type": "continuous",
      "forceDirection": "none"
    },
    "arrows": {
      "to": {
        "enabled": true,
        "scaleFactor": 0.5
      }
    }
  }
}
''')

# 그래프 저장
output_file_html = "final_chapter_graph_clustered5.html"
net.write_html(output_file_html)

# JSON 데이터 생성
graph_json = {
    "nodes": nodes_json,
    "edges": edges_json
}

# Ensure all non-serializable values are converted to serializable types
def convert_values_to_serializable(obj):
    if isinstance(obj, list):
        return [convert_values_to_serializable(item) for item in obj]
    elif isinstance(obj, dict):
        return {k: convert_values_to_serializable(v) for k, v in obj.items()}
    elif isinstance(obj, (np.int64, np.int32)):
        return int(obj)
    elif isinstance(obj, (np.float64, np.float32)):
        return float(obj)
    else:
        return obj

graph_json = convert_values_to_serializable(graph_json)

# JSON 파일로 저장
output_json_file = "graph_data1.json"
with open(output_json_file, "w", encoding="utf-8") as f:
    json.dump(graph_json, f, ensure_ascii=False, indent=4)

print(f"\nGraph data saved to {output_json_file}")
print(f"\nGraph with embedding-based clustering saved to {output_file_html}")



Graph data saved to graph_data1.json

Graph with embedding-based clustering saved to final_chapter_graph_clustered5.html


In [11]:
from pyvis.network import Network
import json

# JSON 파일 로드
input_json_file = "graph_data1.json"  # 생성된 JSON 파일 경로
with open(input_json_file, "r", encoding="utf-8") as f:
    graph_data = json.load(f)

# PyVis 네트워크 생성
net = Network(height="100vh", width="100vw", directed=True, bgcolor="#ffffff", font_color="#000000")

# 노드 추가
for node in graph_data["nodes"]:
    net.add_node(
        str(node["id"]),
        label=node["name"],
        size=30,
        borderWidth=2
    )

# 엣지 추가
for edge in graph_data["edges"]:
    net.add_edge(
        str(edge["source"]),
        str(edge["target"]),
        color="#aaaaff",
        width=1,
        title="선후행 관계 및 유사도 기반 연결"
    )

# 물리적 레이아웃 설정
net.set_options('''
{
  "physics": {
    "enabled": true,
    "forceAtlas2Based": {
      "gravitationalConstant": -50,
      "centralGravity": 0.005,
      "springLength": 100,
      "springConstant": 0.04,
      "damping": 0.4,
      "avoidOverlap": 1
    },
    "solver": "forceAtlas2Based",
    "stabilization": {
      "enabled": true,
      "iterations": 2000,
      "updateInterval": 25
    }
  },
  "nodes": {
    "shape": "dot",
    "size": 25,
    "font": {
      "size": 14
    }
  },
  "edges": {
    "smooth": {
      "type": "continuous",
      "forceDirection": "none"
    },
    "arrows": {
      "to": {
        "enabled": true,
        "scaleFactor": 0.5
      }
    }
  }
}
''')

# HTML 파일로 저장
output_html_file = "reconstructed_graph.html"
net.write_html(output_html_file)
print(f"\nGraph reconstructed and saved to {output_html_file}")



Graph reconstructed and saved to reconstructed_graph.html
