## **1. compound에 대한 SMILES 정보 불러와서 하나의 테이블로 병합**

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
compound_path = '/content/drive/MyDrive/compound.xlsx'
content_path = '/content/drive/MyDrive/Content_new.xlsx'

In [None]:
import pandas as pd

# 파일 읽기
df_compound = pd.read_excel(compound_path)
df_content = pd.read_excel(content_path)

# 데이터 일부 확인
print("📄 compound.xlsx 미리보기")
display(df_compound.head())

print("\n📄 Content_new.xlsx 미리보기")
display(df_content.head())


In [None]:
df_compound.rename(columns={'id': 'source_id'}, inplace=True)
df_compound.head()

In [None]:
# 병합 전에 name 컬럼이 있다면 삭제
if 'moldb_smiles' in df_content.columns:
    df_content = df_content.drop(columns=['moldb_smiles'])

# 병합
df_content = df_content.merge(df_compound[['source_id', 'moldb_smiles']], on='source_id', how='left')
df_content.head()

In [None]:
df_content.to_excel('content_combined_smiles.xlsx', index=False)

In [None]:
from google.colab import files
files.download('content_combined_smiles.xlsx')

## **2. 데이터 전처리**

In [None]:
# 데이터 불러오기
import pandas as pd
df_content = pd.read_excel('content_combined_smiles.xlsx')

#### `orig_content` 에 대한 단위 `mg/100g`로 통일

In [None]:
unique_unit = df_content['orig_unit'].dropna().unique()
num_unit = len(unique_unit)
print(unique_unit)

['mg/100g' 'kcal/100g' 'RE' '慣-TE' 'NE' 'IU' '쨉g' 'ug/g' 'uM']


In [None]:
unit_content_map = df_content.groupby('orig_unit')['orig_content'].unique()

for unit, contents in unit_content_map.items():
    print(f"단위: {unit}")
    print(f"해당 orig_content 값들: {len(contents)}\n")

단위: IU
해당 orig_content 값들: 1313

단위: NE
해당 orig_content 값들: 430

단위: RE
해당 orig_content 값들: 737

단위: kcal/100g
해당 orig_content 값들: 2682

단위: mg/100g
해당 orig_content 값들: 29509

단위: uM
해당 orig_content 값들: 1

단위: ug/g
해당 orig_content 값들: 3

단위: 慣-TE
해당 orig_content 값들: 230

단위: 쨉g
해당 orig_content 값들: 1



In [None]:
df_mg_100g = df_content[df_content['orig_unit'] == 'mg/100g']
unique_unit = df_mg_100g['orig_unit'].unique()
num_unit = len(unique_unit)
print(num_unit)

1


#### 분석 시 사용하고자 하는 칼럼들인 `orig_food_common_name`, `moldb_smiles`, `orig_content`에 대해서 결측치 필터링

In [None]:
# moldb_smiles가 NaN인 행 제거
df_clean = df_mg_100g.copy()
# orig_food_common_name, orig_content, orig_unit Nan 행들 제거
df_clean = df_clean[df_clean['orig_food_common_name'].notna()]
df_clean = df_clean[df_clean['moldb_smiles'].notna()]
df_clean = df_clean[df_clean['orig_content'].notna()]
df_clean = df_clean[df_clean['orig_unit'].notna()]
df_clean['orig_food_common_name'] = df_clean['orig_food_common_name'].str.split(',').str[0].str.strip()

print(len(df_clean))
df_clean.head()

433462


Unnamed: 0,id,source_id,source_type,food_id,orig_food_id,orig_food_common_name,orig_food_part,orig_content,orig_unit,name,moldb_smiles
19456,21071,4,Nutrient,2,674,Cabbage,,100.0,mg/100g,Cyanidin 3-(6''-acetyl-galactoside),[H][C@]1(COC(C)=O)O[C@@]([H])(OC2=CC3=C(O)C=C(...
19457,21072,4,Nutrient,4,465,Kiwi fruit,,800.0,mg/100g,Cyanidin 3-(6''-acetyl-galactoside),[H][C@]1(COC(C)=O)O[C@@]([H])(OC2=CC3=C(O)C=C(...
19458,21073,4,Nutrient,4,9148,Kiwifruit,,520.0,mg/100g,Cyanidin 3-(6''-acetyl-galactoside),[H][C@]1(COC(C)=O)O[C@@]([H])(OC2=CC3=C(O)C=C(...
19459,21074,4,Nutrient,4,9445,Kiwifruit,,560.0,mg/100g,Cyanidin 3-(6''-acetyl-galactoside),[H][C@]1(COC(C)=O)O[C@@]([H])(OC2=CC3=C(O)C=C(...
19460,21075,4,Nutrient,5,148,Onion,,300.0,mg/100g,Cyanidin 3-(6''-acetyl-galactoside),[H][C@]1(COC(C)=O)O[C@@]([H])(OC2=CC3=C(O)C=C(...


In [None]:
df_clean.to_excel('foodb_final.xlsx', index=False)
from google.colab import files
files.download('foodb_final.xlsx')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

## **3. food-compound 그래프 생성**

In [None]:
# 데이터 불러오기
import pandas as pd
df_content = pd.read_excel('foodb_final.xlsx')

In [None]:
# 데이터 처리 및 네트워크 분석을 위한 라이브러리 불러오기
import networkx as nx
import pandas as pd
import os
from tqdm import tqdm
from collections import defaultdict

#### food 노드와 compound(SMILES) 노드를 함유량 가중치 엣지로 연결한 이분 그래프 생성

In [None]:
def create_food_molecule_graph(df_clean, output_dir='graph_data'):

    os.makedirs(output_dir, exist_ok=True)
    # 그래프 생성
    G = nx.Graph()

    # 음식 노드 추가
    food_nodes = set(df_clean['orig_food_common_name'])
    print(f"음식 노드 수: {len(food_nodes)}")
    for food in food_nodes:
        G.add_node(food, node_type='food')

    # 분자 노드 추가
    molecule_nodes = set(df_clean['moldb_smiles'])
    print(f"분자 노드 수: {len(molecule_nodes)}")
    for smiles in molecule_nodes:
        G.add_node(str(smiles), node_type='molecule')

    # 엣지 추가
    edge_weights = defaultdict(float)
    edge_counts = defaultdict(int)
    # 데이터 프레임을 순회하며 가중치(compound 함유량) 수집
    for _, row in tqdm(df_clean.iterrows(), total=len(df_clean)):
        food = row['orig_food_common_name']
        smiles = str(row['moldb_smiles'])
        edge_key = (food, smiles)
        content_value = float(row['orig_content'])
        edge_weights[edge_key] += content_value
        edge_counts[edge_key] += 1

    for edge_key, total_weight in edge_weights.items():
        food, smiles = edge_key
        avg_weight = total_weight / edge_counts[edge_key]
        G.add_edge(food, smiles, weight=avg_weight)

    print(f"추가된 엣지 수: {len(edge_weights)}")

    # 노드를 TSV로 저장
    nodes_data = []
    for node, attrs in G.nodes(data=True):
        node_data = {'id': node, 'node_type': attrs.get('node_type', '')}
        nodes_data.append(node_data)

    nodes_df = pd.DataFrame(nodes_data)
    nodes_tsv_path = os.path.join(output_dir, 'food_nodes.tsv')
    nodes_df.to_csv(nodes_tsv_path, sep='\t', index=False)
    print(f"노드가 {nodes_tsv_path}에 저장되었습니다")

    # 엣지를 TSV로 저장
    edges_data = []
    for u, v, attrs in G.edges(data=True):
        edge_data = {
            'source': u,
            'target': v,
            'weight': attrs.get('weight', 1.0)
        }
        edges_data.append(edge_data)

    edges_df = pd.DataFrame(edges_data)
    edges_tsv_path = os.path.join(output_dir, 'food_edges.tsv')
    edges_df.to_csv(edges_tsv_path, sep='\t', index=False)
    print(f"엣지가 {edges_tsv_path}에 저장되었습니다")

    return G

create_food_molecule_graph(df_clean, output_dir='graph_data')

음식 노드 수: 1641
분자 노드 수: 1591


100%|██████████| 433462/433462 [00:19<00:00, 22307.82it/s]


추가된 엣지 수: 79357
노드가 graph_data/nodes.tsv에 저장되었습니다
엣지가 graph_data/edges.tsv에 저장되었습니다


<networkx.classes.graph.Graph at 0x7c3d1074e6d0>