In [3]:
import pandas as pd
import json

# Load the CSV file.
# NOTE(review): read_csv uses the first CSV line as the header here, whereas a later
# cell in this notebook reads the same file with header=None and the csv-module
# cells walk it in blocks of 7 rows -- confirm that the default header and the
# 6-row stride used below really match the layout of the file.
file_path = 'Lunyu Complete Analysis.csv'
df = pd.read_csv(file_path)


def _cell_text(value):
    """Return a DataFrame cell as text, mapping missing values (NaN/None) to ""."""
    # str(float('nan')) is the literal "nan"; that must never reach the JSON output.
    return "" if pd.isna(value) else str(value)


# Accumulates one dict per set of rows
corpus_data = []

# Number of consecutive DataFrame rows that make up one set
rows_per_set = 6

# Visit only complete sets: the range stops before a trailing partial set.
for start_index in range(0, len(df) - rows_per_set + 1, rows_per_set):
    # Column 0 carries the per-set values: the id on the first row of the set
    # and the meaning on the fifth row.
    id_value = _cell_text(df.iloc[start_index, 0])
    meaning_value = _cell_text(df.iloc[start_index + 4, 0])

    # Columns 1.. carry one token each; a column counts as used only when the
    # "character" row (second row of the set) has a value in it.
    text_data = []
    for col_index in range(1, df.shape[1]):
        if pd.isna(df.iloc[start_index + 1, col_index]):
            continue
        text_data.append({
            "unicode": _cell_text(df.iloc[start_index, col_index]),
            "character": _cell_text(df.iloc[start_index + 1, col_index]),
            "pronunciation": _cell_text(df.iloc[start_index + 2, col_index]),
            "chinese": _cell_text(df.iloc[start_index + 3, col_index])
        })

    # Append the extracted set to the corpus data list
    corpus_data.append({
        "id": id_value,
        "metadata": {
            "meaning": meaning_value
        },
        "text": text_data
    })

# Wrap the list in the top-level JSON object
json_output = {
    "corpus_data": corpus_data
}

# Save the JSON data to a file (ensure_ascii=False keeps non-ASCII text readable)
output_path = 'Lunyu_Corpus.json'
with open(output_path, 'w', encoding='utf-8') as json_file:
    json.dump(json_output, json_file, ensure_ascii=False, indent=2)

print(f"JSON data has been saved to {output_path}")


JSON data has been saved to Lunyu_Corpus.json


In [6]:
import pandas as pd

# Load the CSV file.
# NOTE(review): the first CSV line is consumed as the header here, while a later
# cell in this notebook reads the same file with header=None -- confirm which
# reading is intended before relying on the combined output.
file_path = 'Lunyu Complete Analysis.csv'
df = pd.read_csv(file_path)

# Define the number of rows in a set
rows_per_set = 6

# Collect one flat record per complete set. The DataFrame is built once at the
# end instead of being grown with pd.concat on every iteration, which re-copies
# all previously accumulated rows each time (quadratic work).
combined_records = []
for start_index in range(0, len(df) - rows_per_set + 1, rows_per_set):
    # Lay the rows of the set side by side: row 0's cells, then row 1's, ...
    combined_row = []
    for row_offset in range(rows_per_set):
        combined_row.extend(df.iloc[start_index + row_offset].tolist())
    combined_records.append(combined_row)

# One row per set; columns are labelled 0..N-1 as before.
combined_df = pd.DataFrame(combined_records)




In [7]:
# Write combined_df (built in an earlier cell) to disk without the index column,
# then report where it went.
output_combined_csv_path = 'Combined_Lunyu.csv'
combined_df.to_csv(path_or_buf=output_combined_csv_path, index=False)

print(f"Combined CSV data has been saved to {output_combined_csv_path}")


Combined CSV data has been saved to Combined_Lunyu.csv


In [8]:
import pandas as pd

# Read the CSV without a header so that every line of the file is a data row.
file_path = 'Lunyu Complete Analysis.csv'
df = pd.read_csv(file_path, header=None)

# Number of consecutive rows that are merged into one output row
rows_per_set = 6
# NOTE(review): the head() output recorded for this cell shows the id, meaning and
# text values landing in different columns from one row to the next, which may mean
# the sets in the file are longer than 6 rows -- confirm the stride against the file.

# Only whole sets are merged; a trailing partial set is ignored.
complete_set_count = len(df) // rows_per_set

# For every set, concatenate the cells of its rows into one flat list.
combined_data = [
    [
        cell
        for row_offset in range(rows_per_set)
        for cell in df.iloc[first_row + row_offset].tolist()
    ]
    for first_row in range(0, complete_set_count * rows_per_set, rows_per_set)
]

# One DataFrame row per set
combined_df = pd.DataFrame(combined_data)

# Preview the first few combined rows
print(combined_df.head())

# Write the result without index or header so the file contains data only
output_combined_csv_path = '2Combined_Lunyu.csv'
combined_df.to_csv(path_or_buf=output_combined_csv_path, index=False, header=False)

print(f"Combined CSV data has been saved to {output_combined_csv_path}")


                    0       1     2    3    4     5     6       7      8    \
0               0316b1      NaN   NaN  NaN  NaN   NaN   NaN     NaN    NaN   
1                   NaN     NaN   NaN  NaN  NaN   NaN   NaN     NaN    NaN   
2  不為過矣。蓋有文子之質，再斯可矣。無文子     NaN   NaN  NaN  NaN   NaN   NaN     NaN    NaN   
3                     如       質    無。    三    思     非    則；       不     可。   
4                  ljɨ1  lhjwɨ2  ŋwu2  kụ2  ku1  mji1  źwe1  nioow1  źier1   

    9    ...   98     99    100   101     102  103   104    105  106  107  
0   NaN  ...   NaN    NaN   NaN   NaN     NaN  NaN   NaN    NaN  NaN  NaN  
1   NaN  ...     質     有。     再    則；       可   也。   [文]    [子]  NaN  NaN  
2   NaN  ...  lew1  goor1   no2   sọ1  sjiij2  do2   ku1  ljɨɨ1  NaN  NaN  
3   孔　子  ...     𗖣      𘃞     𘍞     𘓐       𗖵    𘘝     𘃡      𗡶  NaN  NaN  
4  ljɨ1  ...  4925   4612  3433  3849    4859  197  4310   1718  NaN  NaN  

[5 rows x 108 columns]
Combined CSV data has been saved to 2Combined_Lunyu

In [9]:
# Display combined_df (built in an earlier cell) as this cell's output.
combined_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,98,99,100,101,102,103,104,105,106,107
0,0316b1,,,,,,,,,,...,,,,,,,,,,
1,,,,,,,,,,,...,質,有。,再,則；,可,也。,[文],[子],,
2,不為過矣。蓋有文子之質，再斯可矣。無文子,,,,,,,,,,...,lew1,goor1,no2,sọ1,sjiij2,do2,ku1,ljɨɨ1,,
3,如,質,無。,三,思,非,則；,不,可。,孔　子,...,𗖣,𘃞,𘍞,𘓐,𗖵,𘘝,𘃡,𗡶,,
4,ljɨ1,lhjwɨ2,ŋwu2,kụ2,ku1,mji1,źwe1,nioow1,źier1,ljɨ1,...,4925,4612,3433,3849,4859,197,4310,1718,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
430,,人,君,天,於,則；,猶,{子,父,之於},...,𗖵,𘎪,𗌭,𘓺,𗂸,𘘥,𘕋,𘂬,𗆪,
431,,bju1,tshjiij1,ku1,mə1,zjɨ̣1,·jɨ2,wo2,bju1,tshjiij1,...,460,1553,1491,3125,3612,1824,972,4853,2778,
432,,𘖑,𘜕,𗅋,𗹑,𘖑,𘜕,𘟂,𘓺,𗂸,...,,,,,,,,,,
433,,3510,4979,3687,4391,1824,2386,4329,5615,1348,...,,,,,,,,,,


In [11]:
import csv

def transform_csv(input_file, output_file):
    """Merge the row sets of ``input_file`` side by side into six output rows.

    The input is read row by row. The k-th non-empty row of each set is appended
    to the k-th output row. A row whose cells are all empty starts a new set, and
    after six rows the position wraps back to the first output row.

    Args:
        input_file: Path of the UTF-8 CSV file to read.
        output_file: Path of the UTF-8 CSV file to write (overwritten).
    """
    merged_rows = [[] for _ in range(6)]
    slot = 0

    with open(input_file, newline='', encoding='utf-8') as infile, \
            open(output_file, 'w', newline='', encoding='utf-8') as outfile:
        for record in csv.reader(infile):
            if any(record):
                merged_rows[slot] += record
                # Each set has six rows, so wrap around after the sixth one.
                slot = (slot + 1) % 6
            else:
                # Empty row: the next row is the first row of a new set.
                slot = 0

        # Write out the six merged rows.
        csv.writer(outfile).writerows(merged_rows)

# Example usage: run transform_csv on the full analysis CSV.
input_file, output_file = 'Lunyu Complete Analysis.csv', 'output2.csv'
transform_csv(input_file=input_file, output_file=output_file)

print(f"Transformed CSV saved to {output_file}")


Transformed CSV saved to output2.csv


In [12]:
import csv
import json

def csv_to_json(input_file, output_file):
    """Convert a column-block CSV into the corpus JSON structure.

    The file is read as a grid in which every group of four adjacent columns is
    one set. Within a set, the first column of row 0 is the id and the first
    column of row 5 is the meaning; rows 1-4 give, for each of the four columns,
    the "unicode", "character", "pronunciation" and "chinese" values.

    The number of sets is taken from the width of the first row. Cells that do
    not exist (empty file, fewer than six rows, short rows) are read as "" rather
    than raising IndexError, so an empty file produces an empty corpus.

    Args:
        input_file: Path of the UTF-8 CSV file to read.
        output_file: Path of the JSON file to write (overwritten).
    """
    columns_per_set = 4

    with open(input_file, newline='', encoding='utf-8') as infile:
        rows = list(csv.reader(infile))

    def cell(row_number, column):
        """Return the grid cell at (row_number, column), or "" if it is absent."""
        if row_number < len(rows) and column < len(rows[row_number]):
            return rows[row_number][column]
        return ""

    # The first row decides how many complete four-column sets there are.
    width = len(rows[0]) if rows else 0
    num_sets = width // columns_per_set

    # List holding the JSON entries, one per set
    corpus_data = []
    for set_number in range(num_sets):
        # Index of the set's first column (the one carrying id and meaning)
        idx = set_number * columns_per_set

        corpus_data.append({
            "id": cell(0, idx),
            "metadata": {
                "meaning": cell(5, idx)  # the last row holds the meaning
            },
            "text": [
                {
                    "unicode": cell(1, idx + offset),
                    "character": cell(2, idx + offset),
                    "pronunciation": cell(3, idx + offset),
                    "chinese": cell(4, idx + offset)
                }
                for offset in range(columns_per_set)
            ]
        })

    # Save as a JSON file
    with open(output_file, 'w', encoding='utf-8') as outfile:
        json.dump({"corpus_data": corpus_data}, outfile, ensure_ascii=False, indent=4)

# Example usage: convert the test CSV into 'output.json'.
input_file, output_file = 'test_row_data.csv', 'output.json'
csv_to_json(input_file=input_file, output_file=output_file)

print(f"Transformed JSON saved to {output_file}")


Transformed JSON saved to output.json


In [14]:
import csv
import json

def csv_to_json(input_file, output_file):
    """Convert a CSV made of 7-row blocks into the corpus JSON structure.

    Each block of seven consecutive rows is read by position:

        row 0: id (first cell)
        row 1: "unicode" values, one per column
        row 2: "character" values
        row 3: "pronunciation" values
        row 4: "chinese" values
        row 5: meaning (first cell)
        row 6: separator (ignored)

    Rows are identified purely by their position in the block. An all-empty row
    inside a block (for example a set that has no meaning text) therefore no
    longer ends the set early and drops its text entries; it simply yields empty
    values. A final block that is cut short is padded with empty rows, and blocks
    without any content are skipped.

    Args:
        input_file: Path of the UTF-8 CSV file to read.
        output_file: Path of the JSON file to write (overwritten).
    """
    rows_per_block = 7  # six data rows plus the separator row
    data_rows = 6

    with open(input_file, newline='', encoding='utf-8') as infile:
        rows = list(csv.reader(infile))

    # List holding the JSON entries, one per set
    corpus_data = []

    for start in range(0, len(rows), rows_per_block):
        block = rows[start:start + data_rows]
        if not any(any(row) for row in block):
            continue  # nothing but empty rows: not a set

        # Pad a truncated final block so the unpacking below always works.
        block += [[]] * (data_rows - len(block))
        (id_row, unicode_list, character_list,
         pronunciation_list, chinese_list, meaning_row) = block

        corpus_data.append({
            "id": id_row[0] if id_row else "",
            "metadata": {"meaning": meaning_row[0] if meaning_row else ""},
            # One text entry per column; zip stops at the shortest of the four rows.
            "text": [
                {
                    "unicode": u,
                    "character": c,
                    "pronunciation": p,
                    "chinese": ch
                }
                for u, c, p, ch in zip(unicode_list, character_list,
                                       pronunciation_list, chinese_list)
            ]
        })

    # Save as a JSON file
    with open(output_file, 'w', encoding='utf-8') as outfile:
        json.dump({"corpus_data": corpus_data}, outfile, ensure_ascii=False, indent=4)

# Example usage: convert the full analysis CSV into 'output3.json'.
input_file, output_file = 'Lunyu Complete Analysis.csv', 'output3.json'
csv_to_json(input_file=input_file, output_file=output_file)

print(f"Transformed JSON saved to {output_file}")


Transformed JSON saved to output3.json


In [15]:
import csv
import json

def csv_to_json(input_file, output_file):
    """Convert a CSV made of 7-row blocks into corpus JSON, dropping empty entries.

    Each block of seven consecutive rows is read by position: row 0 gives the id
    (first cell), rows 1-4 give the per-column "unicode", "character",
    "pronunciation" and "chinese" values, row 5 gives the meaning (first cell)
    and row 6 is a separator that is ignored. Text entries whose four values are
    all empty are left out of the result.

    Rows are identified purely by their position in the block, so an all-empty
    row inside a block (for example a set without meaning text) yields empty
    values instead of ending the set early and discarding its text entries. A
    final block that is cut short is padded with empty rows, and blocks without
    any content are skipped.

    Args:
        input_file: Path of the UTF-8 CSV file to read.
        output_file: Path of the JSON file to write (overwritten).
    """
    rows_per_block = 7  # six data rows plus the separator row
    data_rows = 6

    with open(input_file, newline='', encoding='utf-8') as infile:
        rows = list(csv.reader(infile))

    def first_cell(row):
        """Return the first cell of ``row``, or "" when the row has no cells."""
        return row[0] if row else ""

    corpus_data = []  # list holding the JSON entries, one per set

    for start in range(0, len(rows), rows_per_block):
        block = rows[start:start + data_rows]
        if not any(any(row) for row in block):
            continue  # nothing but empty rows: not a set

        # Pad a truncated final block so every positional row exists.
        block += [[]] * (data_rows - len(block))
        id_row, meaning_row = block[0], block[5]

        text_entries = []
        for u, c, p, ch in zip(block[1], block[2], block[3], block[4]):
            entry = {
                "unicode": u,
                "character": c,
                "pronunciation": p,
                "chinese": ch
            }
            # Exclude entries in which every field is empty.
            if any(entry.values()):
                text_entries.append(entry)

        corpus_data.append({
            "id": first_cell(id_row),
            "metadata": {"meaning": first_cell(meaning_row)},
            "text": text_entries
        })

    with open(output_file, 'w', encoding='utf-8') as outfile:
        json.dump({"corpus_data": corpus_data}, outfile, ensure_ascii=False, indent=4)

# Example usage: convert the full analysis CSV into 'output4.json'.
input_file, output_file = 'Lunyu Complete Analysis.csv', 'output4.json'
csv_to_json(input_file=input_file, output_file=output_file)

print(f"Transformed JSON saved to {output_file}")


Transformed JSON saved to output4.json
