In [18]:
import os
import pandas as pd
from datetime import datetime
import pickle
import chardet
import pickle

# Display options
pd.set_option('display.expand_frame_repr', False) # keep wide DataFrame output on one line instead of wrapping it across multiple lines

def detect_encoding(file_path):
    """Guess the text encoding of a file using chardet.

    The whole file is read as raw bytes and handed to ``chardet.detect``.

    Args:
        file_path: Path of the file to inspect.

    Returns:
        The value stored under the ``'encoding'`` key of chardet's result.
        NOTE(review): chardet presumably reports ``None`` here when it cannot
        decide on an encoding -- confirm against the chardet documentation.
    """
    with open(file_path, 'rb') as raw_file:
        raw_bytes = raw_file.read()
    detection = chardet.detect(raw_bytes)
    return detection['encoding']

def load_data(parent_folder):
    """Load every CSV found one level below ``parent_folder`` into one frame.

    Entries of ``parent_folder`` are visited in sorted name order; only
    directories are considered. Inside each directory, names ending in
    ``'.csv'`` are read in sorted order, each with the encoding reported by
    ``detect_encoding`` for that file.

    Args:
        parent_folder: Directory whose immediate subdirectories hold CSV files.

    Returns:
        A single DataFrame with all rows concatenated under a fresh integer
        index, or an empty DataFrame when no CSV file was found.
    """
    frames = []

    for entry in sorted(os.listdir(parent_folder)):
        subfolder_path = os.path.join(parent_folder, entry)
        # CSV files sitting directly in parent_folder are not picked up.
        if not os.path.isdir(subfolder_path):
            continue

        for name in sorted(os.listdir(subfolder_path)):
            if not name.endswith('.csv'):
                continue
            csv_path = os.path.join(subfolder_path, name)
            csv_encoding = detect_encoding(csv_path)
            frames.append(pd.read_csv(csv_path, encoding=csv_encoding))

    if not frames:
        return pd.DataFrame()
    return pd.concat(frames, ignore_index=True)

def save_pickle(df, file_path):
    """Serialize ``df`` with pickle and write it to ``file_path``.

    The target file is opened in binary write mode, so any existing file at
    that path is overwritten.

    Args:
        df: Object to serialize (a DataFrame in this script).
        file_path: Destination path of the pickle file.
    """
    with open(file_path, 'wb') as sink:
        pickle.Pickler(sink).dump(df)

def load_pickle(file_path):
    """Read and return the object pickled in ``file_path``.

    Only use this on files you trust: unpickling can execute arbitrary code.

    Args:
        file_path: Path of the pickle file to read.

    Returns:
        The deserialized object.
    """
    with open(file_path, 'rb') as source:
        restored = pickle.Unpickler(source).load()
    return restored

parent_folder = 'add'
pickle_file = 'data.pickle'

# Start from the previously saved data when the pickle exists, otherwise from an empty frame.
df = load_pickle(pickle_file) if os.path.exists(pickle_file) else pd.DataFrame()

new_data = load_data(parent_folder)

if new_data.empty:
    print("No new data to add.")
else:
    print("Before adding new data:")
    print(f"Shape: {df.shape}", f"Columns: {df.columns}\n", sep="\n")

    df = pd.concat([df, new_data], ignore_index=True)
    # Keep the first row seen for each '日時' value, order rows by '日時',
    # then renumber the index from zero before persisting.
    df = (
        df.drop_duplicates(subset=['日時'])
        .sort_values(by=['日時'])
        .reset_index(drop=True)
    )
    save_pickle(df, pickle_file)

    print("After adding new data:")
    print(f"Shape: {df.shape}", f"Columns: {df.columns}\n", sep="\n")

# Final report on the in-memory frame (equal to what was saved when new data was merged).
print("Summary of data.pickle:")
print(f"Shape: {df.shape}", f"Columns: {df.columns}\n", sep="\n")

print("Head of the loaded DataFrame:")
print(df.head())

Before adding new data:
Shape: (1114013, 9)
Columns: Index(['日時', '始値(BID)', '高値(BID)', '安値(BID)', '終値(BID)', '始値(ASK)', '高値(ASK)',
       '安値(ASK)', '終値(ASK)'],
      dtype='object')

After adding new data:
Shape: (1456560, 9)
Columns: Index(['日時', '始値(BID)', '高値(BID)', '安値(BID)', '終値(BID)', '始値(ASK)', '高値(ASK)',
       '安値(ASK)', '終値(ASK)'],
      dtype='object')

Summary of data.pickle:
Shape: (1456560, 9)
Columns: Index(['日時', '始値(BID)', '高値(BID)', '安値(BID)', '終値(BID)', '始値(ASK)', '高値(ASK)',
       '安値(ASK)', '終値(ASK)'],
      dtype='object')

Head of the loaded DataFrame:
             日時  始値(BID)  高値(BID)  安値(BID)  終値(BID)  始値(ASK)  高値(ASK)  安値(ASK)  終値(ASK)
0  201901020800   2507.9   2508.7   2506.3   2508.7   2508.2   2509.1   2506.6   2509.0
1  201901020801   2508.7   2512.0   2508.5   2512.0   2509.0   2512.6   2509.0   2512.3
2  201901020802   2512.0   2516.8   2511.5   2516.0   2512.3   2517.4   2511.8   2516.5
3  201901020803   2516.2   2516.4   2514.9   2514.9   2516.5   2