In [1]:
import os
import zipfile
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Folder layout of the Bossa public market data, per data source (omega / mstock)
# and sampling frequency (intraday / daily). The literals are raw strings, so the
# doubled backslashes are kept verbatim in the resulting paths.
# `intreday_omega` keeps its (misspelled) name because a later cell refers to it.
# NOTE(review): `intraday_mstock` points at the folder that holds the zipped "fut"
# archives, while the unzipped files are written to "intraday_unzip\\mstock" --
# confirm which one the daily aggregation is meant to read.
_pub_root = r"C:\\physics_masters\\data\\Bossa public market data\\pub"

intreday_omega = _pub_root + r"\\intraday\\omega"
daily_omega = _pub_root + r"\\daily\\omega"

intraday_mstock = _pub_root + r"\\intraday\\mstock"
daily_mstock = _pub_root + r"\\daily\\mstock"

# Unzip

In [33]:
# Extract every ZIP archive found in the mstock futures folder into one flat
# output folder.
data_dir = r"C:\\physics_masters\\data\\Bossa public market data\\pub\\intraday\\mstock\\fut"
output_dir = r"C:\\physics_masters\\data\\Bossa public market data\\pub\\intraday_unzip\\mstock"

for filename in os.listdir(data_dir):
    zip_path = os.path.join(data_dir, filename)

    # os.listdir also returns sub-folders and stray non-archive files; opening
    # those with ZipFile would abort the whole loop, so skip them.
    if not zipfile.is_zipfile(zip_path):
        continue

    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall(output_dir)

In [None]:
# Additional processing for mstock: remove the second column and add a header row.
# The files are rewritten in place (data_dir and output_dir are the same folder),
# so files that already start with the header are skipped; this keeps the cell
# safe to re-run.

data_dir = r"C:\\physics_masters\\data\\Bossa public market data\\pub\\intraday_unzip\\mstock"
output_dir = r"C:\\physics_masters\\data\\Bossa public market data\\pub\\intraday_unzip\\mstock"

os.makedirs(output_dir, exist_ok=True)

header = ["Name", "Date", "Time", "Open", "High", "Low", "Close", "Volume", "Oi"]
header_line = ",".join(header)

for filename in os.listdir(data_dir):
    if not filename.endswith(".prn"):
        continue
    file_path = os.path.join(data_dir, filename)

    # A file whose first line is the header was converted by an earlier run.
    # Processing it again would drop the "Date" column and then fail on the
    # header-length mismatch.
    with open(file_path, "r") as handle:
        already_processed = handle.readline().strip() == header_line
    if already_processed:
        continue

    df = pd.read_csv(file_path, delimiter=',', header=None)
    df = df.drop(columns=[1])  # Remove the second column
    df.to_csv(os.path.join(output_dir, filename), index=False, header=header)

# Create daily data

In [2]:
def create_daily_data(data_dir, output_dir, summary_dir=None):
    """Aggregate intraday bar files into one daily file per asset.

    Every regular file in ``data_dir`` is read as a comma-separated table with
    a header row, its ``Date`` column is parsed as ``%Y%m%d``, and the rows are
    grouped by date. The daily table is written to ``output_dir`` under the same
    file name. Afterwards a tab-separated ``<name>_summary.txt`` with the number
    of daily rows per asset is written, where ``<name>`` is the last path
    component of ``output_dir``.

    Parameters
    ----------
    data_dir : str
        Folder containing the intraday files. Sub-folders are ignored.
    output_dir : str
        Folder receiving the daily files; created if it does not exist.
    summary_dir : str, optional
        Folder receiving the summary file. When omitted, the original
        hard-coded data root is used.

    Returns
    -------
    None
    """
    # "first"/"last" pick the first/last row of each day in file order.
    # NOTE(review): this assumes the rows of each file are in chronological
    # order -- confirm against the raw data.
    # NOTE(review): "Oi" is summed over the day's bars; if it is open interest
    # (a level rather than a flow), "last" may be what is intended -- confirm.
    aggregations = {
        "Open": "first",
        "High": "max",
        "Low": "min",
        "Close": "last",
        "Volume": "sum",
        "Oi": "sum",
    }

    os.makedirs(output_dir, exist_ok=True)

    day_counts = {}
    # Sorted so that the summary rows come out in a reproducible order.
    for filename in sorted(os.listdir(data_dir)):
        path = os.path.join(data_dir, filename)
        if not os.path.isfile(path):
            # read_csv cannot open a directory; skip instead of crashing.
            continue

        intraday = pd.read_csv(path, delimiter=',', header=0)
        intraday["Date"] = pd.to_datetime(intraday["Date"], format="%Y%m%d")

        daily = intraday.groupby("Date").agg(aggregations)
        daily.to_csv(os.path.join(output_dir, filename))
        day_counts[filename.replace('.prn', '')] = len(daily)

    summary = pd.DataFrame(list(day_counts.items()), columns=['Asset', 'TotalDays'])

    # Last path component, accepting either separator and a trailing one.
    # Computed outside the f-string because a backslash inside an f-string
    # expression is a SyntaxError before Python 3.12.
    tag = output_dir.replace("\\", "/").rstrip("/").split("/")[-1]
    summary_name = f"{tag}_summary.txt"
    if summary_dir is None:
        summary_path = r"C:\\physics_masters\\data\\" + summary_name
    else:
        summary_path = os.path.join(summary_dir, summary_name)
    summary.to_csv(summary_path, sep='\t', index=False)
    

In [3]:
# Build the daily files for both data sources: (intraday source, daily target).
# NOTE(review): `intraday_mstock` is the "intraday\\mstock" folder, not the
# "intraday_unzip\\mstock" folder the unzip step writes to -- confirm it holds
# the processed .prn files.
for source_dir, target_dir in [
    (intreday_omega, daily_omega),
    (intraday_mstock, daily_mstock),
]:
    create_daily_data(source_dir, target_dir)

In [None]:
# Info about the daily data: first/last date, number of rows and an estimated
# weekday count per asset. One tab-separated summary is written per source folder.
# It is written to the same "<name>_summary.txt" path that create_daily_data
# writes for that folder, so it replaces that file.

for data_dir in [daily_omega, daily_mstock]:
    assets_info = []
    for filename in os.listdir(data_dir):
        path = os.path.join(data_dir, filename)

        df = pd.read_csv(path, delimiter=',', header=0, parse_dates=['Date'])

        # `parse_dates` already yields datetimes; convert once (a no-op then)
        # instead of re-parsing the column for every field below.
        dates = pd.to_datetime(df['Date'])
        first_day = dates.min()
        last_day = dates.max()

        assets_info.append([
            filename.replace('.prn', '').ljust(10),
            first_day.strftime('%Y-%m-%d'),
            last_day.strftime('%Y-%m-%d'),
            len(df),
            # Calendar-day span scaled by 5/7 -- presumably a rough estimate of
            # the number of weekdays in the covered period.
            (last_day - first_day).days * (5 / 7),
        ])

    df = pd.DataFrame(assets_info, columns=['Asset',  'DateMin', 'DateMax', 'TotalDays','DateDiffDays'])
    # Taken out of the f-string: a backslash inside an f-string expression is a
    # SyntaxError before Python 3.12.
    summary_name = data_dir.split("\\")[-1]
    df.to_csv(r'C:\\physics_masters\\data\\' + f'{summary_name}_summary.txt', sep='\t', index=False)



# others


In [11]:
def get_dataframes(data_dir):
    """Load every file in ``data_dir`` into a DataFrame.

    Each file is read as a comma-separated table with a header row, and its
    ``Date`` column is parsed as dates.

    Parameters
    ----------
    data_dir : str
        Folder whose entries are all read with ``pd.read_csv``.

    Returns
    -------
    dict
        Maps each file name with ``'.prn'`` removed to its DataFrame.
    """
    return {
        entry.replace('.prn', ''): pd.read_csv(
            os.path.join(data_dir, entry),
            delimiter=',',
            header=0,
            parse_dates=['Date'],
        )
        for entry in os.listdir(data_dir)
    }