# Part 1. Data Cleaning

### The data covers minute-level information for around 400 stocks from 2023/4/1 to 2025/4/1.

### Check the exact number of CSV files we have

In [1]:
import os

# List the project folder once, then count the entries that look like CSV files.
files = os.listdir(r'C:\Users\Username\OneDrive\Desktop\810project')
csv_count = sum(1 for name in files if name.endswith('.csv'))
print(f"Total csv files: {csv_count}")

Total csv files: 429


### Merge all CSV files into one for the convenience of the following steps

In [None]:
import os

import pandas as pd

path = r'C:\Users\Username\OneDrive\Desktop\810project'
merged_csv_name = "merged_raw.csv"

# The merged output is written into the same folder as the per-ticker inputs,
# so it must be skipped here: otherwise re-running this cell would read the
# previous merge back in as a bogus ticker named "merged_raw" and duplicate
# every row.
csv_files = [
    f for f in os.listdir(path)
    if f.endswith(".csv") and f != merged_csv_name
]

# Read each per-ticker file and tag its rows with the ticker symbol taken
# from the file name.
all_data = []
for file in csv_files:
    df = pd.read_csv(os.path.join(path, file))
    # splitext removes only the trailing extension, whereas
    # str.replace(".csv", "") would also strip ".csv" appearing mid-name.
    ticker = os.path.splitext(file)[0]
    df["ticker"] = ticker
    all_data.append(df)

# Stack all tickers into one long frame with a fresh 0..n-1 index and persist it.
merged_df = pd.concat(all_data, ignore_index=True)
merged_df.to_csv(os.path.join(path, merged_csv_name), index=False)

### CSV is too slow, so convert to Parquet to improve efficiency

In [3]:
# Persist the merged frame as Parquet (without the index), then load it back
# so that later cells work from the Parquet copy.
raw_parquet_path = r'C:\Users\Username\OneDrive\Desktop\810project\merged_raw.parquet'

merged_df.to_parquet(raw_parquet_path, index=False)
df_par = pd.read_parquet(raw_parquet_path)

### Remember to delete this section before submission: it artificially injects missing values

In [None]:
import numpy as np

# Seed NumPy's global RNG so the injected gaps are reproducible
# (DataFrame.sample without random_state draws from this global state too).
np.random.seed(42)

# Columns that may receive artificial gaps ('t' or 'n' could be added as well).
target_cols = ['o', 'h', 'l', 'c', 'v', 'vw']

# NOTE(review): an earlier comment here said "1% of rows", but frac=0.1
# samples 10% of the rows. The code is left unchanged -- confirm which
# fraction was intended.
row_indices = df_par.sample(frac=0.1).index

# For every sampled row pick how many columns to blank out: 1, 2 or 3.
n_missing = np.random.randint(1, 4, size=len(row_indices))

# Give each sampled row an independent random permutation of 0..len(cols)-1
# (argsort of uniform noise). Blanking the cells whose rank is below that
# row's n_missing selects exactly n_missing distinct columns uniformly at
# random -- the same distribution as a per-row np.random.choice(...,
# replace=False), but without a Python-level loop over millions of rows.
# The concrete cells chosen differ from what the per-row loop would draw.
ranks = np.random.random((len(row_indices), len(target_cols))).argsort(axis=1)
null_mask = ranks < n_missing[:, None]

# .mask() replaces the flagged cells with NaN; the assignment back through
# .loc aligns on the row labels.
df_par.loc[row_indices, target_cols] = (
    df_par.loc[row_indices, target_cols].mask(null_mask)
)


In [None]:
# Save the frame with the injected gaps. index=False matches the other
# to_parquet calls in this notebook and guarantees that no index column is
# persisted even if the frame's index is not a plain RangeIndex.
df_par.to_parquet(r'C:\Users\Username\OneDrive\Desktop\810project\mermiss_raw.parquet', index=False)


In [4]:
import pandas as pd

# Start from the saved frame that contains the injected missing values.
df_par = pd.read_parquet(
    path=r'C:\Users\Username\OneDrive\Desktop\810project\mermiss_raw.parquet'
)

### Simply view the data

In [11]:
# Dimensions of the loaded frame as (rows, columns).
df_par.shape

(168836922, 10)

In [12]:
# Preview the first ten rows.
df_par.head(10)

Unnamed: 0,v,vw,o,c,h,l,t,n,datetime,ticker
0,17632.0,137.0174,137.42,137.26,137.565,136.85,1680528600000,105,2023-04-03 13:30:00,A
1,1011.0,137.253,137.255,137.32,137.32,137.25,1680528660000,28,2023-04-03 13:31:00,A
2,697.0,137.2364,137.22,137.33,137.33,136.95,1680528720000,18,2023-04-03 13:32:00,A
3,4004.0,137.3055,136.99,137.57,137.57,136.99,1680528780000,79,2023-04-03 13:33:00,A
4,7244.0,137.4399,137.69,137.3784,137.72,137.11,1680528840000,107,2023-04-03 13:34:00,A
5,2483.0,136.7879,137.215,136.61,137.215,136.61,1680528900000,74,2023-04-03 13:35:00,A
6,4375.0,136.4643,136.53,136.715,136.715,136.34,1680528960000,72,2023-04-03 13:36:00,A
7,4552.0,136.6168,136.67,136.47,136.87,136.28,1680529020000,80,2023-04-03 13:37:00,A
8,1157.0,136.4493,136.32,136.445,136.465,136.32,1680529080000,13,2023-04-03 13:38:00,A
9,4688.0,136.6723,136.58,136.98,136.98,136.55,1680529140000,71,2023-04-03 13:39:00,A


In [13]:
# Column dtypes and approximate memory footprint.
df_par.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 168836922 entries, 0 to 168836921
Data columns (total 10 columns):
 #   Column    Dtype  
---  ------    -----  
 0   v         float64
 1   vw        float64
 2   o         float64
 3   c         float64
 4   h         float64
 5   l         float64
 6   t         int64  
 7   n         int64  
 8   datetime  object 
 9   ticker    object 
dtypes: float64(6), int64(2), object(2)
memory usage: 12.6+ GB


In [14]:
# Number of missing values in each column.
df_par.isna().sum()

v           209
vw          190
o           218
c           198
h           216
l           219
t             0
n             0
datetime      0
ticker        0
dtype: int64

### Rename the columns for better readability

In [15]:
# Replace the terse single-letter vendor column names with descriptive ones,
# then preview the result.
df_par = df_par.rename(
    columns=dict(
        v='volume',
        vw='vwap',
        o='open',
        c='close',
        h='high',
        l='low',
        t='timestamp',
        n='transactions',
    )
)
df_par.head()

Unnamed: 0,volume,vwap,open,close,high,low,timestamp,transactions,datetime,ticker
0,17632.0,137.0174,137.42,137.26,137.565,136.85,1680528600000,105,2023-04-03 13:30:00,A
1,1011.0,137.253,137.255,137.32,137.32,137.25,1680528660000,28,2023-04-03 13:31:00,A
2,697.0,137.2364,137.22,137.33,137.33,136.95,1680528720000,18,2023-04-03 13:32:00,A
3,4004.0,137.3055,136.99,137.57,137.57,136.99,1680528780000,79,2023-04-03 13:33:00,A
4,7244.0,137.4399,137.69,137.3784,137.72,137.11,1680528840000,107,2023-04-03 13:34:00,A


### We are starting to deal with the missing values

### Since the data is minute-level stock info, we forward-fill (ffill) the price columns within each ticker
### As quants, we need to keep the data truthful, so we only apply ffill

In [16]:
# Price columns whose gaps are filled from the previous bar.
price_cols = ['open', 'close', 'high', 'low']

# Forward-fill all price columns in one grouped call so that a value is only
# ever carried forward within the same ticker.
df_par[price_cols] = df_par.groupby('ticker')[price_cols].ffill()

# Re-count the missing values to confirm the price columns are complete.
df_par.isna().sum()

volume          209
vwap            190
open              0
close             0
high              0
low               0
timestamp         0
transactions      0
datetime          0
ticker            0
dtype: int64

### For the volume, just fill with 0 since it does not matter
### For the volume-weighted average price (VWAP), we apply linear interpolation here

In [17]:
# Missing volume is treated as "no volume".
df_par['volume'] = df_par['volume'].fillna(0)

# Interpolate vwap linearly, one ticker at a time, so that values are never
# blended across different tickers.
vwap_by_ticker = df_par.groupby('ticker')['vwap']
df_par['vwap'] = vwap_by_ticker.transform(
    lambda series: series.interpolate(method='linear')
)

# Final check of the remaining missing values per column.
df_par.isna().sum()

volume          0
vwap            0
open            0
close           0
high            0
low             0
timestamp       0
transactions    0
datetime        0
ticker          0
dtype: int64

### Now the data seems good!

### Save the clean df so that we do not need to clean again!

In [18]:
# Persist the cleaned frame (index not written) so cleaning need not be repeated.
df_par.to_parquet(
    path=r'C:\Users\Username\OneDrive\Desktop\810project\merged_cleaned.parquet',
    index=False,
)

In [5]:
import pandas as pd

# Resume from the cleaned data saved earlier.
df_par = pd.read_parquet(
    path=r'C:\Users\Username\OneDrive\Desktop\810project\merged_cleaned.parquet'
)

# Spark / Spark RDD / MapReduce / Spark DataFrame & SQL — to be covered

# Part 2: Feature Engineering

### Time Features

In [6]:
# Parse the stored datetime strings into real timestamps.
df_par["datetime"] = pd.to_datetime(df_par["datetime"])

# Calendar components, taken from the datetime column as stored.
df_par["minute"] = df_par["datetime"].dt.minute
df_par["hour"] = df_par["datetime"].dt.hour
df_par["dayofweek"] = df_par["datetime"].dt.dayofweek

# The stored datetimes are UTC: the epoch value 1680528600000 ms corresponds
# to 13:30:00 in the datetime column. Testing the UTC hour against 9..16
# would therefore flag 09:00-16:59 UTC rather than the exchange session, so
# convert to exchange-local time first.
# NOTE(review): assumes US-listed equities with a 09:30-16:00 New York
# regular session -- confirm against the data source.
session_time = df_par["datetime"]
if session_time.dt.tz is None:
    session_time = session_time.dt.tz_localize("UTC")
session_time = session_time.dt.tz_convert("America/New_York")

# Bars are stamped with the start of their minute, so the regular session
# spans the bars from 09:30 through 15:59 local time (both inclusive).
minutes_since_midnight = session_time.dt.hour * 60 + session_time.dt.minute
df_par["is_open_hour"] = minutes_since_midnight.between(9 * 60 + 30, 16 * 60 - 1)

### Price Features

In [9]:
import numpy as np

# Intrabar range and open-to-close simple return.
df_par["hl_spread"] = df_par["high"] - df_par["low"]
df_par["oc_return"] = (df_par["close"] - df_par["open"]) / df_par["open"]

# Log return versus the previous bar of the same ticker.
# A grouped shift keeps the original row index, so the result lines up with
# df_par by label. The earlier groupby.apply(...).reset_index(drop=True) form
# threw the index away and relied on rows already being stored contiguously
# in sorted-ticker order; any other layout would silently attach returns to
# the wrong rows.
prev_close = df_par.groupby("ticker")["close"].shift(1)
df_par["log_return"] = np.log(df_par["close"] / prev_close)


### Lag Features

In [10]:

# Grouped shift/transform results carry the same row index as df_par, so
# they can be assigned directly. Resetting the index first (as was done
# before) is redundant with a default RangeIndex and would misalign rows
# with any other index.
close_by_ticker = df_par.groupby("ticker")["close"]

# Previous bar's close within the same ticker.
df_par["close_lag1"] = close_by_ticker.shift(1)

# Mean of the last 5 closes within the same ticker (NaN until 5 bars exist).
df_par["close_roll_mean_5"] = close_by_ticker.transform(
    lambda x: x.rolling(5).mean()
)

# Standard deviation of the last 15 volumes within the same ticker
# (NaN until 15 bars exist).
df_par["volume_roll_std_15"] = (
    df_par.groupby("ticker")["volume"]
    .transform(lambda x: x.rolling(15).std())
)

In [11]:
# Preview the first rows with the newly added feature columns.
df_par.head()

Unnamed: 0,volume,vwap,open,close,high,low,timestamp,transactions,datetime,ticker,minute,hour,dayofweek,is_open_hour,hl_spread,oc_return,log_return,close_lag1,close_roll_mean_5,volume_roll_std_15
0,17632.0,137.0174,137.42,137.26,137.565,136.85,1680528600000,105,2023-04-03 13:30:00,A,30,13,0,True,0.715,-0.001164,,,,
1,1011.0,137.253,137.255,137.32,137.32,137.25,1680528660000,28,2023-04-03 13:31:00,A,31,13,0,True,0.07,0.000474,0.000437,137.26,,
2,697.0,137.2364,137.22,137.33,137.33,136.95,1680528720000,18,2023-04-03 13:32:00,A,32,13,0,True,0.38,0.000802,7.3e-05,137.32,,
3,4004.0,137.3055,136.99,137.57,137.57,136.99,1680528780000,79,2023-04-03 13:33:00,A,33,13,0,True,0.58,0.004234,0.001746,137.33,,
4,7244.0,137.4399,137.69,137.3784,137.72,137.11,1680528840000,107,2023-04-03 13:34:00,A,34,13,0,True,0.61,-0.002263,-0.001394,137.57,137.37168,


In [12]:
# Persist the feature-enriched frame (index not written).
df_par.to_parquet(
    path=r'C:\Users\Username\OneDrive\Desktop\810project\dffeatures.parquet',
    index=False,
)