# 1. NVDA OHLC Data Loading & Initial Processing

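This notebook loads every per-day NVDA OHLC CSV file (`*_ohlc_NVDA.csv`) from the `data/` folder into a single DataFrame, tags each row with the date taken from its filename, checks for missing values and duplicate rows, adds a combined `datetime` column, and saves the result to `combined_nvda_ohlc_clean.csv` for use in later notebooks.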


In [1]:
import pandas as pd
import glob
import os
from datetime import datetime

# Header for the listing printed below
print("Available CSV files:")

# Folder holding the per-day files, and the glob pattern that selects them
data_path = 'data'
ohlc_pattern = os.path.join(data_path, '*_ohlc_NVDA.csv')

# Collect the matches in sorted order; filenames start with YYYYMMDD, so
# lexical order within the folder is also chronological order
csv_files = sorted(glob.glob(ohlc_pattern))
print(f"Found {len(csv_files)} CSV files")

# Preview only the first few paths rather than the whole list
preview_count = 5
for csv_path in csv_files[:preview_count]:
    print(csv_path)


Available CSV files:
Found 1107 CSV files
data/20210104_ohlc_NVDA.csv
data/20210105_ohlc_NVDA.csv
data/20210106_ohlc_NVDA.csv
data/20210107_ohlc_NVDA.csv
data/20210108_ohlc_NVDA.csv


In [2]:
# Load and combine all CSV files
#
# Each file is read into its own DataFrame and tagged with the trading date
# taken from the filename (not from the file contents):
#   - `date`     : datetime.date parsed from the leading YYYYMMDD token
#   - `date_str` : the raw YYYYMMDD token as a string
# The per-file frames are then stacked into `all_data` with a fresh index.

# Fail early with an actionable message: pd.concat would otherwise raise an
# opaque "No objects to concatenate" ValueError on an empty list.
if not csv_files:
    raise ValueError(
        "No CSV files to load: `csv_files` is empty. Check that the data "
        "folder exists relative to the notebook's working directory and "
        "contains *_ohlc_NVDA.csv files."
    )

dfs = []

for file in csv_files:
    # Extract date from filename (first 8 digits: YYYYMMDD)
    basename = os.path.basename(file)
    date_str = basename.split('_')[0]  # e.g., '20250529' from '20250529_ohlc_NVDA.csv'

    # Convert to proper date format; name the offending file if the prefix
    # is not a valid YYYYMMDD date (strptime's own message omits it)
    try:
        date_obj = datetime.strptime(date_str, '%Y%m%d').date()
    except ValueError as exc:
        raise ValueError(
            f"Cannot parse a YYYYMMDD date from filename {basename!r}"
        ) from exc

    # Read CSV
    df = pd.read_csv(file)

    # Add date columns (the same value is broadcast to every row of this file)
    df['date'] = date_obj
    df['date_str'] = date_str

    dfs.append(df)

print(f"Loaded {len(dfs)} files")

# Concatenate all DataFrames; ignore_index gives one continuous 0..N-1 index
all_data = pd.concat(dfs, ignore_index=True)

print(f"Combined DataFrame shape: {all_data.shape}")
print(f"Date range: {all_data['date'].min()} to {all_data['date'].max()}")
print(f"Columns: {list(all_data.columns)}")


Loaded 1107 files
Combined DataFrame shape: (865782, 12)
Date range: 2021-01-04 to 2025-05-30
Columns: ['time', 'timestamp', 'open', 'high', 'low', 'close', 'vwap', 'volume', 'transactions', 'otc', 'date', 'date_str']


In [3]:
# Quick visual check of the combined frame: leading rows, column dtypes,
# and the earliest distinct dates

head_rows = all_data.head(10)
print("First 10 rows:")
print(head_rows)

column_types = all_data.dtypes
print("\nData types:")
print(column_types)

# Sort the distinct dates in place, then show only the first ten
sample_dates = list(all_data['date'].unique())
sample_dates.sort()
print("\nSample of unique dates:")
print(sample_dates[:10])


First 10 rows:
                time      timestamp     open     high      low    close  \
0  20210104 04:07:00  1609751220000  13.0800  13.0800  13.0700  13.0700   
1  20210104 04:16:00  1609751760000  13.1200  13.1245  13.1200  13.1245   
2  20210104 04:17:00  1609751820000  13.1223  13.1223  13.1223  13.1223   
3  20210104 04:24:00  1609752240000  13.1250  13.1250  13.1250  13.1250   
4  20210104 04:59:00  1609754340000  13.1568  13.1568  13.1568  13.1568   
5  20210104 05:14:00  1609755240000  13.1500  13.1500  13.1500  13.1500   
6  20210104 05:19:00  1609755540000  13.1380  13.1380  13.1380  13.1380   
7  20210104 05:21:00  1609755660000  13.1380  13.1380  13.1380  13.1380   
8  20210104 05:27:00  1609756020000  13.1375  13.1375  13.1375  13.1375   
9  20210104 05:58:00  1609757880000  13.1375  13.1375  13.1375  13.1375   

      vwap   volume  transactions  otc        date  date_str  
0  13.0792  21840.0            11  NaN  2021-01-04  20210104  
1  13.1130  45440.0            26

In [4]:
# Headline figures for the combined dataset
total_records = len(all_data)
trading_dates = all_data['date']
n_unique_dates = trading_dates.nunique()

print("Data Summary:")
print(f"Total records: {total_records:,}")
print(f"Date range: {trading_dates.min()} to {trading_dates.max()}")
print(f"Number of unique dates: {n_unique_dates}")
print(f"Average records per day: {total_records / n_unique_dates:.1f}")

# Count of null entries in each column
missing_counts = all_data.isna().sum()
print("\nMissing values per column:")
print(missing_counts)


Data Summary:
Total records: 865,782
Date range: 2021-01-04 to 2025-05-30
Number of unique dates: 1107
Average records per day: 782.1

Missing values per column:
time                 0
timestamp            0
open                 0
high                 0
low                  0
close                0
vwap                 0
volume               0
transactions         0
otc             865782
date                 0
date_str             0
dtype: int64


In [5]:
# Turn `date` into a pandas datetime column so the .dt accessor can be used
all_data['date'] = pd.to_datetime(all_data['date'])

# Build a full timestamp from two text pieces: the calendar day formatted as
# YYYY-MM-DD, and the clock part of `time` (the token after the space)
day_text = all_data['date'].dt.strftime('%Y-%m-%d')
clock_text = all_data['time'].str.split(' ').str[1]
all_data['datetime'] = pd.to_datetime(day_text + ' ' + clock_text)

# Flag rows that repeat an earlier row across all columns
duplicates = all_data.duplicated()
n_duplicates = duplicates.sum()
print(f"Number of duplicate rows: {n_duplicates}")

# Remove the repeats only when at least one was found
if n_duplicates > 0:
    all_data = all_data.drop_duplicates()
    print(f"Dropped {n_duplicates} duplicate rows")
    print(f"New DataFrame shape: {all_data.shape}")

# Show the column dtypes after the conversions above
print("\nUpdated data types:")
print(all_data.dtypes)


Number of duplicate rows: 0

Updated data types:
time                    object
timestamp                int64
open                   float64
high                   float64
low                    float64
close                  float64
vwap                   float64
volume                 float64
transactions             int64
otc                    float64
date            datetime64[ns]
date_str                object
datetime        datetime64[ns]
dtype: object


In [6]:
# Persist the combined frame, without the index column, so that other
# notebooks can read it back
output_path = 'combined_nvda_ohlc_clean.csv'
all_data.to_csv(output_path, index=False)

# Tell the reader where the data went and what it is for
print("\nCleaned combined data saved to 'combined_nvda_ohlc_clean.csv'")
print("This file will be used as input for subsequent analysis notebooks.")



Cleaned combined data saved to 'combined_nvda_ohlc_clean.csv'
This file will be used as input for subsequent analysis notebooks.
