In [16]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import yfinance as yf

In [17]:
# Download NVDA price history from Yahoo Finance via yfinance, starting at the
# given date; auto_adjust=True is forwarded to yfinance unchanged.
ticker_symbol = "NVDA"
history_start = "2014-01-01"
nvda_data = yf.download(ticker_symbol, start=history_start, auto_adjust=True)
nvda_data.head()

[*********************100%***********************]  1 of 1 completed


Price,Close,High,Low,Open,Volume
Ticker,NVDA,NVDA,NVDA,NVDA,NVDA
Date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
2014-01-02,0.373864,0.376693,0.370564,0.375278,260092000
2014-01-03,0.369385,0.375278,0.368207,0.374571,259332000
2014-01-06,0.374335,0.377164,0.369621,0.373157,409492000
2014-01-07,0.380464,0.381879,0.375514,0.378107,333288000
2014-01-08,0.38565,0.387536,0.380464,0.381879,308192000


In [18]:
# Flatten the two-level (Price, Ticker) column header returned by yfinance down
# to its first level so columns can be addressed as "Close", "High", ...
# Guarded so re-running this cell on an already-flat frame leaves it untouched.
if isinstance(nvda_data.columns, pd.MultiIndex):
    nvda_data.columns = nvda_data.columns.get_level_values(0)

# Make the date a regular column instead of the index. Only reset while the
# frame still carries a non-default index: an unguarded reset_index() on a
# second run would insert a stray "index" column holding the old row numbers.
if not isinstance(nvda_data.index, pd.RangeIndex):
    nvda_data = nvda_data.reset_index()

nvda_data.head()

Price,Date,Close,High,Low,Open,Volume
0,2014-01-02,0.373864,0.376693,0.370564,0.375278,260092000
1,2014-01-03,0.369385,0.375278,0.368207,0.374571,259332000
2,2014-01-06,0.374335,0.377164,0.369621,0.373157,409492000
3,2014-01-07,0.380464,0.381879,0.375514,0.378107,333288000
4,2014-01-08,0.38565,0.387536,0.380464,0.381879,308192000


In [19]:
# Read the QQQ price history from the local CSV (file name suggests
# split-adjusted prices) and preview the first rows.
qqq_csv_path = "market_data/QQQ_split_adj.csv"
QQQ_data = pd.read_csv(qqq_csv_path)

QQQ_data.head()

Unnamed: 0,date,open,high,low,close,volume,raw_close,change_percent,avg_vol_20d
0,1999-03-10,51.0625,51.15625,50.28125,51.0625,5232200,102.125,,
1,1999-03-11,51.4375,51.7344,50.3125,51.3125,9688600,102.625,0.49,
2,1999-03-12,51.125,51.15625,49.65625,50.0625,8743600,100.125,-2.44,
3,1999-03-15,50.4375,51.5625,49.90625,51.5,6369000,103.0,2.87,
4,1999-03-16,51.71875,52.15625,51.15625,51.9375,4905800,103.875,0.85,


In [20]:
# Standardise both frames ahead of the merge: suffix each price/volume column
# with its ticker, make sure `date` is a datetime column in both frames, and
# trim QQQ to the same 2014+ window that was requested for NVDA.

# NVDA: lower-case the date column and tag the price fields with _NVDA
nvda_columns = {
    "Date": "date",
    "Close": "close_NVDA",
    "High": "high_NVDA",
    "Low": "low_NVDA",
    "Open": "open_NVDA",
    "Volume": "volume_NVDA",
}
nvda_data = (
    nvda_data
    .rename(columns=nvda_columns)
    .assign(date=lambda frame: pd.to_datetime(frame["date"]))
)

# QQQ: tag the price fields with _QQQ ("date" maps to itself)
qqq_columns = {
    "date": "date",
    "close": "close_QQQ",
    "high": "high_QQQ",
    "low": "low_QQQ",
    "open": "open_QQQ",
    "volume": "volume_QQQ",
}
QQQ_data = (
    QQQ_data
    .rename(columns=qqq_columns)
    .assign(date=lambda frame: pd.to_datetime(frame["date"]))
    # keep only rows dated 2014-01-01 or later
    .loc[lambda frame: frame["date"] >= '2014-01-01']
    # drop columns not needed downstream; errors='ignore' tolerates their absence
    .drop(columns=["raw_close", "change_percent", "avg_vol_20d"], errors='ignore')
)

QQQ_data.head()


Unnamed: 0,date,open_QQQ,high_QQQ,low_QQQ,close_QQQ,volume_QQQ
3728,2014-01-02,87.55,87.58,87.02,87.27,29190010
3729,2014-01-03,87.27,87.35,86.62,86.64,35727320
3730,2014-01-06,86.65,86.76,86.0,86.32,32092439
3731,2014-01-07,86.7,87.25,86.56,87.12,25913230
3732,2014-01-08,87.11,87.55,86.945,87.31,27209990


In [21]:
# The flattened columns Index still carries a name left over from the yfinance
# header, which renders as a stray extra header cell; clear that name.
nvda_data = nvda_data.rename_axis(columns=None)
nvda_data.head()

Unnamed: 0,date,close_NVDA,high_NVDA,low_NVDA,open_NVDA,volume_NVDA
0,2014-01-02,0.373864,0.376693,0.370564,0.375278,260092000
1,2014-01-03,0.369385,0.375278,0.368207,0.374571,259332000
2,2014-01-06,0.374335,0.377164,0.369621,0.373157,409492000
3,2014-01-07,0.380464,0.381879,0.375514,0.378107,333288000
4,2014-01-08,0.38565,0.387536,0.380464,0.381879,308192000


In [22]:
# Join NVDA and QQQ on trading date. The inner join keeps only dates present in
# both frames. validate='one_to_one' makes pandas raise a MergeError if either
# frame contains a repeated date, instead of silently multiplying rows.
data_study = pd.merge(
    nvda_data,
    QQQ_data,
    on='date',
    how='inner',
    validate='one_to_one',
)

data_study.head()

Unnamed: 0,date,close_NVDA,high_NVDA,low_NVDA,open_NVDA,volume_NVDA,open_QQQ,high_QQQ,low_QQQ,close_QQQ,volume_QQQ
0,2014-01-02,0.373864,0.376693,0.370564,0.375278,260092000,87.55,87.58,87.02,87.27,29190010
1,2014-01-03,0.369385,0.375278,0.368207,0.374571,259332000,87.27,87.35,86.62,86.64,35727320
2,2014-01-06,0.374335,0.377164,0.369621,0.373157,409492000,86.65,86.76,86.0,86.32,32092439
3,2014-01-07,0.380464,0.381879,0.375514,0.378107,333288000,86.7,87.25,86.56,87.12,25913230
4,2014-01-08,0.38565,0.387536,0.380464,0.381879,308192000,87.11,87.55,86.945,87.31,27209990


In [23]:
# Inspect the last five rows of the merged frame
data_study.tail(5)

Unnamed: 0,date,close_NVDA,high_NVDA,low_NVDA,open_NVDA,volume_NVDA,open_QQQ,high_QQQ,low_QQQ,close_QQQ,volume_QQQ
2583,2024-04-09,85.310455,87.590298,82.979652,87.397395,501700000,442.96,443.24,437.44,442.23,39521600
2584,2024-04-10,86.994614,87.35543,83.66631,83.883202,431929000,437.0,439.24,436.28,438.37,61502200
2585,2024-04-11,90.569794,90.692732,86.88168,87.375424,431637000,440.26,446.33,437.96,445.37,45474600
2586,2024-04-12,88.141037,90.129029,87.485373,89.653266,426805000,441.1,442.24,436.88,438.27,53602600
2587,2024-04-15,85.957146,90.566793,85.885184,89.052567,443077000,442.07,442.0901,430.21,431.06,62937433


In [26]:
# Replace the row index with a fresh 0..n-1 range, discarding the old labels
data_study = data_study.reset_index(drop=True)
data_study.head(5)

Unnamed: 0,date,close_NVDA,high_NVDA,low_NVDA,open_NVDA,volume_NVDA,open_QQQ,high_QQQ,low_QQQ,close_QQQ,volume_QQQ
0,2014-01-02,0.373864,0.376693,0.370564,0.375278,260092000,87.55,87.58,87.02,87.27,29190010
1,2014-01-03,0.369385,0.375278,0.368207,0.374571,259332000,87.27,87.35,86.62,86.64,35727320
2,2014-01-06,0.374335,0.377164,0.369621,0.373157,409492000,86.65,86.76,86.0,86.32,32092439
3,2014-01-07,0.380464,0.381879,0.375514,0.378107,333288000,86.7,87.25,86.56,87.12,25913230
4,2014-01-08,0.38565,0.387536,0.380464,0.381879,308192000,87.11,87.55,86.945,87.31,27209990


In [30]:
# Check for missing values: count the NaN entries in each column
missing_per_column = data_study.isna().sum()
print(missing_per_column)

date           0
close_NVDA     0
high_NVDA      0
low_NVDA       0
open_NVDA      0
volume_NVDA    0
open_QQQ       0
high_QQQ       0
low_QQQ        0
close_QQQ      0
volume_QQQ     0
dtype: int64


In [31]:
# Count rows that exactly repeat an earlier row (all columns equal)
duplicate_row_count = data_study.duplicated().sum()
print(duplicate_row_count)

0


In [33]:
# Check data types: list the dtype of every column in the merged frame

data_study.dtypes

date           datetime64[ns]
close_NVDA            float64
high_NVDA             float64
low_NVDA              float64
open_NVDA             float64
volume_NVDA             int64
open_QQQ              float64
high_QQQ              float64
low_QQQ               float64
close_QQQ             float64
volume_QQQ              int64
dtype: object

In [36]:
# Size inspection: (number of rows, number of columns) of the merged frame
data_study.shape

(2588, 11)

In [39]:
# Daily return for each ticker: the change in close relative to the previous
# row's close, expressed as a fraction (0.01 == 1%). The first row has no
# previous close, so its value is NaN.
data_study = data_study.assign(
    NVDA_pct=data_study['close_NVDA'].pct_change(),
    QQQ_pct=data_study['close_QQQ'].pct_change(),
)

data_study.head()

Unnamed: 0,date,close_NVDA,high_NVDA,low_NVDA,open_NVDA,volume_NVDA,open_QQQ,high_QQQ,low_QQQ,close_QQQ,volume_QQQ,NVDA_pct,QQQ_pct
0,2014-01-02,0.373864,0.376693,0.370564,0.375278,260092000,87.55,87.58,87.02,87.27,29190010,,
1,2014-01-03,0.369385,0.375278,0.368207,0.374571,259332000,87.27,87.35,86.62,86.64,35727320,-0.011979,-0.007219
2,2014-01-06,0.374335,0.377164,0.369621,0.373157,409492000,86.65,86.76,86.0,86.32,32092439,0.013401,-0.003693
3,2014-01-07,0.380464,0.381879,0.375514,0.378107,333288000,86.7,87.25,86.56,87.12,25913230,0.016373,0.009268
4,2014-01-08,0.38565,0.387536,0.380464,0.381879,308192000,87.11,87.55,86.945,87.31,27209990,0.013631,0.002181


In [41]:
# Drop the rows whose return columns are NaN, i.e. the first row, which has no
# previous close to compare against. Restricting dropna to the two return
# columns keeps a NaN in any other column from silently removing rows here.
data_study = data_study.dropna(subset=['NVDA_pct', 'QQQ_pct'])

data_study.head()

Unnamed: 0,date,close_NVDA,high_NVDA,low_NVDA,open_NVDA,volume_NVDA,open_QQQ,high_QQQ,low_QQQ,close_QQQ,volume_QQQ,NVDA_pct,QQQ_pct
1,2014-01-03,0.369385,0.375278,0.368207,0.374571,259332000,87.27,87.35,86.62,86.64,35727320,-0.011979,-0.007219
2,2014-01-06,0.374335,0.377164,0.369621,0.373157,409492000,86.65,86.76,86.0,86.32,32092439,0.013401,-0.003693
3,2014-01-07,0.380464,0.381879,0.375514,0.378107,333288000,86.7,87.25,86.56,87.12,25913230,0.016373,0.009268
4,2014-01-08,0.38565,0.387536,0.380464,0.381879,308192000,87.11,87.55,86.945,87.31,27209990,0.013631,0.002181
5,2014-01-09,0.371271,0.380464,0.370092,0.379757,292172000,87.62,87.64,86.72,87.02,23695689,-0.037286,-0.003321
