In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import yfinance as yf

**To help traders understand if NVDA offers a better risk-adjusted return than a diversified tech index.**

This project explores the historical relationship between NVIDIA (NVDA) and the NASDAQ 100 (QQQ). By analyzing rolling correlations and daily return distributions, we identify regime-shift periods where NVIDIA either decoupled from or led the broader technology market.

Key Finding: While historically volatile, NVDA has shown a tightening correlation with the QQQ in recent quarters, suggesting its transition from a speculative asset to a primary driver of the index itself. In addition, NVIDIA has often moved independently of the NASDAQ and is much more volatile on a macro scale.

**DATA SOURCES:**

**NVDA:** 
Daily price data was fetched from the Yahoo Finance API.
**QQQ:** 
Split-adjusted daily price data was found on Kaggle.
**Timeline:** 
Jan 01 2000 through 2024

In [None]:
#Helper Functions:

#Box plot to show the spread of volatility in both assets
def plot_volatility_spread(df, cols, title):
    """Draw one box plot per column in ``cols`` on a shared axis.

    Args:
        df: DataFrame holding one return series per column.
        cols: List of column names in ``df`` to plot, one box per column.
        title: Title shown above the chart.
    """
    plt.figure(figsize=(14, 6))

    # Pass the frame by keyword: some older seaborn releases treat the first
    # positional argument as ``x`` rather than ``data``.
    sns.boxplot(data=df[cols])

    plt.title(title, fontsize=14)
    plt.ylabel("Daily Percentage Change")
    # Zero-return reference line.
    plt.axhline(0, color='black', linestyle='--', alpha=0.3)
    plt.grid(axis='y', linestyle='--', alpha=0.4)
    # Render explicitly, matching the other plotting helpers in this file.
    plt.show()
    
#Line chart comparing the quarterly trend of an asset column against a market column
def plot_quarterly_trends(stats_df, asset_col, market_col, title="Quarterly Performance"):
    """Plot two columns of ``stats_df`` as marked lines against its index.

    Args:
        stats_df: DataFrame whose index supplies the x-axis values
            (the axis is labelled "Quarter").
        asset_col: Column drawn with circle markers; also used as its legend label.
        market_col: Column drawn with square markers; also used as its legend label.
        title: Title shown above the chart.
    """
    plt.figure(figsize=(14, 6))

    # One line per series; the marker shape tells the two apart.
    for column, marker_shape in ((asset_col, 'o'), (market_col, 's')):
        sns.lineplot(
            data=stats_df,
            x=stats_df.index,
            y=column,
            label=column,
            marker=marker_shape,
        )

    plt.title(title, fontsize=14)
    plt.xlabel("Quarter")
    plt.ylabel("Total Quarterly Return (%)")
    # Vertical, small tick labels so many quarter labels fit on the axis.
    plt.xticks(rotation=90, fontsize=8)
    plt.grid(True, alpha=0.3)
    plt.legend()
    plt.show()
    
#Pearson correlation matrix
def plot_correlation_matrix(df, cols):
    """Show an annotated heatmap of the pairwise correlations between ``cols``.

    Args:
        df: DataFrame containing the columns to compare.
        cols: List of column names whose correlation matrix is drawn.
    """
    plt.figure(figsize=(14, 6))
    # DataFrame.corr() is called with its defaults; each cell is annotated with its value.
    sns.heatmap(df[cols].corr(), annot=True, cmap='coolwarm')
    plt.title("Correlation Matrix: Daily Returns")
    plt.show()

#Frequency-of-daily-returns histogram
def plot_risk_shape(df, cols, title):
    """Overlay a step histogram with a KDE curve for every column in ``cols``.

    Args:
        df: DataFrame holding one return series per column.
        cols: Iterable of column names; each is drawn on the same axes.
        title: Title shown above the chart.
    """
    plt.figure(figsize=(14,6))

    # Semi-transparent layers keep overlapping distributions readable.
    for series_name in cols:
        sns.histplot(
            df[series_name],
            kde=True,
            element="step",
            label=series_name,
            alpha=0.5,
        )

    plt.xlabel("Daily Return (%)")
    plt.ylabel("Frequency")
    plt.title(title, fontsize=14)
    plt.legend()
    plt.show()
    
#Function to calculate the correlation within each calendar quarter
def quart_corr(df, asset_col, market_col):
    """Return one correlation value per calendar quarter.

    Args:
        df: DataFrame with a datetime ``date`` column and the two value columns.
        asset_col: Name of the first column to correlate.
        market_col: Name of the second column to correlate.

    Returns:
        Series indexed by quarterly period, holding the correlation of
        ``asset_col`` with ``market_col`` over the rows of that quarter.
    """
    # Bucket every row by the calendar quarter its date falls in.
    quarter_key = df['date'].dt.to_period('Q')

    def _pair_corr(chunk):
        return chunk[asset_col].corr(chunk[market_col])

    return df.groupby(quarter_key).apply(_pair_corr)

#Function to calculate rolling correlation
def roll_corr(df, asset_col, market_col, window):
    """Return the rolling correlation between two columns of ``df``.

    Args:
        df: DataFrame containing both columns.
        asset_col: Name of the column the rolling window is taken over.
        market_col: Name of the column it is correlated against.
        window: Window passed to ``Series.rolling``.

    Returns:
        Series aligned with ``df`` holding the correlation for each window.
    """
    asset_windows = df[asset_col].rolling(window=window)
    return asset_windows.corr(other=df[market_col])


#Function to calculate the overall correlation
def get_overall_corr(df, asset_col, market_col):
    """Return the correlation of the two columns over every row of ``df``.

    Args:
        df: DataFrame containing both columns.
        asset_col: Name of the first column.
        market_col: Name of the second column.
    """
    asset_returns = df[asset_col]
    market_returns = df[market_col]
    return asset_returns.corr(market_returns)
    
    

#Rolling-correlation regime line plot
def plot_regime(df, asset_col, market_col, asset_name="Asset", rwindow=60, title="Regime Analysis"):
    """Plot the rolling correlation of two columns against their overall correlation.

    Args:
        df: DataFrame with a ``date`` column plus the two columns to correlate.
        asset_col: Name of the asset's column.
        market_col: Name of the market's column.
        asset_name: Not used by the plot; kept so existing calls keep working.
        rwindow: Rolling window used for the correlation and shown in the legend.
        title: Title shown above the chart.
    """
    # Honour the caller's window. It used to be fixed at 60, so the legend
    # could advertise a window that was never applied.
    rolling_correlation = roll_corr(df, asset_col, market_col, rwindow)
    # Whole-sample correlation, drawn as the reference line.
    baseline = get_overall_corr(df, asset_col, market_col)

    plt.figure(figsize=(20, 8))

    # Rolling correlation over time as a line
    plt.plot(df['date'], rolling_correlation, label=f'{rwindow}-Day Rolling Correlation', color="blue")

    # Overall correlation as a dashed horizontal line
    plt.axhline(y=baseline, color='red', linestyle='--', linewidth=2, label=f'24-Year Average ({baseline:.2f})')

    plt.grid(True, which='major', linestyle='-', alpha=0.7)
    plt.grid(True, which='minor', linestyle=':', alpha=0.2)
    plt.title(title, fontsize=14)
    plt.ylabel("Correlation Coefficient")
    plt.xlabel("Years")
    plt.legend()
    plt.xticks(rotation=(90), fontsize = 8)
    plt.show()
    
#Quarterly-correlation regime bar plot
def plot_regime_bars(df, asset_col, market_col, asset_name="Asset", market_name="Market", title="graphTitle", xTitle = "xlabel", yTitle="yTitle"):
    """Draw per-quarter correlations as bars with the overall correlation as a reference line.

    Args:
        df: DataFrame with a ``date`` column plus the two columns to correlate.
        asset_col: Name of the asset's column.
        market_col: Name of the market's column.
        asset_name: Not used by the plot.
        market_name: Not used by the plot.
        title: Title shown above the chart.
        xTitle: Label for the x-axis.
        yTitle: Label for the y-axis.
    """
    # One correlation value per calendar quarter
    per_quarter = quart_corr(df, asset_col, market_col)

    # Correlation over the whole sample, used as the baseline
    baseline = df[asset_col].corr(df[market_col])

    plt.figure(figsize=(20, 8))

    # Bars: the quarterly values
    per_quarter.plot(kind='bar', color='lightgray', alpha=0.7, label='Quarterly Regime')

    # Dashed line: the whole-sample value
    plt.axhline(
        y=baseline,
        color='red',
        linestyle='--',
        linewidth=2,
        label=f'24-Year Average ({baseline:.2f})',
    )

    plt.title(title, fontsize=14)
    plt.xlabel(xTitle)
    plt.ylabel(yTitle)
    plt.legend()
    plt.xticks(rotation=(90), fontsize = 8)
    plt.show()


    


In [None]:
#Download NVDA ticker data from Yahoo Finance

try:
    nvda_data = yf.download(
        "NVDA",
        start="2000-01-01",
        auto_adjust=True,
        multi_level_index=False,
    )

    # An empty frame means the request came back without any rows.
    if not nvda_data.empty:
        print("Data successfully downloaded.")
        print(nvda_data.head())
    else:
        print("Warning: Downloaded dataframe is empty. Check ticker or connection.")
except Exception as err:
    # The error is only reported; if the download raised, nvda_data is left
    # unassigned by this cell.
    print(f"\nError downloading: {err}")


In [None]:
#Flatten the column headers: keep only the first level of the column labels
nvda_data.columns = nvda_data.columns.get_level_values(0)

#Move the index into a regular column (presumably the trading date, renamed to "date" in a later cell -- verify)
nvda_data = nvda_data.reset_index()

#Preview the first rows
nvda_data.head()

**Flattening Headers:**

I flattened the column headers returned by the API call and moved the date from the index into a regular column, so the dataframe is "flat" and compatible with standard Pandas operations.

In [None]:
#Load the QQQ price history from the local CSV file and inspect the first rows

QQQ_data = pd.read_csv("market_data/QQQ_split_adj.csv")

QQQ_data.head()

In [None]:
#Data clean up: give both tables matching, ticker-suffixed price column names

price_fields = ("close", "high", "low", "open", "volume")

#NVDA headers are capitalised (e.g. "Close"); map them to lower-case, suffixed names
nvda_renames = {"Date": "date"}
nvda_renames.update({field.capitalize(): f"{field}_NVDA" for field in price_fields})
nvda_data = nvda_data.rename(columns=nvda_renames)

#QQQ headers are already lower-case, so only the ticker suffix is added
qqq_renames = {field: f"{field}_QQQ" for field in price_fields}
QQQ_data = QQQ_data.rename(columns=qqq_renames)

#Enforce datetime casting on the date column of both tables
nvda_data['date'] = pd.to_datetime(nvda_data['date'])
QQQ_data['date'] = pd.to_datetime(QQQ_data['date'])

#Keep only QQQ rows dated 2000-01-01 or later
from_2000 = QQQ_data['date'] >= '2000-01-01'
QQQ_data = QQQ_data.loc[from_2000]

#Drop redundant columns (names that are absent are ignored)
redundant_columns = ["raw_close", "change_percent", "avg_vol_20d"]
QQQ_data = QQQ_data.drop(columns=redundant_columns, errors='ignore')

QQQ_data.head()




**Refactoring Table columns:**

I renamed the columns so that both tables are compatible with each other. I also removed unneeded columns from the QQQ dataset.

In [None]:
#Remove ghost header: clear the name attached to the column index, then preview the table

nvda_data.columns.name = None
nvda_data.head()

In [None]:
#Table merging: inner join on date, keeping only the dates present in both tables

data_study = nvda_data.merge(QQQ_data, on='date', how='inner')

data_study.head()

**Inner merge:**
I joined the NVIDIA API data with the local QQQ CSV on the date column using an Inner Join. This ensures we only analyze days where data is present for both assets, automatically handling missing holiday data.

In [None]:
#Preview the last rows of the merged table
data_study.tail()

In [None]:
#Reset the index to a fresh 0..n-1 range, discarding the old index values

data_study=data_study.reset_index(drop=True)
data_study.head()

In [None]:
#Check for missing values (NaN): count them per column

missing_per_column = data_study.isna().sum()
print(missing_per_column)

In [None]:
#Check for duplicates: count the rows flagged by DataFrame.duplicated()

duplicate_row_count = data_study.duplicated().sum()
print(duplicate_row_count)

**Sanity Check (NaN) and Duplicates:**

Ensure that the new dataframe is free of null values and duplicate fields.

In [None]:
#Check the data type of every column in the merged table

data_study.dtypes

In [None]:
#Size inspection: (number of rows, number of columns)
data_study.shape

In [None]:
#Calculate the percentage change between each day's close and the previous close
for ticker in ("NVDA", "QQQ"):
    data_study[f"{ticker}_pct"] = data_study[f"close_{ticker}"].pct_change()

data_study.head()

Added 2 new columns for the daily percent change of each asset (NVDA percentage change and QQQ percentage change). This was to ensure that the data is normalized and comparable.

In [None]:
#Drop the initial row, whose percentage change is empty.
#Note: this removes every row that holds a NaN in any column, not only the first one.
data_study = data_study.dropna(axis=0, how='any')

data_study.head()

**Data clean up:**

Calculating the daily percentage change leaves the first row of data with a NULL value, so that row needs to be discarded.

In [None]:
#Box plot to show the spread of volatility in both assets

plot_volatility_spread(
    data_study,
    cols=["NVDA_pct", "QQQ_pct"],
    title="Distribution of daily return of NVDA and QQQ from 2000-2024",
)
    
    



**Risk and Volatility Profile:**
Before looking at correlations, we compare the daily return distributions to understand the risk profile of each asset.

**Observation:**
NVDA shows significantly wider whiskers and more frequent extreme outliers compared to the QQQ. This confirms that while they share a median return of ~0%, NVDA’s "swings" are much more violent.

In [None]:
#Summary statistics for both daily-return columns
return_summary = data_study[['NVDA_pct', 'QQQ_pct']].describe()
print(return_summary)

The describe function confirmed the near 0% mean for both assets.

In [None]:
#Group daily returns by calendar quarter and compound them into one return per quarter
quarter_key = data_study['date'].dt.to_period('Q')
# Simple daily returns are not additive: the return over a quarter is the
# product of the daily growth factors (1 + r) minus 1, not the sum of r.
daily_growth = 1 + data_study[['NVDA_pct', 'QQQ_pct']]
q_percentent = daily_growth.groupby(quarter_key).prod() - 1
#change the quarter index to a string
q_percentent.index = q_percentent.index.astype(str)

#call quarterly trend plot
plot_quarterly_trends(q_percentent, "NVDA_pct", "QQQ_pct", title="Quarterly Total Returns: NVDA vs QQQ (2000-2024)")




**Total quarterly Returns:**
Based on the daily volatility profile, I wanted to investigate the total quarterly returns of each asset over time to see when spikes in price occur and to investigate possible underlying causes.

**Observation:**

During the first quarter of 2000, the NASDAQ (QQQ) was at a major peak along with NVIDIA, then collapsed during the 4th quarter. This coincides with the Dot-com bubble crash, whereby tech start-ups were heavily over-speculated in the late 90's and then lost most of their valuation in the early 2000's. Both assets had another major crash during the 3rd quarter of 2001, which aligns with the September 11 (9/11) attacks. The last quarter of 2001 proved to be the largest quarterly spike to the upside for both assets, with an increase of almost 2%, attributed to a massive market recovery post-9/11 driven by improved investor sentiment and Fed rate cuts. This also aligned with the launch of the Xbox, for which NVIDIA was the sole GPU provider, giving the company record-breaking profits in this period despite the Dot-com crash. Other drops in price coincide with the 2007–2009 recession, which saw stock prices plummet, and the COVID-19 pandemic crash. Recently, both assets have had positive quarterly percentage changes due to the AI bubble, in which many companies in the NASDAQ, including NVIDIA, have seen increases in their stock valuations.

In [None]:

# Execution: correlation heatmap for the two daily-return columns
plot_correlation_matrix(
    data_study,
    cols=["NVDA_pct", "QQQ_pct"],
)
    

**Pearson Correlation Matrix: The Static Relationship:**

Before diving into the time-based regime shifts we look at the Pearson Correlation Coefficient to determine the overall linear relationship between NVDA and QQQ across the entire dataset.

**Observation:**

We have a strong positive relationship, confirming that NVIDIA is a primary component of the Nasdaq's price action. For an investor, a correlation this high suggests that holding both NVDA and QQQ does not provide significant diversification benefit, as they tend to experience gains and losses at the same time. This would in turn leave your portfolio overexposed.

In [None]:
#Histogram to show the shape of the risk
plot_risk_shape(
    data_study,
    cols=["NVDA_pct", "QQQ_pct"],
    title="Frequency of returns",
)

**Analysis of the Distribution:**

While summary statistics provide a numeric overview, a Histogram with Kernel Density Estimation allows us to visualize the probability distribution of daily returns. we are specifically looking for deviations from a normal distribution. 

**Observation:**
NVDA experiences far more frequent extreme daily moves (outliers) than the index, while QQQ is much more stable, with its distribution tightly clustered around 0; its percentage changes are therefore smaller and more predictable, making it the better asset to invest in for long-term returns. NVDA stretches much further along the x-axis, reaching beyond -0.3 (-30%) and +0.4 (+40%). This represents tail risk, meaning extreme price movements are much more frequent for an individual stock like NVDA than for a diversified index like QQQ. Both distributions peak very close to 0.0, which is typical for daily return data. This suggests that while the swings are large, the most frequent daily outcome for both is a relatively small change.

In [None]:
#Count how often each asset's daily move exceeds a set of thresholds

print("="*70)
print("TAIL EVENT FREQUENCY")
print("="*70)

thresholds = [0.05, 0.10, 0.15, 0.20]
# Denominator shared by every percentage printed below
total_days = len(data_study)

for threshold in thresholds:
    # Summing the boolean mask counts the days beyond the threshold
    nvda_count = int((data_study['NVDA_pct'].abs() > threshold).sum())
    qqq_count = int((data_study['QQQ_pct'].abs() > threshold).sum())

    print(f"\nDays with moves > ±{threshold*100:.0f}%:")
    print(f"  NVDA: {nvda_count:3d} days ({nvda_count/total_days*100:.1f}%)")
    print(f"  QQQ:  {qqq_count:3d} days ({qqq_count/total_days*100:.1f}%)")

# The previous closing line asserted "<<1% of days" for all tail events, which
# the shares printed above need not satisfy; state only what always holds.
print("\n Tail events become rarer as the threshold rises; compare the NVDA and QQQ shares above.")


**NVDA exhibits significantly fatter tails than the broader NASDAQ index (QQQ).**
**Evidence:** 
While NVDA moves ±5% or more on 12.4% of trading days, QQQ only does so 1.9% of the time—a 6.4x difference. This gap widens at extreme thresholds: 
NVDA has experienced 16 days with ±20% moves over 24 years, while QQQ has never moved 20% in a single day.
Implication: Standard correlation analysis assumes bivariate normality, which clearly doesn't hold here. This non-normality helps explain why our quarterly correlation varies so much: during extreme NVDA events (which happen regularly), the relationship with QQQ can break down entirely.

In [None]:
#Quarterly correlation bars against the overall correlation baseline
plot_regime_bars(
    data_study,
    "NVDA_pct",
    "QQQ_pct",
    asset_name="NVIDIA",
    market_name="NASDAQ",
    title='NVDA vs QQQ: Identifying Regime Deviations',
    # Quarters run along the x-axis; bar height is the correlation. The
    # positional call had these two labels swapped.
    xTitle='Quarter',
    yTitle='Correlation Coefficient',
)

**Quarterly Regime Blocks: Historical Deviations:**

Finally, we group these correlations by quarter to create a Seasonality Map. This makes it easier to pinpoint specific historical events where the correlation fundamentally broke away from the long-term average. This approach allows us to see if the recent high correlation is an anomaly or a permanent shift in the market's structure due to NVDA's massive index weight.

**Observations:**
**Analysis: Historical Deviations & Seasonal Regimes**

The quarterly bar chart provides a "high-level" map of the NVDA-QQQ relationship. By comparing each quarter's performance against the **24-year baseline (Red Dashed Line)**, we can observe several key financial phenomena:

**Mean Reversion:** Historically, whenever the correlation "decouples" (bars fall significantly below the average), it tends to return to the mean in subsequent quarters. This suggests a strong, persistent fundamental link between the semiconductor sector and the broader tech index.
**The "Grey" Gaps (Idiosyncratic Risk):** Quarters with very low bars represent periods where NVIDIA's price was driven by internal factors—such as product breakthroughs (original GeForce/Xbox launches) or specific earnings misses—independent of the general Nasdaq trend.
**The AI Structural Shift (2023–Present):** There's high density of the bars in the most recent years. Unlike previous decades where correlation fluctuated wildly, the recent regime shows bars consistently pinned at or above the historical average.

**Final Insight:** This confirms that the recent high correlation is not just a temporary "spike" but a **structural shift**. As NVDA’s market cap has grown, its mathematical weight in the QQQ has made decoupling much rarer, effectively turning NVDA into a primary engine of the index itself.

In [None]:
#60-day rolling correlation against the overall correlation baseline
plot_regime(
    df=data_study,
    asset_col="NVDA_pct",
    market_col="QQQ_pct",
    rwindow=60,
    title="NVDA vs QQQ: 60-Day Rolling Correlation",
)

**Rolling Correlation (60 day):**
We use a 60-Day Rolling Pearson Correlation to track the sync between the two assets. Static correlation matrices only tell us the average relationship over 24 years. To understand how the NVDA-QQQ relationship evolves during market crashes, tech booms, and the recent AI surge, we must look at how correlation changes over time. The red dashed line represents the 24-year historical average baseline. The blue line is the 60-day rolling correlation.

**Observation:**
Despite the high average, the correlation is highly unstable, frequently swinging between 0.2 (low relationship) and 0.9 (nearly identical movement). Looking at the data from 2020 to 2024, the correlation has frequently stayed above 0.8. This suggests that in the current AI-driven market, NVDA and the QQQ are more locked together than they were in the early 2000s.

**Final Conclusion**

**Conclusion: Volatility and Risk Profile**
Based on the Exploratory Data Analysis (EDA), we observed a significant divergence in the risk profiles of NVDA and QQQ:

Heightened Volatility: NVDA displays a much wider distribution of daily returns, with frequent "fat-tail" events exceeding ±10%. In contrast, QQQ remains tightly clustered around a 0% mean return.

Consistent Correlation: The long-term (overall) correlation remains high at 0.63, suggesting that while NVDA is more volatile, it is still fundamentally tethered to the broader tech sector's movements.

**Strategic Implications:**
For a trader or investor, these findings suggest:

Risk Management: Investors in NVDA must utilize wider stop-losses compared to QQQ to avoid being liquidated by standard daily market noise.

Portfolio Concentration: Because the correlation has trended higher in recent years (often exceeding 0.8), holding both assets simultaneously provides less diversification benefit than historically expected.

**Future Work**
Next Steps for Research
To further this project, I intend to:

Perform a Beta Analysis to quantify exactly how much NVDA moves relative to every 1% move in the QQQ.

Implement a Value at Risk (VaR) calculation to estimate the maximum potential loss over a 24-hour period at a 95% confidence level.
