## Install Required Libraries

In [1]:
# Uncomment on first run to install the dependencies into this kernel's environment:
# %pip install yfinance pandas matplotlib

## Import Libraries

In [3]:
import yfinance as yf
import pandas as pd
import matplotlib.pyplot as plt

## Define the Strategy

- **Objective:** Predict whether SMH’s price will increase (1) or decrease (0) the next day.
- **Data range:** January 1, 2010, to March 26, 2025.
- **Adjustments:** Ensure yfinance accounts for SMH’s 2-for-1 stock split in May 2023.

## Fetch SMH Historical Data

In [4]:
# Define SMH ticker and date range
ticker = "SMH"
start_date = "2010-01-01"
# NOTE: yfinance treats `end` as exclusive, so the last bar returned is the
# trading day before this date.
end_date = "2025-03-26"

# Fetch historical data with splits/dividends adjusted.
# auto_adjust is passed explicitly because its default differs between
# yfinance releases; pinning it keeps the prices adjusted on every version.
smh_data = yf.download(
    ticker,
    start=start_date,
    end=end_date,
    auto_adjust=True,
    progress=False,  # Disable progress bar for cleaner output
)

# Save raw data to CSV (for reference)
smh_data.to_csv("SMH_Raw_Data.csv")

# Display first 5 rows
smh_data.head()

YF.download() has changed argument auto_adjust default to True


Price,Close,High,Low,Open,Volume
Ticker,SMH,SMH,SMH,SMH,SMH
Date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
2010-01-04,12.099959,12.206436,12.05311,12.074405,18905400
2010-01-05,12.031816,12.155328,11.959412,12.116997,29157800
2010-01-06,12.002001,12.116995,11.95941,12.014777,21329000
2010-01-07,11.908303,11.993485,11.823123,11.963671,28677800
2010-01-08,12.146811,12.159587,11.869972,11.899786,41759000


In [8]:
# Re-load the CSV written by the download cell. The file starts with a
# three-line column header (Price / Ticker / Date); skipping the first two
# lines leaves the "Date" line as the header, so the price columns come back
# with "Unnamed: N" placeholder names. Adjust skiprows if your file differs.
raw_csv_path = "SMH_Raw_Data.csv"
df = pd.read_csv(raw_csv_path, skiprows=2)

# Preview the frame exactly as it was parsed
print("Raw Data:")
display(df.head())

Raw Data:


Unnamed: 0,Date,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5
0,2010-01-04,12.099959,12.206436,12.05311,12.074405,18905400
1,2010-01-05,12.031816,12.155328,11.959412,12.116997,29157800
2,2010-01-06,12.002001,12.116995,11.95941,12.014777,21329000
3,2010-01-07,11.908303,11.993485,11.823123,11.963671,28677800
4,2010-01-08,12.146811,12.159587,11.869972,11.899786,41759000


In [10]:
# Name the price columns from the raw CSV's own header rather than by position.
# The first header line of SMH_Raw_Data.csv carries the field names
# (Price,Close,High,Low,Open,Volume), while the frame loaded with skiprows=2
# has "Unnamed: N" placeholders in the same positions.
raw_fields = pd.read_csv("SMH_Raw_Data.csv", nrows=0).columns[1:]
placeholder_to_field = {
    f"Unnamed: {position}": field
    for position, field in enumerate(raw_fields, start=1)
}

# A label-based rename ignores columns that are already renamed, so re-running
# this cell cannot swap the Open/Close labels the way a positional
# `df.columns = [...]` assignment would after the reorder below.
df = df.rename(columns=placeholder_to_field)

# Reorder columns to standard format
df = df[["Date", "Open", "High", "Low", "Close", "Volume"]]

# Display cleaned data
print("\nCleaned Data:")
display(df.head())


Cleaned Data:


Unnamed: 0,Date,Open,High,Low,Close,Volume
0,2010-01-04,12.074405,12.206436,12.05311,12.099959,18905400
1,2010-01-05,12.116997,12.155328,11.959412,12.031816,29157800
2,2010-01-06,12.014777,12.116995,11.95941,12.002001,21329000
3,2010-01-07,11.963671,11.993485,11.823123,11.908303,28677800
4,2010-01-08,11.899786,12.159587,11.869972,12.146811,41759000


In [12]:
# Convert "Date" to datetime. The CSV stores ISO dates such as 2010-01-04, so
# the format string must use dashes; a slash format ("%Y/%m/%d") does not match
# these strings and raises ValueError under pandas 2.x strict format parsing.
df["Date"] = pd.to_datetime(df["Date"], format="%Y-%m-%d")  # Use format="%m/%d/%Y" if dates are MM/DD/YYYY

# Sort by date (oldest to newest)
df = df.sort_values("Date")

# Display final data
print("\nFinal Data with Corrected Dates:")
display(df.head())


Final Data with Corrected Dates:


Unnamed: 0,Date,Open,High,Low,Close,Volume
0,2010-01-04,12.074405,12.206436,12.05311,12.099959,18905400
1,2010-01-05,12.116997,12.155328,11.959412,12.031816,29157800
2,2010-01-06,12.014777,12.116995,11.95941,12.002001,21329000
3,2010-01-07,11.963671,11.993485,11.823123,11.908303,28677800
4,2010-01-08,11.899786,12.159587,11.869972,12.146811,41759000


In [13]:
# Persist the cleaned frame; index=False keeps the positional row index out of
# the file so "Date" stays the first column.
df.to_csv(path_or_buf="SMH_Clean_Data.csv", index=False)
print("Data saved to SMH_Clean_Data.csv")

Data saved to SMH_Clean_Data.csv


##  Validate Stock Split Adjustment
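Check that the May 2023 2-for-1 stock split is reflected in the data.
Because the prices are split-adjusted, pre-split closes are already scaled down by the split ratio, so the adjusted closing price should be continuous across May 5, 2023 (an unadjusted series would instead show the price roughly halving on that date):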