In [53]:
!pip install ta --quiet

In [54]:
import yfinance as yf
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import pandas_datareader as pdr
import matplotlib.pyplot as plt
import seaborn as sns

# Time range: past year
end_date = datetime.now()
start_date = end_date - timedelta(days=1095)

# Download data and include adjusted close
amzn_data = yf.download("AMZN", start=start_date, end=end_date, auto_adjust=False)

# Get P/E ratio
amzn = yf.Ticker("AMZN")
eps = amzn.info.get('trailingEps', np.nan)
amzn_data['PE_Ratio'] = amzn_data['Adj Close'] / eps

# Check available columns
print("Columns:", amzn_data.columns)

# Daily Volatility (using 7-day rolling standard deviation of log returns)
amzn_data['Log_Return'] = (amzn_data['Adj Close'] / amzn_data['Adj Close'].shift(1)).apply(np.log)
amzn_data['Volatility_7d'] = amzn_data['Log_Return'].rolling(window=7).std()

# MA past week
amzn_data['MA_7'] = amzn_data['Adj Close'].rolling(window=7).mean()
# MA past month
amzn_data['MA_30'] = amzn_data['Adj Close'].rolling(window=30).mean()
# MA past year
amzn_data['MA_365'] = amzn_data['Adj Close'].rolling(window=365).mean()
# Daily Percent change
amzn_data['Pct_Change'] = amzn_data['Adj Close'].pct_change()
# Simulated sentiment based on daily percent change
amzn_data['Sentiment'] = amzn_data['Pct_Change'].apply(
    lambda x: 'Bullish' if x >= 0 else 'Bearish')

# Remove multi-index
amzn_data.columns = [col[0] if isinstance(col, tuple) else col for col in amzn_data.columns]
print("Columns:", amzn_data.columns)

# Display last few rows
print(amzn_data.tail())

[*********************100%***********************]  1 of 1 completed


Columns: MultiIndex([('Adj Close', 'AMZN'),
            (    'Close', 'AMZN'),
            (     'High', 'AMZN'),
            (      'Low', 'AMZN'),
            (     'Open', 'AMZN'),
            (   'Volume', 'AMZN'),
            ( 'PE_Ratio',     '')],
           names=['Price', 'Ticker'])
Columns: Index(['Adj Close', 'Close', 'High', 'Low', 'Open', 'Volume', 'PE_Ratio',
       'Log_Return', 'Volatility_7d', 'MA_7', 'MA_30', 'MA_365', 'Pct_Change',
       'Sentiment'],
      dtype='object')
             Adj Close       Close        High         Low        Open  \
Date                                                                     
2025-04-21  167.320007  167.320007  169.600006  165.289993  169.600006   
2025-04-22  173.179993  173.179993  176.779999  169.350006  169.850006   
2025-04-23  180.600006  180.600006  187.380005  180.190002  183.449997   
2025-04-24  186.539993  186.539993  186.740005  180.179993  180.919998   
2025-04-25  188.990005  188.990005  189.940002  185.490005

In [55]:
# Adjust data for logistic regression

# Drop unecessary columns
amzn_data = amzn_data.drop(['Close', 'High', 'Low', 'Open'], axis=1)

# Distance Close vs MA_7
amzn_data['Close vs MA_7'] = (amzn_data['Adj Close'] - amzn_data['MA_7']) / amzn_data['MA_7']
amzn_data['Close vs MA_7'] *= 100
# Distance MA_7 to MA_30
amzn_data['MA_7 vs MA_30'] = (amzn_data['MA_7'] - amzn_data['MA_30']) / amzn_data['MA_30']
amzn_data['MA_7 vs MA_30'] *= 100
# Distance MA_30 to MA_365
amzn_data['MA_30 vs MA_365'] = (amzn_data['MA_30'] - amzn_data['MA_365']) / amzn_data['MA_365']
amzn_data['MA_30 vs MA_365'] *= 100
amzn_data = amzn_data.drop(['MA_7', 'MA_30', 'MA_365'], axis=1)

# Normalize volume by 30-day average volume
amzn_data['Avg_Volume_30d'] = amzn_data['Volume'].rolling(window=30).mean()
amzn_data['Volume_Normalized_30d'] = amzn_data['Volume'] / amzn_data['Avg_Volume_30d']
amzn_data = amzn_data.drop(['Volume', 'Avg_Volume_30d'], axis=1)

# Yesterday's % change
amzn_data['Pct_Change'] *= 100
amzn_data['Yesterday Pct_Change'] = amzn_data['Pct_Change'].shift(1)
amzn_data['Yesterday Pct_Change'] *= 100

# Create 5-day momentum
amzn_data['5_Day_Momentum'] = amzn_data['Adj Close'] / amzn_data['Adj Close'].shift(4)


# Normalize P/E ratio by historical median
pe_median = amzn_data['PE_Ratio'].median()
amzn_data['PE_Ratio_Normalized'] = amzn_data['PE_Ratio'] / pe_median
amzn_data = amzn_data.drop(['PE_Ratio'], axis=1)

# Create 14-day RSI
delta = amzn_data['Adj Close'].diff()
gain = delta.clip(lower=0)
loss = -delta.clip(upper=0)
avg_gain = gain.rolling(window=14, min_periods=14).mean()
avg_loss = loss.rolling(window=14, min_periods=14).mean()
rs = avg_gain / avg_loss
amzn_data['RSI_14'] = 100 - (100 / (1 + rs))
amzn_data.drop(['Adj Close'], axis=1)

Unnamed: 0_level_0,Log_Return,Volatility_7d,Pct_Change,Sentiment,Close vs MA_7,MA_7 vs MA_30,MA_30 vs MA_365,Volume_Normalized_30d,Yesterday Pct_Change,5_Day_Momentum,PE_Ratio_Normalized,RSI_14
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2022-04-29,,,,Bearish,,,,,,,0.874237,
2022-05-02,0.001757,,0.175809,Bullish,,,,,,,0.875774,
2022-05-03,-0.001982,,-0.197990,Bearish,,,,,17.580913,,0.874040,
2022-05-04,0.013390,,1.348047,Bullish,,,,,-19.799045,,0.885822,
2022-05-05,-0.078622,,-7.561036,Bearish,,,,,134.804685,0.936640,0.818845,
...,...,...,...,...,...,...,...,...,...,...,...,...
2025-04-21,-0.031127,0.022598,-3.064708,Bearish,-5.701814,-6.088598,1.720136,0.850708,-98.663523,0.918735,1.176984,37.414969
2025-04-22,0.034423,0.024697,3.502262,Bullish,-1.763346,-6.343566,1.285156,1.004023,-306.470846,0.964308,1.218205,40.014715
2025-04-23,0.041953,0.029547,4.284567,Bullish,2.801386,-6.402602,0.937549,1.119482,350.226219,1.035966,1.270400,42.191146
2025-04-24,0.032361,0.031772,3.289029,Bullish,5.802276,-5.859718,0.651169,0.771855,428.456735,1.080702,1.312183,54.671873


In [56]:
amzn_data.describe()

Unnamed: 0,Adj Close,Log_Return,Volatility_7d,Pct_Change,Close vs MA_7,MA_7 vs MA_30,MA_30 vs MA_365,Volume_Normalized_30d,Yesterday Pct_Change,5_Day_Momentum,PE_Ratio_Normalized,RSI_14
count,750.0,749.0,743.0,749.0,744.0,721.0,386.0,721.0,748.0,746.0,750.0,736.0
mean,149.739601,0.00056,0.020785,0.082744,0.186076,0.792338,25.239953,0.995033,8.109902,1.003099,1.053317,52.691598
std,40.888884,0.023138,0.010105,2.318796,3.120777,4.924316,9.416517,0.370845,231.991008,0.046044,0.287626,15.456503
min,81.82,-0.094081,0.004918,-8.979129,-13.091996,-16.597182,0.493979,0.373715,-897.91293,0.830209,0.575549,15.687199
25%,115.002501,-0.012169,0.013895,-1.209516,-1.650563,-2.591694,17.4403,0.766906,-121.008786,0.979093,0.808965,40.739897
50%,142.160004,0.000423,0.018606,0.042309,0.31548,1.33816,25.979437,0.913697,4.178649,1.00333,1.0,54.347343
75%,183.727501,0.014053,0.024431,1.415267,1.991474,3.983802,31.991625,1.094102,141.642012,1.029796,1.292399,64.298401
max,242.059998,0.114915,0.067685,12.177848,9.715708,14.828901,41.030624,3.466735,1217.784762,1.179253,1.702729,85.676701


In [57]:
amzn_data.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 750 entries, 2022-04-29 to 2025-04-25
Data columns (total 13 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Adj Close              750 non-null    float64
 1   Log_Return             749 non-null    float64
 2   Volatility_7d          743 non-null    float64
 3   Pct_Change             749 non-null    float64
 4   Sentiment              750 non-null    object 
 5   Close vs MA_7          744 non-null    float64
 6   MA_7 vs MA_30          721 non-null    float64
 7   MA_30 vs MA_365        386 non-null    float64
 8   Volume_Normalized_30d  721 non-null    float64
 9   Yesterday Pct_Change   748 non-null    float64
 10  5_Day_Momentum         746 non-null    float64
 11  PE_Ratio_Normalized    750 non-null    float64
 12  RSI_14                 736 non-null    float64
dtypes: float64(12), object(1)
memory usage: 82.0+ KB
