In [1]:
import yfinance as yf
import datetime
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [2]:
def get_historical_data(ticker_symbol, start_date, end_date):

    # Create a ticker object
    ticker = yf.Ticker(ticker_symbol)

    # Get historical data for the specified date range
    historical_data = ticker.history(start=start_date, end=end_date)

    # Add a 'Ticker' column with the ticker symbol
    historical_data['Ticker'] = ticker_symbol

    # Reset the index to have the date as a column
    historical_data.reset_index(inplace=True)

    return historical_data


In [3]:
start_date = datetime.datetime(1960, 1, 1)
end_date = datetime.datetime(2024, 12, 31)

In [4]:
ticker_symbols  = ['AAPL', 'AMZN', 'TSLA', 'MSFT', 'GOOGL', 'META', 'JPM', 'JNJ', 'XOM', 'NVDA', 'GE','KO', 'F',\
                   'IBM', 'PG', 'MMM', 'AXP', 'CVX', 'T', 'MCD', '^GSPC', '^DJI', '^IXIC','^RUT' ]

In [11]:
all_historical_data = []
for symbol in ticker_symbols:
    # Get historical data for the current ticker symbol
    historical_data = get_historical_data(symbol, start_date, end_date)
    
    # Append the historical data for the current ticker symbol to the list
    all_historical_data.append(historical_data)

# Concatenate all historical data into a single DataFrame
combined_data = pd.concat(all_historical_data, ignore_index=True)

# Display the combined data
combined_data

Unnamed: 0,Date,Open,High,Low,Close,Volume,Dividends,Stock Splits,Ticker
0,1980-12-12 00:00:00-05:00,0.099192,0.099623,0.099192,0.099192,469033600,0.0,0.0,AAPL
1,1980-12-15 00:00:00-05:00,0.094448,0.094448,0.094017,0.094017,175884800,0.0,0.0,AAPL
2,1980-12-16 00:00:00-05:00,0.087548,0.087548,0.087117,0.087117,105728000,0.0,0.0,AAPL
3,1980-12-17 00:00:00-05:00,0.089273,0.089704,0.089273,0.089273,86441600,0.0,0.0,AAPL
4,1980-12-18 00:00:00-05:00,0.091861,0.092292,0.091861,0.091861,73449600,0.0,0.0,AAPL
...,...,...,...,...,...,...,...,...,...
278962,2024-03-12 00:00:00-04:00,2066.149902,2072.949951,2052.929932,2065.479980,4080510000,0.0,0.0,^RUT
278963,2024-03-13 00:00:00-04:00,2062.750000,2080.189941,2062.350098,2071.709961,4282890000,0.0,0.0,^RUT
278964,2024-03-14 00:00:00-04:00,2064.280029,2064.280029,2017.280029,2031.180054,4687970000,0.0,0.0,^RUT
278965,2024-03-15 00:00:00-04:00,2020.630005,2041.880005,2018.949951,2039.319946,7753670000,0.0,0.0,^RUT


In [12]:
combined_data.to_csv('ticker_data.csv')

In [6]:
tickers = combined_data['Ticker'].unique()

In [7]:
predictions_2024 = {}

for ticker in tickers:
    # Filter data for the current ticker
    ticker_df = combined_data[combined_data['Ticker'] == ticker]
    
    # Splitting into features (X) and target variable (y)
    X = ticker_df[['Open', 'High', 'Low', 'Close', 'Volume', 'Dividends', 'Stock Splits']]
    y = ticker_df['Close']
    
    # Train-test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    # Define parameter grid for grid search
    param_grid = {
        'max_depth': [3, 5, 7],
        'learning_rate': [0.01, 0.1, 0.3],
        'n_estimators': [50, 100, 200],
        'subsample': [0.7, 0.8, 0.9],
        'colsample_bytree': [0.7, 0.8, 0.9]
    }

    # Initialize XGBoost regressor
    xgb_model = XGBRegressor()

    # Perform grid search
    grid_search = GridSearchCV(estimator=xgb_model, param_grid=param_grid, cv=5, scoring='r2')
    grid_search.fit(X_train, y_train)

    # Get best parameters
    best_params = grid_search.best_params_

    # Use best model for predictions
    best_model = grid_search.best_estimator_
    y_pred = best_model.predict(X_test)

    # Model evaluation
    mse = mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    print(f"Performance metrics for {ticker}:")
    print("Mean Squared Error (MSE):", mse)
    print("Mean Absolute Error (MAE):", mae)
    print("R-squared Score:", r2)

    # Prediction for 2024
    X_2024 = ticker_df.tail(1)[['Open', 'High', 'Low', 'Close', 'Volume', 'Dividends', 'Stock Splits']]
    predicted_price_2024 = best_model.predict(X_2024)
    predictions_2024[ticker] = predicted_price_2024[0]


Performance metrics for AAPL:
Mean Squared Error (MSE): 0.2926682344155394
Mean Absolute Error (MAE): 0.17454042083709256
R-squared Score: 0.9998406716651025
Performance metrics for AMZN:
Mean Squared Error (MSE): 0.2010704260619034
Mean Absolute Error (MAE): 0.18142237386312599
R-squared Score: 0.9999192200228055
Performance metrics for TSLA:
Mean Squared Error (MSE): 1.3692443647015529
Mean Absolute Error (MAE): 0.5514744854870477
R-squared Score: 0.9998612919437039
Performance metrics for MSFT:
Mean Squared Error (MSE): 1.5318008114255455
Mean Absolute Error (MAE): 0.4889866582434897
R-squared Score: 0.999777398944067
Performance metrics for GOOGL:
Mean Squared Error (MSE): 0.07583784759655078
Mean Absolute Error (MAE): 0.1501796937133913
R-squared Score: 0.9999529200230509
Performance metrics for META:
Mean Squared Error (MSE): 0.6906120242947997
Mean Absolute Error (MAE): 0.5427784567711337
R-squared Score: 0.9999230526477436
Performance metrics for JPM:
Mean Squared Error (MSE): 

In [8]:
print("\nPredicted stock prices for the end of 2024:")
for ticker, price in predictions_2024.items():
    print(f"{ticker}: {price}")


Predicted stock prices for the end of 2024:
AAPL: 174.84368896484375
AMZN: 174.9945068359375
TSLA: 174.51162719726562
MSFT: 408.0936279296875
GOOGL: 147.60194396972656
META: 496.2339782714844
JPM: 180.15707397460938
JNJ: 156.67526245117188
XOM: 113.30441284179688
NVDA: 792.8280029296875
GE: 169.05841064453125
KO: 59.95356750488281
F: 12.134787559509277
IBM: 178.36154174804688
PG: 159.2642822265625
MMM: 104.71663665771484
AXP: 215.69224548339844
CVX: 155.5887451171875
T: 17.309261322021484
MCD: 278.5050354003906
^GSPC: 5008.52294921875
^DJI: 38879.51171875
^IXIC: 15986.876953125
^RUT: 2032.547607421875
