In [21]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import streamlit as st
from sklearn.model_selection import TimeSeriesSplit
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from imblearn.over_sampling import SMOTE
import xgboost as xgb
import os
import yfinance as yf
import ta

In [22]:
# Load company data
company_data_path = r"C:\Users\Home\Downloads\Lambton\MHS\2nd Term\AI\Group Project\Demo\AI demo code 2\final_v2.csv"  # Use relative path
company_data = pd.read_csv(company_data_path)

In [23]:
# Create a target variable for classification (1 day ahead)
def classify_target(row):
    change = (row['Close_1_days_ahead'] - row['Close']) / row['Close']
    if change > 0.02:
        return 1  # Up
    elif change < -0.02:
        return 0  # Down
    else:
        return 2  # Neutral

company_data['Close_1_days_ahead'] = company_data['Close'].shift(-1)
company_data['Target'] = company_data.apply(classify_target, axis=1)
company_data = company_data[:-1]  # Remove the last 1 row with NaN target

company_data.head(15)


Unnamed: 0,Date,Company,Open,High,Low,Close,Volume,Sector,EMA_50,EMA_200,RSI,MACD,MACD_Signal,MACD_Hist,Season,Close_1_days_ahead,Target
0,1980-12-12,JNJ,0.644482,0.669556,0.644482,0.669556,5011200.0,Healthcare,,,,,,,Winter,0.677914,2
1,1980-12-15,JNJ,0.669556,0.688129,0.668627,0.677914,5932800.0,Healthcare,,,,,,,Winter,0.704845,1
2,1980-12-16,JNJ,0.677914,0.707631,0.676056,0.704845,9475200.0,Healthcare,,,,,,,Winter,0.709487,2
3,1980-12-17,JNJ,0.705773,0.713202,0.705773,0.709487,6801600.0,Healthcare,,,,,,,Winter,0.706702,2
4,1980-12-18,JNJ,0.709488,0.715989,0.703916,0.706702,4603200.0,Healthcare,,,,,,,Winter,0.711345,2
5,1980-12-19,JNJ,0.706701,0.715988,0.698343,0.711345,3321600.0,Healthcare,,,,,,,Winter,0.722489,2
6,1980-12-22,JNJ,0.714131,0.72806,0.714131,0.722489,4488000.0,Healthcare,,,,,,,Winter,0.718774,2
7,1980-12-23,JNJ,0.722489,0.726203,0.713202,0.718774,3009600.0,Healthcare,,,,,,,Winter,0.727132,2
8,1980-12-24,JNJ,0.718774,0.727132,0.715988,0.727132,2563200.0,Healthcare,,,,,,,Winter,0.742919,1
9,1980-12-26,JNJ,0.727132,0.742919,0.725275,0.742919,1296000.0,Healthcare,,,,,,,Winter,0.737347,2


In [7]:
company_data.tail(3)

Unnamed: 0,Date,Company,Open,High,Low,Close,Volume,Sector,EMA_50,EMA_200,RSI,MACD,MACD_Signal,MACD_Hist,Season,Close_1_days_ahead,Target
1425678,2024-06-14,FVX,4.205,4.245,4.196,4.226,0.0,Index,4.428591,4.290657,36.695882,-0.055544,-0.031539,-0.024005,Summer,4.3,2
1425679,2024-06-17,FVX,4.275,4.308,4.272,4.3,0.0,Index,4.423548,4.29075,42.454715,-0.056638,-0.036558,-0.020079,Summer,4.234,2
1425680,2024-06-18,FVX,4.308,4.312,4.224,4.234,0.0,Index,4.416115,4.290186,39.043233,-0.062115,-0.04167,-0.020445,Summer,4.266,2


In [8]:
company_data['Target'].unique()

array([2, 1, 0], dtype=int64)

In [9]:
company_data['Target'].value_counts()

Target
2    1175272
1     130945
0     119464
Name: count, dtype: int64

In [24]:
# Features and target
features = ['EMA_50', 'EMA_200', 'RSI', 'MACD', 'MACD_Signal', 'MACD_Hist']
X = company_data[features]
y = company_data['Target']

In [25]:
# Handle missing values by filling with mean
imputer = SimpleImputer(strategy='mean')
X_imputed = imputer.fit_transform(X)

In [26]:
# Scale the data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_imputed)

In [27]:
# Define classifiers to compare for best model selection
classifiers = {
    'GradientBoosting': GradientBoostingClassifier(random_state=42),
    'RandomForest': RandomForestClassifier(random_state=42, class_weight='balanced'),
    'AdaBoost': AdaBoostClassifier(random_state=42),
    'LogisticRegression': LogisticRegression(random_state=42, max_iter=10000, class_weight='balanced'),
    'SVM': SVC(random_state=42, class_weight='balanced'),
    'XGBoost': xgb.XGBClassifier(random_state=42, scale_pos_weight=1)
}

In [None]:
# Use TimeSeriesSplit for time series cross-validation
tscv = TimeSeriesSplit(n_splits=5)

In [29]:
# Compare classifiers using time series cross-validation
best_classifier = None
best_score = 0
results = {}

for name, clf in classifiers.items():
    scores = []
    for train_index, test_index in tscv.split(X_scaled):
        X_train, X_test = X_scaled[train_index], X_scaled[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        # Oversample the minority classes (0 and 1) in the training set
        smote = SMOTE(random_state=42)
        X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

        clf.fit(X_train_resampled, y_train_resampled)
        y_pred = clf.predict(X_test)
        scores.append(accuracy_score(y_test, y_pred))

    avg_score = np.mean(scores)
    results[name] = avg_score
    if avg_score > best_score:
        best_score = avg_score
        best_classifier = clf

In [None]:
# Print results
print("\nModels & Accuracies:\n")
for name, score in results.items():
    print(f"{name}: {score:.4f}")

print(f"\nBest Classifier: {best_classifier.__class__.__name__} with score: {best_score:.4f}")


In [None]:
# Train the best classifier on the full training set
train_index, test_index = list(tscv.split(X_scaled))[-1]
X_train, X_test = X_scaled[train_index], X_scaled[test_index]
y_train, y_test = y.iloc[train_index], y.iloc[test_index]

smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

best_classifier.fit(X_train_resampled, y_train_resampled)

In [None]:
# Predict the model on the last split
y_pred = best_classifier.predict(X_test)

In [None]:
# Evaluate the model
classification_report_result = classification_report(y_test, y_pred)
confusion_matrix_result = confusion_matrix(y_test, y_pred)
accuracy_result = accuracy_score(y_test, y_pred)


In [None]:
# Print the evaluation metrics
print("\nClassification Report:\n", classification_report_result)
print("Confusion Matrix:\n", confusion_matrix_result)
print("Accuracy Score:", accuracy_result)

In [None]:
# Visualize the confusion matrix
plt.figure(figsize=(8, 6))
sns.heatmap(confusion_matrix(y_test, y_pred), annot=True, fmt='d', cmap='Blues', xticklabels=['Down', 'Up', 'Neutral'], yticklabels=['Down', 'Up', 'Neutral'])
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix')
plt.show()