In [56]:
import pandas as pd
import numpy as np
from sklearn.ensemble import GradientBoostingClassifier
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader
import torch.optim as optim
from sklearn.metrics import precision_score, recall_score

In [57]:
# load raw UAE stock price data
fab_raw = pd.read_csv("FAB.csv")
taqa_raw=pd.read_csv("TAQA.csv")

# preview raw datasets
fab_raw.head()
taqa_raw.head()

Unnamed: 0,Date,Price,Open,High,Low,Vol.,Change %
0,12/29/2023,3.5,3.35,3.5,3.34,83.71M,4.79%
1,12/28/2023,3.34,3.31,3.35,3.29,5.30M,1.21%
2,12/27/2023,3.3,3.3,3.33,3.3,3.71M,0.00%
3,12/26/2023,3.3,3.21,3.3,3.21,2.07M,2.48%
4,12/25/2023,3.22,3.23,3.26,3.2,5.31M,0.62%


In [58]:
# define a function that cleans and prepares UAE stock data
def prepuae(df,company, sector):

  # rename the price column
  df = df.rename(columns={"Price": "AdjClose"}).copy()

  # convert the date column to datatime format/style
  df["Date"] = pd.to_datetime(df["Date"])

  # remove rows with missing price or data values
  df = df.dropna(subset=["Date", "AdjClose"])

  # sort data in a chronological orger
  df["Return"] = df["AdjClose"].pct_change()

  # adding company, market, and sector labels
  df["Company"] = company
  df["Market"] = "ADX"
  df["Sector"] = sector

  # keep the only relevant columns
  return df[["Date", "Company", "Market", "Sector", "AdjClose", "Return"]]

In [59]:
# clean and prepare FAB and TAQA datasets
fab = prepuae(fab_raw, "FAB", "Banking")
taqa = prepuae(taqa_raw, "TAQA", "Energy")

# remove rows with NAs
fab = fab.dropna(subset=["Return"])
taqa = taqa.dropna(subset=["Return"])

# preview the cleaned version
fab.head()
taqa.head()

Unnamed: 0,Date,Company,Market,Sector,AdjClose,Return
1,2023-12-28,TAQA,ADX,Energy,3.34,-0.045714
2,2023-12-27,TAQA,ADX,Energy,3.3,-0.011976
3,2023-12-26,TAQA,ADX,Energy,3.3,0.0
4,2023-12-25,TAQA,ADX,Energy,3.22,-0.024242
5,2023-12-22,TAQA,ADX,Energy,3.2,-0.006211


In [60]:
%pip install yfinance


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [61]:
import yfinance as yf

In [62]:
us_symbols = ["JPM", "XOM"]

# download adjusted stock prices fro 2022-2023
raw_us = yf.download(
    us_symbols,
    start="2022-01-01",
    end="2023-12-31",
    auto_adjust=True,
    progress=False
)

In [63]:
# convert wide format data to long
us = raw_us["Close"].reset_index().melt(
    id_vars="Date",
    var_name="Company",
    value_name="AdjClose",
)

# sort data by data and company
us = us.sort_values(["Company", "Date"])

# calculate daily returns by company
us["Return"] = us.groupby("Company")["AdjClose"].pct_change()

# add market and sector labels
us["Market"] = "US"
us["Sector"] = us["Company"].map({"JPM": "Banking", "XOM": "Energy"})

# remove rows with NAs
us = us.dropna(subset=["AdjClose", "Return"])

# keep only relevant columns
us = us[["Date", "Company", "Market", "Sector", "AdjClose", "Return"]]

# previw cleaned U.S. data
us.head()

Unnamed: 0,Date,Company,Market,Sector,AdjClose,Return
1,2022-01-04,JPM,US,Banking,151.102249,0.03791
2,2022-01-05,JPM,US,Banking,148.339798,-0.018282
3,2022-01-06,JPM,US,Banking,149.915787,0.010624
4,2022-01-07,JPM,US,Banking,151.401184,0.009908
5,2022-01-10,JPM,US,Banking,151.546097,0.000957


In [64]:
# check data types
us.dtypes

Date        datetime64[ns]
Company             object
Market              object
Sector              object
AdjClose           float64
Return             float64
dtype: object

In [65]:
# merge all cleaned data
# combine cleaned UAE and US datasets
full = pd.concat([fab, taqa, us], ignore_index=True)

# sort combined data by company and date
full = full.sort_values(["Company", "Date"]).reset_index(drop=True)

# preview merged dataset
full.head()

Unnamed: 0,Date,Company,Market,Sector,AdjClose,Return
0,2022-01-03,FAB,ADX,Banking,18.88,0.004255
1,2022-01-04,FAB,ADX,Banking,18.8,0.006424
2,2022-01-05,FAB,ADX,Banking,18.68,-0.003202
3,2022-01-06,FAB,ADX,Banking,18.74,0.010787
4,2022-01-07,FAB,ADX,Banking,18.54,-0.010672


In [66]:
# check dataset structure
full.dtypes
full["Company"].value_counts()

Company
FAB     501
TAQA    501
JPM     500
XOM     500
Name: count, dtype: int64

In [67]:
# The UAE (ADX) and the U.S.(NYSE) do not have the same days of holidays and some days the U.S. markets are closed when UAE markets operate so the slightly different number od trading days is normal.

In [68]:
# create classification target
# create the binary target vairable indicating up or down trading day
full["UpDay"] = (full["Return"] > 0).astype(int)

# check  the class distribution
full["UpDay"].value_counts(normalize=True)

UpDay
0    0.507493
1    0.492507
Name: proportion, dtype: float64

In [69]:
# create lagged return features by company
full["Return_lag1"] = full.groupby("Company")["Return"].shift(1)
full["Return_lag2"] = full.groupby("Company")["Return"].shift(2)
full["Return_lag3"] = full.groupby("Company")["Return"].shift(3)

In [70]:
#computation of rolling volatility and rolling mean returns
full["RollingVol_5"] = (
    full.groupby("Company")["Return"]
    .rolling(window=5)
    .std()
    .reset_index(level=0, drop=True)
)

full["RollingMean_5"] = (
    full.groupby("Company")["Return"]
    .rolling(window=5)
    .mean()
    .reset_index(level=0, drop=True)
)


In [71]:
# remove rows with missing values
full = full.dropna().reset_index(drop=True)

# check shape
full.shape

(1986, 12)

In [72]:
# encoding categorical variables

# convert categorical to numberic with one-hot encoding
full_encoded = pd.get_dummies(
    full,
    columns=["Market", "Sector", "Company"],
    drop_first=True
)
# preview encoded dataset
full_encoded.head()

Unnamed: 0,Date,AdjClose,Return,UpDay,Return_lag1,Return_lag2,Return_lag3,RollingVol_5,RollingMean_5,Market_US,Sector_Energy,Company_JPM,Company_TAQA,Company_XOM
0,2022-01-07,18.54,-0.010672,0,0.010787,-0.003202,0.006424,0.008491,0.001519,False,False,False,False,False
1,2022-01-10,18.74,-0.011603,0,-0.010672,0.010787,-0.003202,0.010034,-0.001653,False,False,False,False,False
2,2022-01-11,18.96,-0.002105,0,-0.011603,-0.010672,0.010787,0.008989,-0.003359,False,False,False,False,False
3,2022-01-12,19.0,-0.01758,0,-0.002105,-0.011603,-0.010672,0.011,-0.006235,False,False,False,False,False
4,2022-01-13,19.34,0.0,0,-0.01758,-0.002105,-0.011603,0.007244,-0.008392,False,False,False,False,False


In [73]:
# define feature matrix and target var
X = full_encoded.drop(columns=["Date", "AdjClose", "Return", "UpDay"])
y = full_encoded["UpDay"]

X.shape, y.shape

((1986, 10), (1986,))

In [74]:
# split data into training and testing
split_date = "2023-07-01"

train = full_encoded["Date"] < split_date
test = full_encoded["Date"] >= split_date

In [75]:
# split feature matrix and target
X_train = X.loc[train]
X_test = X.loc[test]
y_train = y.loc[train]
y_test = y.loc[test]

# check shapes
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((1482, 10), (504, 10), (1482,), (504,))

In [76]:
# Scaling for Logistic Regression and KNN

from sklearn.preprocessing import StandardScaler

# initialize scaler
scaler = StandardScaler()

# fit scaler on training data only
X_train_scaled = scaler.fit_transform(X_train)

# apply scaler to test data
X_test_scaled = scaler.transform(X_test)

In [77]:
# Logistic Regression

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# train the logistic regression
log_reg = LogisticRegression(max_iter=1000)
log_reg.fit(X_train_scaled, y_train)

# predictions
y_predlog = log_reg.predict(X_test_scaled)

# evaluate
accuracy_score(y_test, y_predlog)

print (f"The accuracy score is: {accuracy_score(y_test, y_predlog):.3}")
print(f"Precision is: {precision_score(y_test, y_predlog):.3}")
print(f"Recall is: {recall_score(y_test, y_predlog):.3}")


The accuracy score is: 0.74
Precision is: 0.787
Recall is: 0.659


In [78]:
# K-Nearest Neighbors (KNN)

from sklearn.neighbors import KNeighborsClassifier

# initialize KNN
knn = KNeighborsClassifier(n_neighbors=20)

# train model
knn.fit(X_train_scaled, y_train)

# predictions
y_predknn = knn.predict(X_test_scaled)

# evaluate
accuracy_score(y_test, y_predknn)

print (f"The accuracy score is: {accuracy_score(y_test, y_predknn):.3}")
print (f"Precision is: {precision_score(y_test, y_predknn):.3}")
print (f"Recall is: {recall_score(y_test, y_predknn):.3}")


The accuracy score is: 0.627
Precision is: 0.639
Recall is: 0.583


In [79]:
# Decision Tree
from sklearn.tree import DecisionTreeClassifier

# initialize decision tree
dt = DecisionTreeClassifier(
    max_depth=5,
    min_samples_leaf= 20,
    random_state=42
)

# train model
dt.fit(X_train, y_train)

# predictions
y_preddt = dt.predict(X_test)

# evaluation
accuracy_score(y_test, y_preddt)

print (f"The accuracy score is: {accuracy_score(y_test, y_preddt):.3}")
print(f"Precision is: {precision_score(y_test, y_preddt):.3}")
print(f"Recall is: {recall_score(y_test, y_preddt):.3}")

The accuracy score is: 0.673
Precision is: 0.714
Recall is: 0.575


In [80]:
# Random Forest
from sklearn.ensemble import RandomForestClassifier

# initialize random forest
rf = RandomForestClassifier(
    n_estimators=200,
    max_depth=6,
    random_state=42
)

# train model
rf.fit(X_train, y_train)

# predictions
y_predrf = rf.predict(X_test)

# evaluation
accuracy_score(y_test, y_predrf)

print (f"The accuracy score is: {accuracy_score(y_test, y_predrf):.3}")
print(f"Precision is: {precision_score(y_test, y_predrf):.3}")
print(f"Recall is: {recall_score(y_test, y_predrf):.3}")

The accuracy score is: 0.663
Precision is: 0.723
Recall is: 0.528


In [81]:
#Gradient boosting classifier
gb = GradientBoostingClassifier(
  n_estimators= 300,
  learning_rate= 0.05,
  max_depth= 3,
  random_state=42
)
#Fit on train dataset
gb.fit(X_train, y_train)
#Predict on test dataset
y_predgb = gb.predict(X_test)

gb_accuracy = accuracy_score(y_test, y_predgb)
print(f"The accuracy score is: {gb_accuracy:.3}")
print(f"Precision is: {precision_score(y_test, y_predgb):.3}")
print(f"Recall is: {recall_score(y_test, y_predgb):.3}")


The accuracy score is: 0.704
Precision is: 0.715
Recall is: 0.679


In [82]:
#Neural network

Xtr = torch.tensor(X_train_scaled, dtype = torch.float32)
Xte = torch.tensor(X_test_scaled, dtype = torch.float32)
ytr = torch.tensor(y_train.values, dtype = torch.float32).view(-1, 1)
yte = torch.tensor(y_test.values, dtype = torch.float32).view(-1, 1)

train_loader = DataLoader(TensorDataset(Xtr, ytr), batch_size= 64, shuffle = True)
test_loader = DataLoader(TensorDataset(Xte, yte), batch_size = 256, shuffle = False)

#Define the model
input_dim =Xtr.shape[1]
model = nn.Sequential(
  nn.Linear(input_dim, 64),
  nn.ReLU(),
  nn.Linear(64, 32),
  nn.ReLU(),
  nn.Linear(32, 1),
)
#Loss and optimizer
loss_fn = nn.BCEWithLogitsLoss()
optimizer = optim.SGD(model.parameters(), lr=0.05)

#Training loop
losses = []
for epoch in range(200):
  yhat = model(Xtr)
  loss = loss_fn(yhat, ytr)
  optimizer.zero_grad()
  loss.backward()
  optimizer.step()

  losses.append(loss.item())
 #Evaluate on test
model.eval()
with torch.no_grad():
  logits = model(Xte)
  probs = torch.sigmoid(logits).numpy().ravel()
  preds = (probs>=0.5).astype(int)

test_acc = accuracy_score(y_test, preds)

print("Neural Network Results")
print("Accuracy", round(test_acc, 3))




Neural Network Results
Accuracy 0.595


In [84]:
# Model Performance by Market (UAE vs US)

# build set dataframe for evaluation by market
test_df = full_encoded.loc[test].copy()
test_df["y_true"] = y_test.values

# using random forest
test_df["y_pred"] = y_predrf

# reconstruct market label
test_df["Market"] = np.where(test_df["Market_US"] == 1, "US", "UAE")

# compute accuracy by market
market_accuracy = (
    test_df
    .groupby("Market")
    .apply(lambda df: (df["y_pred"] == df["y_true"]).mean())
)

market_accuracy

  .apply(lambda df: (df["y_pred"] == df["y_true"]).mean())


Market
UAE    0.634921
US     0.690476
dtype: float64

In [85]:
from sklearn.metrics import confusion_matrix

cm_uae = confusion_matrix(
    test_df.loc[test_df["Market"] == "UAE", "y_true"],
    test_df.loc[test_df["Market"] == "UAE", "y_pred"]
)

cm_us = confusion_matrix(
    test_df.loc[test_df["Market"] == "US", "y_true"],
    test_df.loc[test_df["Market"] == "US", "y_pred"]
)

cm_uae, cm_us

(array([[111,  28],
        [ 64,  49]]),
 array([[90, 23],
        [55, 84]]))

In [92]:
import pandas as pd
def metrics_from_cm(cm):
  TN, FP, FN, TP = cm.ravel()

  accuracy = (TP + TN) / cm.sum()
  precision = TP / (TP + FP) if (TP + FP) > 0 else 0
  recall = TP / (TP + FN) if (TP + FN) > 0 else 0
  f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
  return [accuracy, precision, recall, f1]


In [93]:
uae_metrics = metrics_from_cm(cm_uae)
us_metrics = metrics_from_cm(cm_us)

metrics_df = pd.DataFrame(
    [uae_metrics, us_metrics],
    columns=["Accuracy", "Precision (High Vol)", "Recall (High Vol)", "F1-score"],
    index=["UAE (ADX)", "US (NYUSE)"]
)

metrics_df

Unnamed: 0,Accuracy,Precision (High Vol),Recall (High Vol),F1-score
UAE (ADX),0.634921,0.636364,0.433628,0.515789
US (NYUSE),0.690476,0.785047,0.604317,0.682927


In [83]:
#Summary of all results obtained
model_results = pd.DataFrame({
    "Model": ["Logistic Regression", "KNN", "Decision Tree", "Random Forest", "Gradient Boosting", "Neural Network"],
    "Accuracy": [
        accuracy_score(y_test, y_predlog),
        accuracy_score(y_test, y_predknn),
        accuracy_score(y_test, y_preddt),
        accuracy_score(y_test, y_predrf),
        gb_accuracy,
        test_acc
    ]
})

model_results

Unnamed: 0,Model,Accuracy
0,Logistic Regression,0.740079
1,KNN,0.626984
2,Decision Tree,0.672619
3,Random Forest,0.662698
4,Gradient Boosting,0.704365
5,Neural Network,0.595238
