In [35]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.preprocessing import StandardScaler

# baseline model
from sklearn.dummy import DummyRegressor

# This is for regression
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error

# This is for the pipeline
from sklearn.pipeline import Pipeline 
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer

# we used for the Grid search model
from sklearn.model_selection import GridSearchCV 
from sklearn.model_selection import RandomizedSearchCV

### Read the dataset

In [10]:
# Load the cleaned Tadawul stock data (relative path: the CSV is read from the working directory)
tadawul_stcks = pd.read_csv('Tadawul_stcks_clean.csv')

In [11]:
# Preview the first five rows
tadawul_stcks.head()

Unnamed: 0,trading_name,sector,date,open_price,high_price,low_price,close_price,change,perc_Change,volume_traded,value_traded,num_trades,Year,month,day,Change_category
0,SARCO,Energy,2020-03-05,35.55,35.85,34.9,34.9,-0.4,-1.13,436609.0,15399073.5,804.0,2020,March,Thursday,Bad Change
1,SARCO,Energy,2020-03-04,34.7,35.65,34.5,35.3,0.25,0.71,737624.0,25981391.35,1268.0,2020,March,Wednesday,Good Change
2,SARCO,Energy,2020-03-03,34.7,35.15,34.7,35.05,1.05,3.09,489831.0,17116413.4,854.0,2020,March,Tuesday,Good Change
3,SARCO,Energy,2020-03-02,35.2,35.65,34.0,34.0,-0.55,-1.59,736157.0,25858700.6,1242.0,2020,March,Monday,Bad Change
4,SARCO,Energy,2020-03-01,35.35,35.6,34.25,34.55,-2.05,-5.6,738685.0,25747967.55,1625.0,2020,March,Sunday,Bad Change


# ML model

## Select the feature and target

In [12]:
# Column the models are trained to predict
target = 'close_price' # Target variable
# Predictor columns.
# NOTE(review): in the preview rows, `change` equals the day's close minus the
# previous day's close, so it is derived from the target itself. Using it as a
# feature likely leaks the target into the inputs - confirm this is intended.
features = ['open_price','low_price','change']


# Feature matrix and target vector used by every model in this notebook
X = tadawul_stcks[features]
y = tadawul_stcks[target]

## Standard Scaler

In [13]:
# Standardise the feature columns with StandardScaler.
# NOTE(review): the scaler is fitted on all rows before the train/test split, and
# `scaled_df` is not referenced again in the visible cells - the models are trained
# on the unscaled X. Confirm whether this cell is still needed.
scaler = StandardScaler()
scaled_df = scaler.fit_transform(X)

## Split the data

In [14]:
# Split the data into train and test sets.
# test_size is not passed, so scikit-learn's default split is used; random_state=42
# makes the shuffle reproducible.
# NOTE(review): the rows look like daily prices per stock; a random shuffle mixes
# earlier and later days across the two sets - confirm a time-based split is not needed.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

## Baseline model

In [15]:
# Baseline: a DummyRegressor with default settings. It serves as the reference
# that every trained model has to beat.
baseline_model = DummyRegressor().fit(X_train, y_train)
baseline_model_pred = baseline_model.predict(X_test)

# R² of the baseline predictions on the test set
print(f"baseline model score: {r2_score(y_test, baseline_model_pred)}")

baseline model score: -1.33849496373184e-05


In [16]:
# Evaluation metrics for the baseline model on the test set.
# Adjusted R² corrects R² for the number of features p, with n test rows:
#   1 - (1 - R²) * (n - 1) / (n - p - 1)
Adj_r2 = 1 - (1-r2_score(y_test, baseline_model_pred)) * (len(y_test)-1)/(len(y_test)-X_test.shape[1]-1)
print("R Squared:",r2_score(y_test, baseline_model_pred))
print("MSE:",mean_squared_error(y_test, baseline_model_pred))
print("MAE:",mean_absolute_error(y_test, baseline_model_pred))
# RMSE is the square root of the MSE
print("RMSE:",np.sqrt(mean_squared_error(y_test, baseline_model_pred)))
print("Adjusted R Squared:",Adj_r2)

R Squared: -1.33849496373184e-05
MSE: 7217.246782384993
MAE: 28.401060673463085
RMSE: 84.95438059561728
Adjusted R Squared: -3.409573535839705e-05


#### The output:
#### The baseline model's R² is essentially zero and its errors are very high, so every trained model should perform better than the baseline.


# Train Model

### 1st Linear Regression Model

In [17]:
# Linear regression with default settings, fitted on the training split
liner = LinearRegression().fit(X_train, y_train)
# Predicted close prices for the test split
linear_pred = liner.predict(X_test)

### Train and Test score

In [18]:
# .score() returns R² here; comparing test vs. train shows whether the model overfits
print(f"Test score: {liner.score(X_test, y_test)}")
print(f"Train score: {liner.score(X_train, y_train)}")

Test score: 0.999605900045618
Train score: 0.9993704661079523


# --------------------------------------------------------------------------------------

### 2nd Random Forest regression

In [19]:
# Random forest regressor: 10 trees, each limited to depth 4.
# The split criterion is left at its default (mean squared error). The spelling
# 'mse' was renamed to 'squared_error' in scikit-learn 1.0 and is rejected from
# 1.2 onwards, so omitting the argument gives the same criterion on old and new
# versions alike.
# random_state pins the bootstrap sampling so the scores are reproducible between
# runs (same seed as the train/test split).
random_reg = RandomForestRegressor(n_estimators=10, max_depth=4, random_state=42)
# Fit on the training split
random_reg.fit(X_train, y_train)
# Predicted close prices for the test split
random_predict = random_reg.predict(X_test)


### Train and Test Score

In [20]:
# .score() returns R² here; comparing test vs. train shows whether the model overfits
print(f"Test score: {random_reg.score(X_test, y_test)}")
print(f"Train score: {random_reg.score(X_train, y_train)}")

Test score: 0.9877141224669737
Train score: 0.9876510005758788


# ---------------------------------------------------------------------------------------

### 3rd KNN  Regression model

In [21]:
# K-nearest-neighbours regressor with scikit-learn's default settings,
# fitted on the training split
knn_reg = KNeighborsRegressor().fit(X_train, y_train)
# Predicted close prices for the test split
preds_knn_reg = knn_reg.predict(X_test)


### Train and Test Score

In [22]:
# .score() returns R² here; comparing test vs. train shows whether the model overfits
print(f"Test score: {knn_reg.score(X_test, y_test)}")
print(f"Train score: {knn_reg.score(X_train, y_train)}")

Test score: 0.9995150671521293
Train score: 0.9992696578602894


# ----------------------------------------------------------------------------------------

### 4th GBR  Regression model

In [23]:
# Gradient boosting regressor with scikit-learn's default settings,
# fitted on the training split
gbr_reg = GradientBoostingRegressor().fit(X_train, y_train)
# Predicted close prices for the test split
preds_gbr_reg = gbr_reg.predict(X_test)

## Train and test score

In [24]:
# .score() returns R² here; comparing test vs. train shows whether the model overfits
print(f"Test score: {gbr_reg.score(X_test, y_test)}")
print(f"Train score: {gbr_reg.score(X_train, y_train)}")

Test score: 0.9992229162311949
Train score: 0.999618349600064


# -----------------------------------------------------------------------------------------

### 5th XGB Regression Model

In [25]:
# XGBoost regressor: 75 trees of depth up to 7, each grown on a 75% row subsample.
# "reg:squarederror" is the current name of the squared-error objective; the old
# spelling "reg:linear" is deprecated in its favour and triggers a warning.
xgb_reg = xgb.XGBRegressor(objective="reg:squarederror",
                           n_estimators=75,
                           subsample=0.75,
                           max_depth=7)
# Fit on the training split
xgb_reg.fit(X_train, y_train)
# Predicted close prices for the test split
xgb_pred = xgb_reg.predict(X_test)



In [26]:
# .score() returns R² here; comparing test vs. train shows whether the model overfits
print(f"Test score: {xgb_reg.score(X_test, y_test)}")
print(f"Train score: {xgb_reg.score(X_train, y_train)}")

Test score: 0.9991932701703619
Train score: 0.9999117619114631


# ----------------------------------------------------------------------------------------

# Model Evaluation



### Create the Cost Function

####  This include  MSE, MAE,RMSE, R2  

In [27]:
# Print the regression evaluation metrics of a prediction vector on the test set
def cost_function(pred):
    """Print R², MSE, MAE, RMSE and adjusted R² of ``pred`` against ``y_test``.

    Parameters
    ----------
    pred : array-like
        Predictions for the rows of ``X_test``, in the same order as ``y_test``.

    Notes
    -----
    Reads the notebook globals ``y_test`` and ``X_test``; the feature count used
    for the adjusted R² is ``X_test.shape[1]``. Returns nothing.
    """
    n_rows = len(y_test)
    n_feats = X_test.shape[1]

    r2 = r2_score(y_test, pred)
    mse = mean_squared_error(y_test, pred)
    mae = mean_absolute_error(y_test, pred)

    # Adjusted R²: 1 - (1 - R²) * (n - 1) / (n - p - 1)
    adjusted = 1 - (1 - r2) * (n_rows - 1) / (n_rows - n_feats - 1)

    print("R Squared:", r2)
    print("MSE:", mse)
    print("MAE:", mae)
    print("RMSE:", np.sqrt(mse))
    print("Adjusted R Squared:", adjusted)
    

###  1st Model  Evaluation - Liner Regression Model

In [28]:
# Print the test-set metrics for the linear regression predictions
cost_function(linear_pred)

R Squared: 0.999605900045618
MSE: 2.844278557176044
MAE: 0.4264164928631083
RMSE: 1.6864989051808021
Adjusted R Squared: 0.9996058918836075


#### The output:
#### The R² of this model is 0.9996, which is high and far better than the baseline.
#### The MSE, MAE and RMSE are all very low.


# ----------------------------------------------------------------------------------------

###  2nd Model  Evaluation - Random Forest Model

In [29]:
# Print the test-set metrics for the random forest predictions
cost_function(random_predict)

R Squared: 0.9877141224669737
MSE: 88.66902326359956
MAE: 5.829261752002699
RMSE: 9.416423060992935
Adjusted R Squared: 0.9877138680202024


#### the output:
#### The R² of the Random Forest is 0.988, which is better than the baseline model but lower than the linear regression model.
#### Its MSE, MAE and RMSE are higher than those of linear regression, at about 88.7, 5.8 and 9.4.

# ----------------------------------------------------------------------------------------

###  3rd Model  Evaluation -KNN Model

In [30]:
# Print the test-set metrics for the KNN predictions
cost_function(preds_knn_reg)


R Squared: 0.9995150671521293
MSE: 3.4998331908765845
MAE: 0.30185866158582897
RMSE: 1.8707841112422845
Adjusted R Squared: 0.9995150571089235


#### The output :
#### The R² of the KNN model is 0.9995, which is better than both the baseline model and the Random Forest. Its MSE, MAE and RMSE are also lower than those of the Random Forest.

# ----------------------------------------------------------------------------------------

###  4th Model  Evaluation -GBR Model

In [31]:
# Print the test-set metrics for the gradient boosting predictions
cost_function(preds_gbr_reg)

R Squared: 0.9992229162311949
MSE: 5.608330262834369
MAE: 0.671718863606776
RMSE: 2.368191348441753
Adjusted R Squared: 0.9992229001373949


#### The output:
#### The R² of the GBR model is 0.9992, which is better than the baseline and close to the KNN and linear regression models. Its MSE, MAE and RMSE are slightly higher than those of the KNN model.

# ----------------------------------------------------------------------------------------

###  5th Model  Evaluation -XGB Model

In [32]:
# Print the test-set metrics for the XGBoost predictions
cost_function(xgb_pred)


R Squared: 0.9991932701703619
MSE: 5.822290336146822
MAE: 0.3990513375335785
RMSE: 2.412942257109942
Adjusted R Squared: 0.9991932534625768


#### The R² of the XGB model is 0.9992, which is better than the baseline model and close to all the other models except the Random Forest. Its MSE, MAE and RMSE are also low.

##  Summary Results

In [33]:
# Combine the test-set metrics of every model into one comparison table
df_perf_metrics = pd.DataFrame(columns=['Model', 'R2', 'MSE', 'MAE', 'RMSE'])
# Display names and prediction arrays, matched by position
model_name = ['Baseline model', 'Linear Regression', 'Random Forest', 'KNN', 'GBR', 'XGB']
model_pred = [baseline_model_pred, linear_pred, random_predict, preds_knn_reg, preds_gbr_reg, xgb_pred]
# A name without a prediction (or the reverse) would silently mislabel a row
assert len(model_name) == len(model_pred), "model_name and model_pred must have the same length"


def get_perf_metrics(model, n):
    """Write the test-set metrics of the n-th model into row ``n`` of ``df_perf_metrics``.

    Parameters
    ----------
    model : str
        Model name supplied by the loop. It is not used: the stored name is
        ``model_name[n]``. Kept so existing calls keep working.
    n : int
        Position in ``model_name`` / ``model_pred`` and label of the row written.

    Notes
    -----
    Reads the notebook globals ``y_test``, ``model_name`` and ``model_pred`` at
    call time and modifies ``df_perf_metrics`` in place. Returns nothing.
    """
    y_pred = model_pred[n]
    # Compute the MSE once; the RMSE is its square root
    mse = mean_squared_error(y_test, y_pred)

    df_perf_metrics.loc[n] = [
        model_name[n],
        r2_score(y_test, y_pred),
        mse,
        mean_absolute_error(y_test, y_pred),
        np.sqrt(mse),
    ]


# Fill one row per model
for n, model in enumerate(model_name):
    get_perf_metrics(model, n)

In [34]:
# Display the comparison table
df_perf_metrics

Unnamed: 0,Model,R2,MSE,MAE,RMSE
0,Baseline model,-1.3e-05,7217.246782,28.401061,84.954381
1,Linear Regression,0.999606,2.844279,0.426416,1.686499
2,Random Forest,0.987714,88.669023,5.829262,9.416423
3,KNN,0.999515,3.499833,0.301859,1.870784
4,GBR,0.999223,5.60833,0.671719,2.368191
5,XGB,0.999193,5.82229,0.399051,2.412942


# ---------------------------------------------------------------------------------------

# Model Selection 
#### After creating five regression models (Linear, Random Forest, KNN, GBR, XGB), every one of them has a positive R² far above the baseline's (about 0), so all of them are better than the baseline model.
#### We selected the Linear Regression model because it gives the highest R² and the lowest MSE and RMSE (KNN has a slightly lower MAE).
#### R Squared: 0.9996059 and RMSE: 1.6864989051808341

# ----------------------------------------------------------------------------------------

## Model Optimization - Hyperparameter Tuning 

## Linear Regression Model - Best Model

In [46]:
# Hyperparameter grid for linear regression: fit with or without an intercept
param_linear = {"fit_intercept": [True, False]}

# The grid has only two candidates, so it is searched exhaustively with
# GridSearchCV. RandomizedSearchCV (default n_iter=10) warns that the parameter
# space is smaller than n_iter and then evaluates the same two candidates anyway.
# The variable names are kept because later cells read `random`.
random_linear_reg = GridSearchCV(liner,         # estimator to tune
                                 param_linear,  # parameter grid
                                 scoring='r2',
                                 verbose=1,
                                 n_jobs=-1)

# Cross-validate both candidates on the training split
random_linear_reg.fit(X_train, y_train)
# Test-set predictions of the best candidate found by the search
random = random_linear_reg.predict(X_test)



Fitting 5 folds for each of 2 candidates, totalling 10 fits


In [40]:
# Print the test-set metrics for the tuned linear regression predictions
cost_function(random)

R Squared: 0.999605900045618
MSE: 2.844278557176044
MAE: 0.4264164928631083
RMSE: 1.6864989051808021
Adjusted R Squared: 0.9996058918836075


#### The R² score, MSE, MAE and RMSE are the same as for the untuned Linear Regression; tuning changed nothing.

##  Summary Results after tuning

In [41]:
# Rebuild the comparison table, this time including the tuned linear regression
df_perf_metrics = pd.DataFrame(columns=['Model', 'R2', 'MSE', 'MAE', 'RMSE'])
# Display names and prediction arrays, matched by position
model_name = ['Baseline model', 'Linear Regression', 'Linear Regression with tuning',
              'Random Forest', 'KNN', 'GBR', 'XGB']
model_pred = [baseline_model_pred, linear_pred, random, random_predict,
              preds_knn_reg, preds_gbr_reg, xgb_pred]
# A name without a prediction (or the reverse) would silently mislabel a row
assert len(model_name) == len(model_pred), "model_name and model_pred must have the same length"

# get_perf_metrics is defined once, in the "Summary Results" cell. It looks up
# model_name, model_pred and df_perf_metrics when it is called, so it fills the
# table created above without being defined a second time.
for n, model in enumerate(model_name):
    get_perf_metrics(model, n)

In [42]:
# Display the comparison table
df_perf_metrics

Unnamed: 0,Model,R2,MSE,MAE,RMSE
0,Baseline model,-1.3e-05,7217.246782,28.401061,84.954381
1,Linear Regression,0.999606,2.844279,0.426416,1.686499
2,Linear Regression with tuning,0.999606,2.844279,0.426416,1.686499
3,Random Forest,0.987714,88.669023,5.829262,9.416423
4,KNN,0.999515,3.499833,0.301859,1.870784
5,GBR,0.999223,5.60833,0.671719,2.368191
6,XGB,0.999193,5.82229,0.399051,2.412942


# ---------------------------------------------------------------------------------------

# ML Pipeline for Best Model - Linear Regression

In [26]:
# Numeric feature columns, selected by dtype. select_dtypes only inspects the
# column dtypes, whereas describe() computes summary statistics for every column
# just to obtain the names, and describes the non-numeric columns instead when a
# frame has no numeric ones.
numeric_features = X_train.select_dtypes(include='number').columns

# Transformer for the numeric columns: fill missing values, then standardise.
# NOTE(review): 'most_frequent' fills gaps with the most common value; for
# continuous prices a median or mean fill is more usual - confirm the choice.
numeric_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy="most_frequent")),
        ('scaler', StandardScaler())
    ]
)

# Preprocessor: apply the numeric transformer to the numeric feature columns
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
    ]
)

# Full pipeline: preprocessing followed by a linear regression model
liner_reg_pp = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('reg', LinearRegression())
    ]
)

## Fit the model

In [28]:
# Fit the preprocessing + linear regression pipeline on the training split
liner_reg_pp.fit(X_train,y_train)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='most_frequent')),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                                  Index(['open_price', 'low_price', 'change'], dtype='object'))])),
                ('reg', LinearRegression())])

In [29]:
# Predict the test-split close prices with the fitted pipeline
pipline_predict=liner_reg_pp.predict(X_test)

### Model Evaluation

In [30]:
# Print the test-set metrics for the pipeline predictions
cost_function(pipline_predict)

R Squared: 0.999605900045618
MSE: 2.8442785571761515
MAE: 0.4264164928631796
RMSE: 1.686498905180834
Adjusted R Squared: 0.9996058918836075


### Summary of our finding:

We tried several regression models to find the one that predicts the closing price of stocks with the highest score and the lowest possible error.

We found that linear regression is the best model: it achieved the highest R² score, 0.9996, and low errors, with an MSE of 2.84, an MAE of 0.43 and an RMSE of 1.69. It is followed by KNN, which achieved a score of 0.9995 and an RMSE of 1.87. Next come GBR and XGB, each with a score of 0.9992 and RMSEs of 2.37 and 2.41 respectively. The worst model was the Random Forest, with a score of 0.9877 and an RMSE of 9.42.

We select the Linear regression as the best model based on the lowest value in the Root Mean Squared Error (RMSE).