In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

In [2]:
# Data import
# Read the raw dataset from "dataset.csv" (resolved relative to the working
# directory) into a DataFrame, then echo it for a first look at its contents.
data = pd.read_csv(filepath_or_buffer="dataset.csv")
print(data)

        Year   Lag1   Lag2   Lag3   Lag4   Lag5   Volume  Today Direction
0     2001.0  0.381 -0.192 -2.624 -1.055  5.010  1.19130  0.959        Up
1     2001.0  0.959  0.381 -0.192 -2.624 -1.055  1.29650  1.032        Up
2     2001.0  1.032  0.959  0.381 -0.192 -2.624  1.41120 -0.623      Down
3     2001.0 -0.623  1.032  0.959  0.381 -0.192  1.27600  0.614        Up
4     2001.0  0.614 -0.623  1.032  0.959  0.381  1.20570  0.213        Up
...      ...    ...    ...    ...    ...    ...      ...    ...       ...
1250  2005.0  0.422  0.252 -0.024 -0.584 -0.285  1.88850  0.043        Up
1251  2005.0  0.043  0.422  0.252 -0.024 -0.584  1.28581 -0.955      Down
1252  2005.0 -0.955  0.043  0.422  0.252 -0.024  1.54047  0.130        Up
1253  2005.0  0.130 -0.955  0.043  0.422  0.252  1.42236 -0.298      Down
1254  2005.0 -0.298  0.130 -0.955  0.043  0.422  1.38254 -0.489      Down

[1255 rows x 9 columns]


In [3]:
# Data cleaning, step 1: missing values
# Build the element-wise "is missing" mask once and reuse it for both the
# per-column totals and the listing of affected rows.
missing_mask = data.isnull()
null_value = missing_mask.sum()

print("--Null values--")
print(null_value)

# Show every row that has at least one missing entry.
print(data.loc[missing_mask.any(axis=1)])

--Null values--
Year         1
Lag1         0
Lag2         2
Lag3         0
Lag4         1
Lag5         0
Volume       2
Today        0
Direction    2
dtype: int64
        Year  Lag1  Lag2   Lag3   Lag4   Lag5  Volume  Today Direction
626   2003.0  0.34   NaN -0.810  1.164  0.802     NaN -0.559       NaN
1168     NaN -0.60   NaN -0.657    NaN  0.166     NaN  0.596       NaN


In [4]:
# Data cleaning, step 2: duplicate rows
# With the default `keep`, duplicated() leaves the first occurrence unmarked,
# so this sum counts only the extra copies.
duplicated_values = data.duplicated().sum()

print("--Duplicated values--")
print(duplicated_values)

# keep=False marks every member of a duplicate group, so the first occurrence
# and its repeats are listed together.
all_duplicate_rows = data.duplicated(keep=False)
print(data.loc[all_duplicate_rows])

--Duplicated values--
3
       Year   Lag1   Lag2   Lag3   Lag4   Lag5   Volume  Today Direction
430  2002.0 -1.461 -3.226  1.821  2.486 -1.728  1.72187  4.002        Up
431  2002.0 -1.461 -3.226  1.821  2.486 -1.728  1.72187  4.002        Up
498  2003.0  2.247 -0.048  3.320  0.049  0.456  1.43590 -0.654      Down
499  2003.0  2.247 -0.048  3.320  0.049  0.456  1.43590 -0.654      Down
799  2004.0 -1.435  1.246 -1.522 -1.463 -0.577  1.60060  0.562        Up
800  2004.0 -1.435  1.246 -1.522 -1.463 -0.577  1.60060  0.562        Up


In [5]:
# Data cleaning, step 3: remove rows with any missing value, then remove
# repeated rows (the first occurrence of each is kept).
# NOTE: `data` is rebound here, and the index is not reset, so the original
# row labels are preserved.
data = data.dropna().drop_duplicates()

# Sanity check: remaining row/column count and summary statistics of the
# numeric columns.
print(data.shape)
print(data.describe())

(1250, 9)
              Year         Lag1         Lag2         Lag3         Lag4  \
count  1250.000000  1250.000000  1250.000000  1250.000000  1250.000000   
mean   2003.016000     0.003834     0.003919     0.001716     0.001636   
std       1.409018     1.136299     1.136280     1.138703     1.138774   
min    2001.000000    -4.922000    -4.922000    -4.922000    -4.922000   
25%    2002.000000    -0.639500    -0.639500    -0.640000    -0.640000   
50%    2003.000000     0.039000     0.039000     0.038500     0.038500   
75%    2004.000000     0.596750     0.596750     0.596750     0.596750   
max    2005.000000     5.733000     5.733000     5.733000     5.733000   

             Lag5       Volume        Today  
count  1250.00000  1250.000000  1250.000000  
mean      0.00561     1.478305     0.003138  
std       1.14755     0.360357     1.136334  
min      -4.92200     0.356070    -4.922000  
25%      -0.64000     1.257400    -0.639500  
50%       0.03850     1.422950     0.038500  
7

In [6]:
# Label encoding
# Reference: https://www.geeksforgeeks.org/ml-label-encoding-of-datasets-in-python/
# Replace the string labels in 'Direction' with integer codes. As the printed
# output below the cell shows, 'Down' becomes 0 and 'Up' becomes 1.

# Labels before encoding.
print(data['Direction'].unique())

label_encoder = LabelEncoder()
direction_codes = label_encoder.fit(data['Direction']).transform(data['Direction'])
data['Direction'] = direction_codes

# Labels after encoding, plus a preview of the updated frame.
print(data['Direction'].unique())
print(data.head(n=5))

['Up' 'Down']
[1 0]
     Year   Lag1   Lag2   Lag3   Lag4   Lag5  Volume  Today  Direction
0  2001.0  0.381 -0.192 -2.624 -1.055  5.010  1.1913  0.959          1
1  2001.0  0.959  0.381 -0.192 -2.624 -1.055  1.2965  1.032          1
2  2001.0  1.032  0.959  0.381 -0.192 -2.624  1.4112 -0.623          0
3  2001.0 -0.623  1.032  0.959  0.381 -0.192  1.2760  0.614          1
4  2001.0  0.614 -0.623  1.032  0.959  0.381  1.2057  0.213          1


In [7]:
# Features and target
# The predictors are every column except 'Year', 'Today' and the target itself,
# which leaves the five lag columns and 'Volume'.
# NOTE(review): in the rows shown, the sign of 'Today' matches 'Direction', so
# keeping it as a feature would presumably leak the label - confirm.
X = data.drop(['Year', 'Today', 'Direction'], axis=1)
y = data.loc[:, 'Direction']

# Hold out 30% of the rows for testing; random_state pins the shuffle so the
# split is reproducible.
# NOTE(review): rows look chronologically ordered; a shuffled split mixes
# years between train and test - confirm this is intended.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=5,
)
print(X_train)

       Lag1   Lag2   Lag3   Lag4   Lag5   Volume
646  -1.766  0.272 -1.026  0.286 -0.181  1.35170
526  -0.160 -1.269 -0.810  0.757 -1.009  1.48930
1096  0.696  1.009 -0.458 -1.003  0.419  1.88726
193   0.456 -0.787 -1.863  0.694 -0.153  1.29490
997   0.046  0.342  0.904  0.038 -0.749  0.95610
...     ...    ...    ...    ...    ...      ...
73    1.594 -1.216 -1.498 -0.854  1.254  1.20360
1146  0.560  0.457  0.173 -0.377  0.541  2.00168
1002 -0.134  0.008 -0.007  0.715 -0.431  0.78690
206  -0.273  1.453  1.439  0.286  2.295  1.41130
871  -0.545 -0.298  0.851  0.364 -0.416  1.81290

[875 rows x 6 columns]


In [8]:
# Feature scaling
# Learn the scaling parameters from the training split only, then apply the
# same fitted scaler to both splits so the test rows do not influence the fit.
# NOTE: X_train / X_test are rebound to the scaled NumPy arrays, so this cell
# should not be re-run without re-running the split cell first.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)
print(X_train)

[[-1.52328442e+00  2.42179750e-01 -9.04420250e-01  2.54921136e-01
  -1.65844750e-01 -3.20590041e-01]
 [-8.80620384e-02 -1.11012491e+00 -7.12981760e-01  6.70989623e-01
  -9.04575286e-01  5.74628455e-02]
 [ 6.76913279e-01  8.88934155e-01 -4.01007926e-01 -8.83746124e-01
   3.69467232e-01  1.15084894e+00]
 ...
 [-6.48268068e-02  1.05065309e-02 -1.29145013e-03  6.33887975e-01
  -3.88891410e-01 -1.87236526e+00]
 [-1.89045930e-01  1.27856639e+00  1.28028288e+00  2.54921136e-01
   2.04320936e+00 -1.56840389e-01]
 [-4.32122199e-01 -2.58023792e-01  7.59144772e-01  3.23824198e-01
  -3.75508610e-01  9.46546523e-01]]


In [9]:
# Logistic Regression
# 1. Fit the model (default hyperparameters) on the scaled training split.
# 2. Report its accuracy on the held-out test split.
# 3. Print the confusion matrix and the classification report
#    (precision, recall, F1-score per class).
#
# Reference:
# https://scikit-learn.org/stable/modules/generated/sklearn.metrics.classification_report.html#sklearn.metrics.classification_report

log_regress = LogisticRegression().fit(X_train, y_train)

# Evaluate on the test split.
y_pred_log = log_regress.predict(X_test)
log_accuracy = log_regress.score(X_test, y_test)
log_con = confusion_matrix(y_test, y_pred_log)
log_report = classification_report(
    y_test,
    y_pred_log,
    target_names=['Down(0)', 'Up(1)'],  # display names for encoded classes 0 and 1
)

print(f"Logistic Regression Accuracy : {100*log_accuracy:.3f}%")
print("--Report--", log_con, log_report, sep="\n")

Logistic Regression Accuracy : 52.533%
--Report--
[[ 30 154]
 [ 24 167]]
              precision    recall  f1-score   support

     Down(0)       0.56      0.16      0.25       184
       Up(1)       0.52      0.87      0.65       191

    accuracy                           0.53       375
   macro avg       0.54      0.52      0.45       375
weighted avg       0.54      0.53      0.46       375



In [10]:
# Support Vector Machine
# 1. Fit an SVC on the scaled training split.
# 2. Report its accuracy on the held-out test split.
# 3. Print the confusion matrix and the classification report
#    (precision, recall, F1-score per class).

# Sigmoid kernel with C=10; class_weight='balanced' asks scikit-learn to
# weight the classes inversely to their frequency in y_train.
svc = SVC(kernel='sigmoid', C=10, class_weight='balanced').fit(X_train, y_train)

# Evaluate on the test split.
y_pred_svc = svc.predict(X_test)
svm_accuracy = svc.score(X_test, y_test)
svm_con = confusion_matrix(y_test, y_pred_svc)
svm_report = classification_report(
    y_test,
    y_pred_svc,
    target_names=['Down(0)', 'Up(1)'],  # display names for encoded classes 0 and 1
)

print(f"SVM Accuracy : {100*svm_accuracy:.3f}%")
print("--Report--", svm_con, svm_report, sep="\n")

SVM Accuracy : 57.600%
--Report--
[[ 99  85]
 [ 74 117]]
              precision    recall  f1-score   support

     Down(0)       0.57      0.54      0.55       184
       Up(1)       0.58      0.61      0.60       191

    accuracy                           0.58       375
   macro avg       0.58      0.58      0.58       375
weighted avg       0.58      0.58      0.58       375



In [11]:

# Fit the K Nearest Neighbors model using the training dataset.
# Print the accuracy of the model.
# Print the confusion matrix and the classification report including
# precision, recall, and F1-score.
#
# KNeighborsClassifier is already imported in the notebook's first (imports)
# cell, so it is not imported again here.

# Classify each test row by majority vote among its 7 nearest training rows.
knn = KNeighborsClassifier(n_neighbors=7)
knn.fit(X_train, y_train)

# Evaluate on the held-out test split.
y_pred_knn = knn.predict(X_test)

knn_accuracy = knn.score(X_test, y_test)
print(f"K Nearest Neighbor Accuracy : {100*knn_accuracy:.3f}%")

print("--Report--")
# target_names supplies display names for the encoded classes 0 and 1.
knn_report = classification_report(y_test, y_pred_knn, target_names=['Down(0)', 'Up(1)'])
knn_con = confusion_matrix(y_test, y_pred_knn)
print(knn_con)
print(knn_report)

K Nearest Neighbor Accuracy : 52.000%
--Report--
[[ 83 101]
 [ 79 112]]
              precision    recall  f1-score   support

     Down(0)       0.51      0.45      0.48       184
       Up(1)       0.53      0.59      0.55       191

    accuracy                           0.52       375
   macro avg       0.52      0.52      0.52       375
weighted avg       0.52      0.52      0.52       375

