In [71]:
import pandas as pd
import numpy as np
from collections import Counter

In [72]:
df = pd.read_csv("Weekly.csv")

In [73]:
df.shape

(1089, 9)

# pre-proccessing data


###### change text value in direction column with numeric {0, 1} values

In [74]:
# Define function to : Iterate through the text column and apply the mapping
def text_mapping_to_num(mapping, data_row, data, default_value):
    for index, row in data.iterrows():
        text_value = row[data_row]
        if text_value in mapping:
            data.at[index, data_row] = mapping[text_value]

In [75]:
#mapping dictionary to map up and down to 1 , 0
direction_mapping = {"Down": 0, "Up": 1}

In [76]:
text_mapping_to_num(direction_mapping, 'Direction', df, 0)

In [77]:
df.head()

Unnamed: 0,Year,Lag1,Lag2,Lag3,Lag4,Lag5,Volume,Today,Direction
0,1990,0.816,1.572,-3.936,-0.229,-3.484,0.154976,-0.27,0
1,1990,-0.27,0.816,1.572,-3.936,-0.229,0.148574,-2.576,0
2,1990,-2.576,-0.27,0.816,1.572,-3.936,0.159837,3.514,1
3,1990,3.514,-2.576,-0.27,0.816,1.572,0.16163,0.712,1
4,1990,0.712,3.514,-2.576,-0.27,0.816,0.153728,1.178,1


##### Define X, y and test and train data

In [78]:
# Define the percentage of data to use for training (80%)
train_percentage = 0.8

# Calculate the number of samples for training and testing
samples = len(df)
num_of_train = int(train_percentage * samples)
num_of_test = samples - num_of_train

In [79]:
df.sample(frac = 1)

Unnamed: 0,Year,Lag1,Lag2,Lag3,Lag4,Lag5,Volume,Today,Direction
848,2006,-1.875,-2.604,1.156,-0.051,1.719,2.648372,1.036,1
229,1994,0.768,-3.414,-0.048,-0.317,0.612,0.254938,0.751,1
464,1999,0.241,3.219,1.849,-0.874,-1.308,0.607868,3.731,1
853,2006,-0.563,-0.061,-2.788,0.630,1.036,2.255366,2.065,1
350,1996,-1.393,1.450,-0.114,2.225,-0.122,0.410840,0.407,1
...,...,...,...,...,...,...,...,...,...
1078,2010,0.948,1.650,-0.212,2.050,1.446,4.449160,0.586,1
841,2006,-0.620,-0.329,2.016,-0.451,-0.171,2.170618,0.049,1
482,1999,0.735,-1.597,2.870,-2.177,4.223,0.869660,-0.535,0
102,1992,-0.807,0.906,-1.011,3.169,5.018,0.215200,-1.613,0


### Define X, y and split data into train and test

In [80]:
train_data = df[:num_of_train]
test_data = df[num_of_train:]

In [81]:
print(train_data.shape)
print(test_data.shape)


(871, 9)
(218, 9)


In [82]:
X_train = train_data.iloc[:, 1:7]
y_train = train_data.iloc[:, -1]
X_test = test_data.iloc[:, 1:7]
y_test = test_data.iloc[:, -1]

In [83]:
y_train = y_train.values
y_test = y_test.values

##### Adding column of 1's in fearure matrix for bias term

In [84]:
#Add column of 1's into X_train
X_train_bias = np.c_[np.ones((X_train.shape[0], 1)) ,X_train]

#add column of ones into X_test
X_test_bias = np.c_[np.ones((X_test.shape[0], 1)), X_test]

### SDG Logistic regression

In [85]:
def sigmoid(y):
    return 1/(1+ np.exp(-y.astype(float)))

In [86]:
def logistic_regresion(X, y, lr, epochs, batch_size):
    N, D = X.shape
    W = np.zeros(D)
    gradient = np.inf
    for i in range(epochs):
        
        for b in range(0, N, batch_size):
            X_batch = X[i:i+batch_size]
            y_batch = y[i:i+batch_size]
#             lr = 1 / batch_size

            linear_pred = np.dot(X, W)
            pred = sigmoid(linear_pred)

            if(len(pred) < batch_size):
                break
                
            gradient = (1/N)*np.dot(X.T, (pred - y))
            W = W - lr*gradient
        
    return W

### Training on Train data

In [87]:
#Train w with training set
batch_size = {10, 20, 30, 40, 50, 100}
weights = []
for b in batch_size:
    w = logistic_regresion(X_train_bias, y_train, 0.01, 200, 100)
    weights.append(w)

### Predict on test data

In [88]:
#Predict on test data
c_list = []
for w in weights:
    linear_pred = np.dot(X_test_bias, w)
    y_pred = sigmoid(linear_pred)
    class_prediction = [0 if y<=0.5 else 1 for y in y_pred]
    c_list.append(class_prediction)

### Accuracy calculation

In [89]:
#calculate accuracy
acc_list = []
for c in c_list:
    acc = np.sum(class_prediction != y_test)/ len(y_test)
    acc_list.append(acc)

In [90]:
print(acc_list)

[0.46788990825688076, 0.46788990825688076, 0.46788990825688076, 0.46788990825688076, 0.46788990825688076, 0.46788990825688076]


# KNN Classifier

In [91]:
# pd.to_numeric(df['DataFrame Column'])
for x in X_train:
    pd.to_numeric(df[x])
for x in X_test:
    pd.to_numeric(df[x])

In [92]:
def euclidean_distance(test_point, training_data):
    distances = []
    for x_train in training_data:
        
        distances.append(np.sqrt(np.sum((test_point - x_train)**2)))
    return distances

In [93]:
#Create KNN classifier
def KNN_classifier(test_point, training_data):
    distances = []
    for x_train in training_data:
        distances.append(euclidean_distance(test_point, x_train))
    return distances

In [94]:
def prediction(X_test, X_train, y_train, k=1):
    predictions = []
    for x in X_test:
        k_nearest_labels = []
        distances = KNN_classifier(x, X_train)
        #get the closest k
        k_indices = np.argsort(distances)[:k]
        for i in k_indices:
            k_nearest_labels.append(y_train[i])
            
        # majority vote
        most_common = Counter(k_nearest_labels).most_common()
        predictions.append(most_common[0][0])
    
    return predictions

In [None]:
k_values = {3, 5, 7}
test_errors = {}
for k in k_values:
    # Make predictions on the test set for each k
    predictions  = prediction(X_test.values, X_train.values, y_train, k)
    number_of_error = np.sum(predictions != y_test)
    # Calculate the test error 
    test_error = number_of_error / len(y_test)
    # Store the test error for this k
    test_errors[k] = test_error
    
for k, error in test_errors.items():
    print(f"k={k}: Test Error={error}")

In [None]:
predictions  = prediction(X_test.values, X_train.values, y_train, 1)
print(predictions)

In [26]:
#Define accuracy
knn_acc = []
for y in predictions:
    acc = np.sum(y == y_test)/len(y_test)
    knn_acc.append(acc)

In [27]:
print(knn_acc)

[]
