### This notebook contains the first questions of our Assignment about Data Visualization, Cross Validation, Linear Regression and Logistic Regression

##### Importing Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot  as plt
import seaborn as sns

from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_curve, roc_auc_score, classification_report, accuracy_score, confusion_matrix 
from sklearn.metrics import mean_squared_error, mean_absolute_error, classification_report
from sklearn import metrics

import statsmodels.api as sm
from scipy import stats

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import KFold

##### Importing Dataset

In [None]:
# Load the CSV at Data/BTC-USD.csv (path is relative to the notebook's working directory).
dataset = pd.read_csv("Data/BTC-USD.csv")
# Printed explicitly: only the last expression of a cell is displayed automatically.
print(dataset.shape)
dataset.head()

In [None]:
# (rows, columns) of the loaded frame; same value as printed in the loading cell.
dataset.shape

We noticed that the Close and Adj Close columns are exactly the same.

In [None]:
# Sum of absolute row-wise differences: it is 0 only when Close and Adj Close match on
# every row. A plain signed sum could also be 0 when positive and negative differences cancel.
(dataset['Close'] - dataset['Adj Close']).abs().sum()

Open: The price of the coin at the beginning of the trading day. <br>

High: The highest price of the coin on a trading day. <br>

Low: The lowest price of the coin on a trading day. <br>

Close: The last price of the coin before the trading day ends. <br>

Volume: The amount of a token traded in a specific time interval. <br>

We converted the Date column to datetime type to use it more efficiently.

In [None]:
# Parse the Date column into pandas datetimes (replaces the column in place).
dataset["Date"] = pd.to_datetime(dataset['Date'])
# Display the converted column to confirm its dtype.
dataset['Date']

In [None]:
# Calendar year of each row; used further down to average the closing price per year.
dataset['Year'] = dataset['Date'].dt.year
dataset

In [None]:
# Summary statistics of the frame's columns.
dataset.describe()

#### Preprocessing

In [None]:
# Number of closing-price observations, as a one-element shape tuple.
dataset["Close"].shape

In [None]:
# Signed intraday move: positive when the coin closed below its opening price.
dataset['pos_neg'] = dataset['Open'] - dataset['Close']
# Binary direction label: if Open > Close then 0, else 1.
# Stored as integers rather than the strings '0'/'1' so that the labels, and the class
# predictions of a model trained on them, can be consumed as numbers by metric functions.
dataset['Up/Down'] = np.where(dataset['pos_neg'] > 0, 0, 1)
# Shown last so the two new columns are actually displayed (a mid-cell head() shows nothing).
dataset.head()

In [None]:
# Distinct values present in the direction label column.
dataset['Up/Down'].unique()

That column contains only 0 and 1.

We have to check for NaN values in our dataset

In [None]:
# Number of missing values in each column.
dataset.isna().sum()

Before using MinMaxScaler, we check for negative values in our dataset. MinMaxScaler rescales each feature to a given range, which is [0, 1] by default, so the checks below simply confirm that none of the price or volume columns contains a negative value.

In [None]:
# Count the rows whose opening price is below zero.
dataset['Open'].lt(0).sum()

In [None]:
# Count the rows whose closing price is below zero.
dataset['Close'].lt(0).sum()

In [None]:
# Count the rows whose daily high is below zero.
dataset['High'].lt(0).sum()

In [None]:
# Count the rows whose daily low is below zero.
dataset['Low'].lt(0).sum()

In [None]:
# Count the rows whose traded volume is below zero.
dataset['Volume'].lt(0).sum()

#### Visualization

We used the following formula to check the relationship between the variables, and we came to the conclusion that there is a strong relationship between them.

In [None]:
# Ordinary least squares model of Close on High, Low and Open, specified as a formula string
# whose names refer to columns of `dataset`.
model = sm.formula.ols(formula="Close ~ High + Low + Open", data=dataset)
multi_reg = model.fit()
# Print the fitted model's summary report.
print(multi_reg.summary())

##### Diagrams

In [None]:
# Line plot of the closing price against the row index, using the Axes returned by pandas.
close_ax = dataset['Close'].plot()
close_ax.set_ylabel("Daily Bitcoin price")

In [None]:
# Histogram of the daily closing price.
hist_ax = dataset['Close'].plot(style='k.', kind='hist')
# Label the value axis so the figure can be read on its own.
hist_ax.set_xlabel('Closing price')
plt.title('Histogram of closing price')
plt.show()

In [None]:
# Mean closing price per calendar year (grouped by the Year column), plotted as a line.
by_year = dataset['Close'].groupby(dataset['Year']).mean()
by_year.plot()

In [None]:
# Traded volume over time; 'Date' and 'Volume' are resolved as column names of `dataset`
# through the `data=` argument.
plt.plot('Date', 'Volume', data=dataset)
plt.xlabel('Year')
plt.ylabel('Volume')
plt.title('Volume trend')

In [None]:
# Scatter of the closing price against the date.
ax1 = dataset.plot.scatter(x='Date', y='Close', c='DarkBlue')

In [None]:
# Scatter of the traded volume against the date (rebinds `ax1` from the previous cell).
ax1 = dataset.plot.scatter(x='Date',
                      y='Volume',
                      c='DarkBlue')

#### Dataset scaling
We scaled our dataset using MinMaxScaler

In [None]:
# Columns to rescale, named once so the selection and the assignment cannot drift apart.
price_columns = ['Open', 'High', 'Low', 'Close', 'Volume']
# Work on an explicit copy: writing into a bare selection of `dataset` is chained
# assignment and triggers pandas' SettingWithCopyWarning.
scaled_data = dataset[price_columns].copy()
# Default copy behaviour is used; in-place scaling of a temporary selection gains nothing.
scaler = MinMaxScaler()
scaled_data[price_columns] = scaler.fit_transform(scaled_data[price_columns])
# NOTE(review): the scaler is fitted on the whole series before any train/test split,
# so the min/max of the later test period influence the training features.
scaled_data

In [None]:
# Attach the direction label to the scaled frame; the label itself is copied unscaled from `dataset`.
scaled_data["Up/Down"] = dataset["Up/Down"].copy()

In [None]:
# Two panels side by side: Close before scaling (left) and after MinMax scaling (right).
# NOTE(review): each panel plots Close against itself, so the points always fall on the
# diagonal and only the axis ranges differ between panels — confirm this is the intended comparison.
fig, axes = plt.subplots(1,2)
axes[0].scatter(dataset.Close, dataset.Close)
axes[0].set_title("Original data")
axes[1].scatter(scaled_data.Close, scaled_data.Close)
axes[1].set_title("MinMax scaled data")
plt.show()

Now we are ready to create our models!

## Linear Regression 

###### Creating X and y variables

In [None]:
# Features: scaled Open, High, Low and Volume. Target: scaled Close.
X = scaled_data[['Open', 'High', 'Low', 'Volume']]
y = scaled_data['Close']

##### Splitting the Data

In [None]:
# 70/30 split without shuffling, so the row order of X and y is preserved.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, shuffle=False
)
# Report the shape of each piece in the order: X_train, y_train, X_test, y_test.
for part in (X_train, y_train, X_test, y_test):
    print(part.shape)

##### Cross Validation Score and Predict Score

In [None]:
# 5-fold cross-validation with a seeded shuffle; each fold is scored with R^2.
cv = KFold(n_splits=5, random_state=1, shuffle=True)
linReg = LinearRegression()
# Scores are computed on the full X / y, not on the train split created above.
scores = cross_val_score(linReg, X, y, scoring='r2', cv=cv, n_jobs=1)
scores

In [None]:
# Cross-validated predictions for every row (cv=6); used by the expected-vs-predicted plot below.
predicted = cross_val_predict(linReg, X, y, cv=6)
# NOTE(review): this rebinds `scores` from the previous cell with a cv=36 run; the fold
# counts (6 and 36) differ from each other and from the KFold object above — confirm intended.
scores = cross_val_score(linReg, X, y, scoring='r2', cv=36, n_jobs=1)
scores

##### Plots

In [None]:
# Cross-validated predictions against the true target, with a red identity line for reference.
fig, ax = plt.subplots()
ax.scatter(y, predicted, edgecolors=(1, 1, 1))
target_range = [y.min(), y.max()]
ax.plot(target_range, target_range, color='red')
ax.set(xlabel='Expected ', ylabel='Predicted ')
plt.show()

In [None]:
# R^2 of each cross-validation fold, drawn on the Axes created here.
fig, ax = plt.subplots()
ax.plot(scores)
ax.set(xlabel='Folds', ylabel='R2 score')
plt.show()

##### Training the Model using KFold split

In [None]:
# Manual 10-fold cross-validation of the linear model, collecting R^2 and MSE per fold.
# The shuffle is seeded so a re-run reproduces the same folds and numbers.
kf = KFold(n_splits=10, shuffle=True, random_state=1)
scores = []
errors = []

for i, (training, testing) in enumerate(kf.split(X, y), start=1):
    # KFold yields positional row numbers, so select with .iloc; .loc would only be
    # equivalent while the index happens to be 0..n-1.
    X_train, X_test = X.iloc[training], X.iloc[testing]
    y_train, y_test = y.iloc[training], y.iloc[testing]
    # Fold counter is derived from the splitter instead of a hard-coded 10.
    print('Fold {}/{}'.format(i, kf.n_splits))
    print('--------------------------')

    linReg = LinearRegression()
    linReg.fit(X_train, y_train)
    # Predict once and reuse the result for both metrics.
    fold_pred = linReg.predict(X_test)
    score = metrics.r2_score(y_test, fold_pred)
    error = metrics.mean_squared_error(y_test, fold_pred)
    print('R2 Score: ', score)
    print('MSE: ', error)
    scores.append(score)
    errors.append(error)
    print('\n')

# After the loop, linReg / X_train / X_test / y_train / y_test belong to the LAST fold;
# the cells below rely on those bindings.

In [None]:
# MSE of each fold from the manual K-fold loop, drawn on the Axes created here.
fig, ax = plt.subplots()
ax.plot(errors)
ax.set(xlabel='Folds', ylabel='Errors')
plt.show()

In [None]:
# Parameters of the most recently fitted `linReg` (the last fold of the loop above).
print("Coefficients: \n", linReg.coef_)
print("Intercept: \n", linReg.intercept_)

##### Testing the Model

In [None]:
# Predict on the current X_test and place the predictions next to the true targets.
# Values are in MinMax-scaled units, because both X and y come from `scaled_data`.
y_pred_lin = linReg.predict(X_test)
dfp = pd.DataFrame({'Actual_Price': y_test, 'Predicted_Price': y_pred_lin})
dfp.head()

##### Polynomial Equation

In [None]:
# Rebuild the prediction by hand from the fitted coefficients and intercept; the coef_
# positions follow the feature order used for X (Open, High, Low, Volume).
# NOTE(review): this rebinds `y`, which until now held the regression target, so earlier
# cells will see the new value if they are re-run without restarting.
y = linReg.coef_[0] * X_test['Open'] + linReg.coef_[1] * X_test['High'] + linReg.coef_[2] * X_test['Low'] + linReg.coef_[3] * X_test['Volume'] + linReg.intercept_

##### Plotting

In [None]:
# Actual test targets and the hand-computed predictions (held in `y` at this point),
# drawn over a shared evenly spaced x axis.
plt.subplots(1, figsize=(12,12))
positions = np.linspace(0, y_test.size, y_test.size)
plt.plot(positions, y_test, color='red', label='Actual Data')
plt.plot(positions, y, label='Predicted Data')
plt.grid(color='#000000', linestyle='-', linewidth=0.5)
plt.legend(loc="upper left")

##### Scatter plot

In [None]:
# Actual against predicted values from the `dfp` frame built above.
plt.scatter(dfp['Actual_Price'], dfp['Predicted_Price'])

In [None]:
# R^2 of the fitted model on the current test split.
reg_score = linReg.score(X_test, y_test)
print("Linear Regression Score: ", reg_score)
# The value printed here is the mean absolute error, so it is labelled as such.
print("Mean Absolute Error: ", mean_absolute_error(y_test, y_pred_lin))
# Kept in `mse_lin` because the model comparison at the end of the notebook reads it.
mse_lin = mean_squared_error(y_test, y_pred_lin)
print("Mean Squared Error: ", mse_lin)

As we can see, the score is extremely high and our model's predictions are very close to the expected values.

## Logistic Regression 

##### Creating X and y variables

In [None]:
# Features: all five scaled columns. Target: the direction label as a NumPy array.
# NOTE(review): the label was derived from Open - Close, and both Open and Close are
# among the features — confirm this is intended.
X = scaled_data[['Open', 'High', 'Low', 'Close','Volume']]
y = dataset["Up/Down"].values

##### Splitting the data

In [None]:
# Global NumPy seed is kept for any later code that draws from the global generator.
np.random.seed(42)
# Pin the split explicitly so it does not depend on global RNG state or on cell execution order.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [None]:
# Report the shape of each piece in the order: X_train, y_train, X_test, y_test.
for part in (X_train, y_train, X_test, y_test):
    print(part.shape)

##### Training 

In [None]:
# 5-fold cross-validation with a seeded shuffle; each fold is scored with ROC AUC.
cv = KFold(n_splits=5, random_state=1, shuffle=True)
lr = LogisticRegression()
# Scores are computed on the full X / y, not on the train split created above.
scores = cross_val_score(lr, X, y, scoring='roc_auc', cv=cv, n_jobs=1)
scores

In [None]:
# Cross-validated class predictions for every row (cv=6).
predicted = cross_val_predict(lr, X, y, cv=6)
# NOTE(review): this rebinds `scores` from the previous cell with a cv=36 run; the fold
# counts (6 and 36) differ from each other and from the KFold object above — confirm intended.
scores = cross_val_score(lr, X, y, scoring='roc_auc', cv=36, n_jobs=1)
scores

In [None]:
# ROC AUC of each cross-validation fold, drawn on the Axes created here.
fig, ax = plt.subplots()
ax.plot(scores)
ax.set(xlabel='Folds', ylabel='AUC Score')
plt.show()

##### Creating the model

In [None]:
# Fresh classifier fitted on the training split only.
lr = LogisticRegression()
lr.fit(X_train, y_train)

In [None]:
# Predicted class labels on the test split, placed next to the true labels.
y_pred_log = lr.predict(X_test)
# Columns are named for what they hold: Up/Down direction labels, not prices.
dfp = pd.DataFrame({'Actual_Direction': y_test, 'Predicted_Direction': y_pred_log})
dfp.head()

In [None]:
# Hard class predictions and the probability of the second class (column 1) on the test split.
test_probs = lr.predict_proba(X_test)
y_pred_log = lr.predict(X_test)
y_pred_log_probs = test_probs[:, 1]

# AUC is computed from probabilities, accuracy from the hard labels.
test_auc_roc = roc_auc_score(y_test, y_pred_log_probs)
test_accuracy = accuracy_score(y_test, y_pred_log)

report_lines = (
    ('Confusion matrix:\n', confusion_matrix(y_test, y_pred_log)),
    ('Testing AUC: ', test_auc_roc),
    ('Testing accuracy: ', test_accuracy),
)
for caption, value in report_lines:
    print(caption, value)

##### Metrics

In [None]:
# ROC AUC and log loss are defined on predicted probabilities; feeding them hard class
# labels collapses the ROC curve to a single threshold and makes the log loss meaningless.
test_probabilities = lr.predict_proba(X_test)
# Column 1 is the probability of lr.classes_[1], the class treated as positive.
score = metrics.roc_auc_score(y_test, test_probabilities[:, 1])
# Passing the classifier's class order keeps the probability columns aligned with the labels.
error = metrics.log_loss(y_test, test_probabilities, labels=lr.classes_)
print('ROC AUC Score: ', score)
print('Loss: ', error)

In [None]:
# Per-class text report comparing the hard predictions with the true labels, rounded to 2 digits.
print(classification_report(y_test, y_pred_log, digits=2))

As we can see, we are not satisfied with the score that Logistic Regression returns. <br>
Next step: Neural Network

# **Neural Network**

##### Importing Libraries

In [None]:
import pandas as pd
import numpy as np
import math
from math import floor
import matplotlib.pyplot as plt

from sklearn.preprocessing import MinMaxScaler

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.data as Data

# Seed PyTorch's random number generator.
torch.manual_seed(42)

# Visualize training history
#from keras.models import Sequential
#from keras.layers import Dense

# Turn off pandas' chained-assignment warning for the rest of the notebook.
pd.options.mode.chained_assignment = None 

### Model Creation

In [None]:
# Choose CPU or Cuda device (If capable GPU present).
# Currently fixed to the CPU; the model and the tensors in train()/test() are moved with .to(device).
device = 'cpu'

class NeuralNetwork(nn.Module):
    """Fully connected regressor: two ReLU hidden layers followed by one linear output unit.

    Args:
        input_size: Number of input features per sample. Defaults to 50, the window
            length used when building the time-series datasets in this notebook.
        hidden_size: Width of both hidden layers. Defaults to 25.
    """

    def __init__(self, input_size=50, hidden_size=25):
        super().__init__()
        # Layers are created in the same order as before, so a fixed seed gives the same weights.
        self.hidden1 = torch.nn.Linear(input_size, hidden_size)   # hidden layer
        self.hidden2 = torch.nn.Linear(hidden_size, hidden_size)  # hidden layer
        self.out = torch.nn.Linear(hidden_size, 1)                # output layer

    def forward(self, x):
        """Map an input whose last dimension is ``input_size`` to a last dimension of 1."""
        z = F.relu(self.hidden1(x))  # activation function for first hidden layer
        z = F.relu(self.hidden2(z))  # activation function for second hidden layer
        return self.out(z)           # linear output, no activation

# Instantiate the network, move it to the selected device and print its layer structure.
model = NeuralNetwork().to(device)
print(model)

### Dataset Loading

In [None]:
# Reload the raw CSV. This rebinds `dataset`, so the columns added earlier in the
# notebook (Year, pos_neg, Up/Down) are no longer present on it.
dataset = pd.read_csv("Data/BTC-USD.csv")
print(dataset.shape)
dataset.head()

#### Scaling Dataset 
We scaled our dataset using MinMaxScaler

In [None]:
# Columns to rescale, named once so the selection and the assignment cannot drift apart.
feature_columns = ['Open', 'High', 'Low', 'Close', 'Volume']
# Explicit copy: writing into a bare selection of `dataset` is chained assignment.
scaled_data = dataset[feature_columns].copy()
# Default copy behaviour is used; in-place scaling of a temporary selection gains nothing.
scaler = MinMaxScaler()
scaled_data[feature_columns] = scaler.fit_transform(scaled_data[feature_columns])
# The network is trained on the scaled closing price only.
X = scaled_data['Close']
X

At this point, we had to create a TimeSeries dataset class in order to build a sliding window. This window helped us create the batches for our neural network.

In [None]:
class TimeSeriesDataset(Data.Dataset):
    """Sliding-window dataset over a 1-D series for next-step prediction.

    Item ``i`` is the pair ``(data[i:i + window], data[i + window])``: ``window``
    consecutive values and the value that immediately follows them.

    Args:
        data: A pandas Series (its ``.values`` are used) or any 1-D array-like of numbers.
        window: Number of consecutive values in each input; must be at least 1.

    Raises:
        ValueError: If ``window`` is smaller than 1.
    """

    def __init__(self, data, window):
        if window < 1:
            raise ValueError("window must be a positive integer")
        # pandas objects expose their array through .values; other inputs are used directly.
        values = getattr(data, "values", data)
        self.data = torch.as_tensor(values, dtype=torch.float32)
        self.window = window

    def __getitem__(self, index):
        size = len(self)
        # Negative indices count from the end, as for ordinary sequences.
        if index < 0:
            index += size
        # An explicit IndexError also ends plain `for x, y in dataset` iteration cleanly.
        if not 0 <= index < size:
            raise IndexError("TimeSeriesDataset index out of range")
        return (self.data[index:index + self.window], self.data[index + self.window])

    def __len__(self):
        # A series no longer than the window holds no complete (window, target) pair;
        # clamp at 0 so len() never receives a negative number.
        return max(0, len(self.data) - self.window)

#### Data Preparation

In [None]:
split_ratio = 0.8
# Length of each input window; it has to equal the input width of the network's first layer (50).
window_size = 50

train_size = floor(X.size * split_ratio)
# Derived from train_size so the two sizes always add up to X.size; flooring
# X.size * (1 - split_ratio) separately can lose a row to rounding.
test_size = X.size - train_size

# Chronological split: the first train_size values train the model, the remainder tests it.
train_dataset = TimeSeriesDataset(X[:train_size], window_size)
test_dataset = TimeSeriesDataset(X[train_size:], window_size)

# Plain SGD on all model parameters, with mean squared error as the regression loss.
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
loss_fn = torch.nn.MSELoss()

#### Training Function

In [None]:
def train(dataset, model, loss_fn, optimizer):
    """Run one pass over ``dataset``, taking an optimizer step after every sample.

    Each item of ``dataset`` is an ``(input, target)`` pair that is moved to the
    module-level ``device`` before the forward pass. Progress is printed every
    100 samples.

    Returns:
        list[float]: the loss of each sample, in iteration order.
    """
    total = len(dataset)
    history = []
    model.train()
    for step, (inputs, target) in enumerate(dataset):
        inputs = inputs.to(device)
        target = target.to(device)

        # Forward pass; squeeze the trailing size-1 output dimension to match the target.
        loss = loss_fn(model(inputs).squeeze(-1), target)
        history.append(loss.item())

        # Backward pass and parameter update.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if step % 100 == 0:
            loss_value = history[-1]
            print(f"loss: {loss_value:>7f}  Batch: [{step:>5d}/{total:>5d}]")
    return history

#### Testing Function

In [None]:
def test(dataset, model, loss_fn):
    size = len(dataset)
    vlosses = []
    num_batches = len(dataset) - 50
    model.eval()
    test_loss, correct = 0, 0
    with torch.no_grad():
        for X, y in dataset:
            X, y = X.to(device), y.to(device)
            pred = model(X).squeeze(-1)
            test_loss += loss_fn(pred, y).item()
            vlosses.append(test_loss)
    test_loss /= num_batches
    print(f"Test Error: \n Avg loss: {test_loss:>8f} \n")
    #print(f"Avg Accuracy:  {1-test_loss:>8f} \n")
    return vlosses

#### Training the model for 20 epochs

In [None]:
EPOCHS = 20
tlosses = []  # one entry per epoch: mean of the list returned by train()
vlosses = []  # one entry per epoch: mean of the list returned by test()

for t in range(EPOCHS):
    print(f"Epoch {t + 1}\n-------------------------------")
    # One full pass over the training windows, then an evaluation on the test windows.
    etlosses = train(train_dataset, model, loss_fn, optimizer)
    tlosses.append(sum(etlosses)/len(etlosses))
    evlosses = test(test_dataset, model, loss_fn)
    vlosses.append(sum(evlosses)/len(evlosses))
print("Done!")

## Final Prediction

In [None]:
model.eval()
# The network was trained on min-max scaled closing prices, so the input window has to be
# scaled the same way and the network's output mapped back to price units.
close = dataset['Close']
close_min, close_max = close.min(), close.max()
scaled_window = (close[-50:] - close_min) / (close_max - close_min)
data = torch.tensor(scaled_window.values, dtype=torch.float32).to(device)
# Inference only: no gradients are needed.
with torch.no_grad():
    output = model(data)
# Invert the min-max scaling to express the prediction as a price.
predicted_price = output.item() * (close_max - close_min) + close_min
print(f'Predicted next-day price based on 50 previous ones: {predicted_price}')

## Plotting Losses (Training and Testing)

In [None]:
# Per-epoch training and testing curves over a shared 1..EPOCHS axis.
epoch_axis = np.linspace(1, EPOCHS, EPOCHS)
for curve in (tlosses, vlosses):
    plt.plot(epoch_axis, curve)
plt.legend(['Training Loss', 'Testing Loss'])

# **Comparing Models**

In [None]:
# One-row table placing a single number from each model side by side.
# NOTE(review): the three values are different quantities — `mse_lin` is a mean squared
# error on scaled prices, `error` was last assigned from metrics.log_loss in the
# logistic-regression section, and vlosses[-1] is the last epoch's test value — so they
# may not be directly comparable; confirm before ranking the models on them.
compDf = pd.DataFrame({'Linear Regression': mse_lin, 'Logistic Regression': error, 'Neural Network': vlosses[-1]},index=[0])
compDf

As we can see, Linear Regression has the best metrics. 