In [19]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.stats as ss
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import tensorflow as tf
from tensorflow import keras
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn.metrics import RocCurveDisplay
from tqdm import tqdm
import math
from sklearn import metrics

In [20]:
def PCA(X , num_components):
     
    #Step-1
    X_meaned = X - np.mean(X , axis = 0)
     
    #Step-2
    cov_mat = np.cov(X_meaned , rowvar = False)
     
    #Step-3
    eigen_values , eigen_vectors = np.linalg.eigh(cov_mat)
     
    #Step-4
    sorted_index = np.argsort(eigen_values)[::-1]
    sorted_eigenvalue = eigen_values[sorted_index]
    sorted_eigenvectors = eigen_vectors[:,sorted_index]
     
    #Step-5
    eigenvector_subset = sorted_eigenvectors[:,0:num_components]
     
    #Step-6
    X_reduced = np.dot(eigenvector_subset.transpose() , X_meaned.transpose() ).transpose()
     
    return X_reduced

In [21]:
class Logistic_Regression:
    
    def __init__(self, X, y, learningRate = 0.00001, tolerance = 0.00005,maxIteration = 5000 ):
        self.X_train = X
        self.y_train = y
        self.tolerance = tolerance
        self.maxIteration = maxIteration
        self.learningRate = learningRate
    
    def addX0(self):
        return np.column_stack([np.ones([X.shape[0],1]), X])
    
    def sigmoid(self, z):
        sig= 1 / (1+ np.exp(-z))
        return sig
    
    def costFunction (self, X, y):
        pred_ = np.log(np.ones(X.shape[0]) + np.exp(X.dot(self.w))) - X.dot(self.w).dot(y)
        cost = pred_.sum()
        return cost
    
    def gradient(self, X, y):
        sigmoid = self.sigmoid(X.dot(self.w))
        grad = (sigmoid - y).dot(X)
        return grad
    
    def gradientDescent(self, X, y):
        
        errors= []
        last = float('inf')
        
        for i in tqdm(range(self.maxIteration)):
            self.w = self.w - self.learningRate * self.gradient(X, y)
            curr = self.costFunction(X, y)
            
            diff = last - curr
            last = curr
            
            errors.append(curr)
            
        
    def predict(self, X):
        pred = self.sigmoid(X.dot(self.w))
        return np.around(pred)
        
    def evaluate(self, y, y_hat):
            
        y = (y == 1)
        y_hat= (y_hat == 1)
            
        accuracy = (y == y_hat).sum() / y.size
        precision = (y & y_hat).sum() / y_hat.sum()
        recall = (y & y_hat).sum() / y.sum()
        #print('accuracy was:' +str(accuracy) )
        #print('precision was:' +str(precision) )
        #print('recall was:' +str(recall) )
        return recall, precision, accuracy
        
    def fit(self):
            
        self.w = np.ones(self.X_train.shape[1], dtype = np.float64) * 0
        self.gradientDescent(self.X_train, self.y_train)
            
            
        y_hat_train = self.predict(self.X_train)
        recall, precision, accuracy = self.evaluate(self.y_train, y_hat_train)

In [22]:
def NN (X_train,y_train,X_test,y_test,input_shape):
    model = keras.Sequential([
    keras.layers.Flatten(input_shape=(input_shape,)),
    #keras.layers.Dense(264, activation=tf.nn.relu),
    #keras.layers.Dense(132, activation=tf.nn.relu),
    #keras.layers.Dense(264, activation=tf.nn.relu),
    #keras.layers.Dropout(0.5),
    keras.layers.Dense(64, activation=tf.nn.relu),
    keras.layers.Dropout(0.5),
    keras.layers.Dense(32, activation=tf.nn.relu),
    keras.layers.Dropout(0.5),
    keras.layers.Dense(1, activation=tf.nn.sigmoid)
    ])

    model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy','AUC','mse'])

    model.fit(X_train, y_train, epochs=10, batch_size=10)

    test_loss, test_acc, auc, rmse = model.evaluate(X_test, y_test)
    print('Test accuracy:', test_acc)
    print('AUC:', auc)
    
    
    pred=model.predict(X_test)
    
    
    
    for i in range(0,len(pred)):
        if(pred[i]>0.5):
            pred[i]=1
        else:
            pred[i]=0
    cm=confusion_matrix(y_test, pred)
    print('Precision: '+str(cm[1][1]/(cm[1][1]+cm[0][1])))
    print('Recall: '+str(cm[1][1]/(cm[1][1]+cm[1][0])))
    
    
  
    
    

In [18]:
 for m in range (1,6):
        
    #IMPORT DATA
    dataset=pd.read_csv('data.csv')
    dataset=dataset.replace('?',np.nan)
    dataset.to_csv('Data.csv',index=False)
    dataset=dataset[dataset['timeframe']==m]#.drop(['timeframe'], axis = 1)
    dataset.reset_index(inplace=True)
    dataset.drop(['index','timeframe'],axis=1,inplace=True)
    
    X=dataset.iloc[:,:-1].values
    y=dataset.iloc[:,-1].values
    
    
    
    
    imputer = SimpleImputer(missing_values=np.nan,strategy='median')
    imputer.fit(X[:,:])
    X[:,:] = imputer.transform(X[:,:])
    
    PCA
    X=PCA(X,32)
    
    #TRAIN TEST SPLIT
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=0)
    
    #IQR OUTLIER TRUNCATING
    for i in range(0,len(X[0])):
        low= np.quantile(X_train[:,i],0.15)
        high= np.quantile(X_train[:,i],0.85)
        X_train[:,i] = np.where(X_train[:,i] <low, low,X_train[:,i])
        X_train[:,i] = np.where(X_train[:,i] >high, high,X_train[:,i])
        X_test[:,i] = np.where(X_test[:,i] <low, low,X_test[:,i])
        X_test[:,i] = np.where(X_test[:,i] >high, high,X_test[:,i])
        
    
    #DATA OVERSAMPLING
    smote = SMOTE()
    #X_test_, y_test_ = X_test, y_test
    X_test, y_test = smote.fit_resample(X_test,y_test)
    X_train, y_train = smote.fit_resample(X_train,y_train)
    
    
    #STANDARD SCALING
    sc = StandardScaler()
    X_train= sc.fit_transform(X_train)
    X_test= sc.fit_transform(X_test)
    
    
    #NEURAL NETWORK
    print('XXXXXXXXXXXXX')
    print("DATASET: "+str(m))
    print('XXXXXXXXXXXXX\n\n')
    
    print('----------------------------------')
    print('--------NEURAL NETWORK------------')
    print('----------------------------------')
    print(' ')
    

    NN(X_train,y_train,X_test,y_test,X.shape[1])
    print('\n\n')
    
   
    
    #LOGISTIC REGRESSION
    print('----------------------------------')
    print('-------LOGISTIC REGRESSION--------')
    print('----------------------------------')
    print(' ')
    
    
    
    lr = Logistic_Regression(X_train, y_train)
    lr.fit()
    pred=lr.predict(X_test)
    from sklearn.metrics import accuracy_score
    print(('Test accuracy: ')+str(accuracy_score(y_test, pred)))
    cm = confusion_matrix(y_test, pred)
    print('Precision: '+str(cm[1][1]/(cm[1][1]+cm[0][1])))
    print('Recall: '+str(cm[1][1]/(cm[1][1]+cm[1][0])))
    
    
    
    print('\n\n')
        

XXXXXXXXXXXXX
DATASET: 1
XXXXXXXXXXXXX


----------------------------------
--------NEURAL NETWORK------------
----------------------------------
 
Epoch 1/10


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test accuracy: 0.635371208190918
AUC: 0.7431184649467468
Precision: 0.7540983606557377
Recall: 0.4017467248908297



----------------------------------
-------LOGISTIC REGRESSION--------
----------------------------------
 


100%|██████████| 5000/5000 [00:01<00:00, 3296.91it/s]


Test accuracy: 0.6451965065502183
Precision: 0.6423982869379015
Recall: 0.6550218340611353



XXXXXXXXXXXXX
DATASET: 2
XXXXXXXXXXXXX


----------------------------------
--------NEURAL NETWORK------------
----------------------------------
 
Epoch 1/10


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test accuracy: 0.5368188619613647
AUC: 0.5868837833404541
Precision: 0.625
Recall: 0.18409425625920472



----------------------------------
-------LOGISTIC REGRESSION--------
----------------------------------
 


100%|██████████| 5000/5000 [00:02<00:00, 2443.52it/s]


Test accuracy: 0.5589101620029455
Precision: 0.5533333333333333
Recall: 0.6111929307805597



XXXXXXXXXXXXX
DATASET: 3
XXXXXXXXXXXXX


----------------------------------
--------NEURAL NETWORK------------
----------------------------------
 
Epoch 1/10


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test accuracy: 0.628387987613678
AUC: 0.7223835587501526
Precision: 0.7284263959390863
Recall: 0.4094151212553495



----------------------------------
-------LOGISTIC REGRESSION--------
----------------------------------
 


100%|██████████| 5000/5000 [00:02<00:00, 2277.86it/s]


Test accuracy: 0.5970042796005706
Precision: 0.5901856763925729
Recall: 0.6348074179743224



XXXXXXXXXXXXX
DATASET: 4
XXXXXXXXXXXXX


----------------------------------
--------NEURAL NETWORK------------
----------------------------------
 
Epoch 1/10


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test accuracy: 0.6468373537063599
AUC: 0.7793164253234863
Precision: 0.7656675749318801
Recall: 0.42319277108433734



----------------------------------
-------LOGISTIC REGRESSION--------
----------------------------------
 


100%|██████████| 5000/5000 [00:01<00:00, 2508.26it/s]


Test accuracy: 0.7010542168674698
Precision: 0.6811397557666214
Recall: 0.7560240963855421



XXXXXXXXXXXXX
DATASET: 5
XXXXXXXXXXXXX


----------------------------------
--------NEURAL NETWORK------------
----------------------------------
 
Epoch 1/10


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test accuracy: 0.7251908183097839
AUC: 0.7920770049095154
Precision: 0.7940199335548173
Recall: 0.6081424936386769



----------------------------------
-------LOGISTIC REGRESSION--------
----------------------------------
 


100%|██████████| 5000/5000 [00:01<00:00, 3934.15it/s]

Test accuracy: 0.7150127226463104
Precision: 0.7217847769028871
Recall: 0.6997455470737913






