In [1]:
import numpy as np
import pandas as pd
import time # for tracking start and end time
import statistics as st # for calculating mean and stdev.

# library for split the dataset
from sklearn.model_selection import train_test_split

# Number of rows in every simulated dataset.
sample_size1 = 5000

# Seeded "random number generator" object: 888 is the seed, so re-running the
# notebook from a fresh kernel replays the same stream of draws. Without a
# seed, identical code would give different results on every run.
rand = np.random.RandomState(888)

# Simulate the features: a (sample_size1 x 5) matrix of independent normal
# draws with mean 0 and standard deviation 1 (swap `normal` for another
# distribution method if desired), with the columns labelled X0 ... X4.
df = pd.DataFrame(
    rand.normal(loc=0, scale=1, size=(sample_size1, 5)),
).rename(columns=lambda col: 'X%s' % col)

# add the outcome variable Y, this is the true pattern of this predictive regression problem
def gen_Y(X0,X1,X2,X3,X4,flip_fraction=0.1):
    """Generate the binary outcome Y for the simulated classification problem.

    The true pattern (arbitrarily chosen as an example) is the sign of
    ``0.01*(X1/X2) - 0.01*(X3/X4) + X0``: the label is 1 where that score is
    >= 0 and 0 otherwise, which gives roughly 50% of each class for
    symmetric, zero-centred inputs. Label noise is then injected by flipping
    the labels of the trailing ``flip_fraction`` of the rows (1 -> 0, 0 -> 1),
    so with the default 10% the best achievable accuracy is around 90%.

    Parameters
    ----------
    X0, X1, X2, X3, X4 : pandas.Series or numpy.ndarray
        Equal-length feature columns.
    flip_fraction : float, default 0.1
        Share of rows, counted from the end, whose label is flipped. Must lie
        in [0, 1]. For 5000 rows the default flips rows 4500..4999.

    Returns
    -------
    Same container type as the inputs, holding 0.0 / 1.0 float labels.

    Raises
    ------
    ValueError
        If ``flip_fraction`` is outside [0, 1].
    """
    if not 0 <= flip_fraction <= 1:
        raise ValueError("flip_fraction must be between 0 and 1")

    score = 0.01*(X1/X2)-0.01*(X3/X4)+X0 # the true pattern

    # Size the noisy tail from the data itself rather than from a global
    # sample-size constant, so the function works for any number of rows.
    n_rows = len(score)
    n_flip = int(round(n_rows*flip_fraction))
    flip_mask = np.arange(n_rows) >= n_rows - n_flip

    # XOR with the mask inverts the label exactly on the rows being flipped.
    Y = ((score >= 0) ^ flip_mask).astype(float)
    return Y

# Attach the outcome column: gen_Y turns the five feature columns into the
# binary label Y (this mutates df in place by adding the 'Y' column).
df['Y'] = gen_Y(df['X0'], df['X1'], df['X2'], df['X3'], df['X4'])

# Summary statistics of the complete dataset (features plus Y).
df.describe()

Unnamed: 0,X0,X1,X2,X3,X4,Y
count,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0
mean,-0.023237,0.039913,0.013534,0.005379,0.002783,0.495
std,1.005383,0.992115,1.015794,0.990326,1.005469,0.500025
min,-3.76494,-3.226533,-3.440813,-3.814431,-3.322366,0.0
25%,-0.7063,-0.623876,-0.687176,-0.661619,-0.675705,0.0
50%,-0.017885,0.030241,0.031031,0.013733,0.005609,0.0
75%,0.639442,0.712569,0.718059,0.670766,0.691538,1.0
max,3.557303,3.77857,3.279497,3.355865,4.399775,1.0


In [2]:
# load library for Naive Bayes
from sklearn.naive_bayes import GaussianNB

# Monte Carlo estimate of the Gaussian Naive Bayes test error: every round
# draws a fresh dataset, does a stratified 80/20 split, fits the model and
# records the share of misclassified test rows.
start_time = time.perf_counter() # monotonic clock meant for measuring elapsed time
test_error = [] # one test error rate per simulation round

# number of rounds of simulations
sim_rounds = 1000

for i in range(sim_rounds):
    # Data preparation
    df=pd.DataFrame(rand.normal(0,1, (sample_size1,5))) # generate matrix of X
    df=df.add_prefix('X') # add column names
    df['Y']=gen_Y(df.X0,df.X1,df.X2,df.X3,df.X4) # generate Y via my own function
    dataX = df.drop(columns=['Y']) # drop() already returns a new frame, no extra copy needed
    dataY = df['Y'].copy()

    # Model preparation: split the dataset into training set and test set.
    # random_state=rand makes the shuffling draw from the seeded generator;
    # without it the split uses NumPy's unseeded global state and the results
    # change on every run even though the data simulation is seeded.
    X_train, X_test, y_train, y_test = train_test_split(dataX,
                                    dataY, test_size=0.2, #20% as test set
                                    stratify=dataY,
                                    random_state=rand)

    # only two lines of code for NB
    gnb = GaussianNB() # no parameters to tune or specify here

    # gnb.fit(X_train, y_train) constructs the classifier from the training data;
    # .predict(X_test) sends the test features through the fitted classifier
    y_pred = gnb.fit(X_train, y_train).predict(X_test)

    # Record the test error rate (share of misclassified test rows)
    test_error.append((y_test != y_pred).sum()/y_test.size)

print("--- Test Error is: %s ---" % (st.mean(test_error)))
print("--- %s seconds ---" % (time.perf_counter() - start_time))

--- Test Error is: 0.124822 ---
--- 7.588811159133911 seconds ---


In [3]:
# Better way to generate performance metrics: use scikit-learn's helpers
# instead of hand-coding error rates.
# Note: y_test and y_pred are the values left over from the last simulation
# round of the preceding cell (when the cells are run in order), so these
# numbers describe that single round, not the average over all rounds.
from sklearn.metrics import confusion_matrix,accuracy_score
print(accuracy_score(y_test, y_pred),
      confusion_matrix(y_test, y_pred),
      sep='\n')

0.865
[[415  75]
 [ 60 450]]


In [4]:
# library for KNN
from sklearn.neighbors import KNeighborsClassifier

# Monte Carlo estimate of the K-nearest-neighbours training and test error:
# every round draws a fresh dataset, does a stratified 80/20 split, fits KNN
# and records both error rates.
start_time = time.perf_counter() # monotonic clock meant for measuring elapsed time
train_error = [] # one training error rate per simulation round
test_error = [] # one test error rate per simulation round

# number of rounds of simulations
sim_rounds = 1000

for i in range(sim_rounds):
    # data preparation
    df=pd.DataFrame(rand.normal(0,1, (sample_size1,5))) # generate matrix of X
    df=df.add_prefix('X') # add column names
    df['Y']=gen_Y(df.X0,df.X1,df.X2,df.X3,df.X4) # generate Y
    dataX = df.drop(columns=['Y']) # drop() already returns a new frame, no extra copy needed
    dataY = df['Y'].copy()

    # Model preparation: split the dataset into training set and test set.
    # random_state=rand makes the shuffling draw from the seeded generator;
    # without it the split uses NumPy's unseeded global state and the results
    # change on every run even though the data simulation is seeded.
    X_train, X_test, y_train, y_test = train_test_split(dataX,
                                    dataY, test_size=0.2, #20% as test set
                                    stratify=dataY,
                                    random_state=rand)

    # Build the model. KNN has no parametric model to estimate: fit() keeps
    # the training data and the neighbour search happens at prediction time.
    knn = KNeighborsClassifier(n_neighbors=5) # K=5 neighbours; K is the key tuning parameter
    knn.fit(X_train, y_train)

    # knn.score(X, y) returns the accuracy on (X, y), so error = 1 - score
    train_error.append(1-knn.score(X_train, y_train))
    test_error.append(1-knn.score(X_test, y_test))

print("--- Test Error is: %s ---" % st.mean(test_error))
print("--- Training Error is: %s ---" % st.mean(train_error))
print("--- %s seconds ---" % (time.perf_counter() - start_time))

--- Test Error is: 0.17209500000000003 ---
--- Training Error is: 0.12820275 ---
--- 180.17641258239746 seconds ---
