In [1]:
import pandas as pd
import numpy as np
from zipfile import ZipFile

from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.impute import SimpleImputer
from tqdm import tqdm
import copy

In [2]:
z = ZipFile("../caltech-cs155-2020.zip")

dfs = {text_file.filename: pd.read_csv(z.open(text_file.filename))
       for text_file in z.infolist()
       if text_file.filename.endswith('.csv')}
dfs.keys()

dict_keys(['sample_submission.csv', 'test.csv', 'train.csv'])

In [32]:
df_train = dfs['train.csv']
df_test = dfs['test.csv']
df_train.head()

Unnamed: 0,id,last_price,mid,opened_position_qty,closed_position_qty,transacted_qty,d_open_interest,bid1,bid2,bid3,...,bid2vol,bid3vol,bid4vol,bid5vol,ask1vol,ask2vol,ask3vol,ask4vol,ask5vol,y
0,0,3842.4,3842.6,,,103.0,0,3842.4,3842.0,3841.8,...,1,6,14,6,6,1,1,10,2,1
1,1,3842.8,3843.4,6.0,49.0,55.0,-43,3843.0,3842.8,3842.4,...,6,11,1,6,1,4,4,1,13,0
2,2,3844.0,3844.3,7.0,77.0,84.0,-69,3843.8,3843.6,3843.2,...,1,4,21,12,1,16,10,4,9,0
3,3,3843.8,3843.4,3.0,34.0,37.0,-30,3843.0,3842.8,3842.4,...,13,12,2,4,2,7,1,2,11,1
4,4,3843.2,3843.1,3.0,38.0,41.0,-35,3842.8,3842.4,3842.0,...,12,2,2,4,1,3,1,11,15,1


In [33]:
# Test if the imputer works in the first place
array_train = df_train.to_numpy()
imp = IterativeImputer(max_iter=100, random_state=0)
imp.fit(array_train)

IterativeImputer(add_indicator=False, estimator=None,
                 imputation_order='ascending', initial_strategy='mean',
                 max_iter=100, max_value=None, min_value=None,
                 missing_values=nan, n_nearest_features=None, random_state=0,
                 sample_posterior=False, skip_complete=False, tol=0.001,
                 verbose=0)

In [34]:
transform_train = imp.transform(array_train)
# Seems to work well. Let's do some tests whether it performs well too

In [35]:
# Remove all rows with NaN to have it cleaner 
array_no_NaN = array_train[~np.isnan(array_train).any(axis = 1)]
array_no_NaN

array([[1.00000e+00, 3.84280e+03, 3.84340e+03, ..., 1.00000e+00,
        1.30000e+01, 0.00000e+00],
       [2.00000e+00, 3.84400e+03, 3.84430e+03, ..., 4.00000e+00,
        9.00000e+00, 0.00000e+00],
       [3.00000e+00, 3.84380e+03, 3.84340e+03, ..., 2.00000e+00,
        1.10000e+01, 1.00000e+00],
       ...,
       [5.92374e+05, 4.10940e+03, 4.10980e+03, ..., 1.00000e+01,
        7.00000e+00, 1.00000e+00],
       [5.92375e+05, 4.11020e+03, 4.11030e+03, ..., 7.00000e+00,
        7.00000e+00, 1.00000e+00],
       [5.92376e+05, 4.10940e+03, 4.11050e+03, ..., 7.00000e+00,
        5.00000e+00, 0.00000e+00]])

In [36]:
def produce_NaN(array):
    row, col = np.random.randint(len(array_no_NaN)), np.random.randint(28)
    value = array[row, col]
    array[row, col] = np.NaN
    value_dict = {(row, col) : value}
    
    return array, value_dict

In [37]:
N = 100
NaN_true_dict = {}
for i in tqdm(range(N)):
    out = produce_NaN(array_no_NaN)
    NaN_true_dict.update(out[1])
    array_no_NaN = out[0]

key_list = list(NaN_true_dict.keys())

100%|██████████| 100/100 [00:00<00:00, 49466.97it/s]


In [38]:
imp.fit(array_no_NaN)
test = imp.transform(array_no_NaN)

In [39]:
Loss_array = np.empty(N)

for j, i in enumerate(key_list):
    value = test[i[0]][i[1]]
    loss = (NaN_true_dict[i] - value)**2
    Loss_array[j] = loss

In [40]:
mse = np.average(Loss_array)
print(mse)

8.46772518696692


Ok, this seems pretty bad, but we can still see how it performs for predictions. There are a few more things we can try:

- k-nearest neighbor imputation on the full data
- k-nearest on each feature vector
- normalized feature vector matrix imputation
- matrix imputation on single feature vectors
- simple imputation with the mean from the feature vector

## Normalized feature vectors

In [41]:
# Normalize feature vectors

norm_train = np.empty((len(array_no_NaN), 28))

for i in range(28):
    col_max = max(array_no_NaN[:, i])
    col_min = min(array_no_NaN[:, i])
    norm_train[:, i] = (array_no_NaN[:, i] - col_min) / (col_max - col_min)

In [42]:
N = 100
NaN_norm_dict = {}
for i in tqdm(range(N)):
    out = produce_NaN(norm_train)
    NaN_norm_dict.update(out[1])
    norm_train = out[0]

key_list_norm = list(NaN_norm_dict.keys())

100%|██████████| 100/100 [00:00<00:00, 50105.17it/s]


In [43]:
imp.fit(norm_train)
test = imp.transform(norm_train)

Loss_array = np.empty(N)

for j, i in enumerate(key_list_norm):
    value = test[i[0]][i[1]]
    loss = (NaN_norm_dict[i] - value)**2
    Loss_array[j] = loss
    
mse = np.average(Loss_array)
print(mse)

0.007850902672973293


Looks pretty good! But let's try some other one's

## Lets check where the NaNs are

In [44]:
array_train = df_train.to_numpy()

for i in range(28):
    feature = array_train[:,i]
    print(np.isnan(feature).any())


False
False
False
True
True
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False


In [45]:
np.argwhere(np.isnan(array_train[:,3])) == np.argwhere(np.isnan(array_train[:,4]))

array([[ True],
       [ True],
       [ True],
       ...,
       [ True],
       [ True],
       [ True]])

## Matrix imputation at only one feature vector
N.B.: we found that NaN's only occur at feature 3 and 4

In [46]:
vector_3 = copy.deepcopy(array_train[:,3])
vector_4 = copy.deepcopy(array_train[:,4])

# No more NaN's
vector_3 = vector_3[np.logical_not(np.isnan(vector_3))]
vector_4 = vector_4[np.logical_not(np.isnan(vector_4))]

#Normalize vector
col_max_3 = max(vector_3)
col_min_3 = min(vector_3)
vector_3 = (vector_3 - col_min_3)/(col_max_3 - col_min_3)

col_max_4 = max(vector_4)
col_min_4 = min(vector_4)
vector_4 = (vector_4 - col_min_4)/(col_max_4 - col_min_4)

In [47]:
vector_3_dict = {}
vector_4_dict = {}
indices = np.empty(int(len(vector_3)/10))

for i in range(int(len(vector_3)/10)):
    index = np.random.randint(len(vector_3))
    indices[i] = index
    
    value_3 = vector_3[index]
    temp_3 = {index : value_3}
    vector_3_dict.update(temp_3)
    
    value_4 = vector_4[index]
    temp_4 = {index : value_4}
    vector_4_dict.update(temp_4)
    
for j in indices:
    vector_3[int(j)] = np.NaN
    vector_4[int(j)] = np.NaN

In [48]:
vector_3_keys = list(vector_3_dict.keys())
vector_4_keys = list(vector_4_dict.keys())

vector_3 = np.reshape(vector_3, (-1,1))
vector_4 = np.reshape(vector_4, (-1,1))

In [49]:
imp.fit(vector_3)
test_3 = imp.transform(vector_3)
test_3 = test_3.flatten()

Loss_array = []

for i in vector_3_keys:
    value = test_3[i]
    loss = (vector_3_dict[i] - value)**2
    Loss_array.append(loss)
    
imp.fit(vector_4)
test_4 = imp.transform(vector_4)
test_4 = test_4.flatten()
    
for j in vector_4_keys:
    value = test_4[j]
    loss = (vector_4_dict[j] - value)**2
    Loss_array.append(loss)
    
mse = np.average(Loss_array)
print(mse)

0.00046725852362711234


## KNN Imputer unfortunately does not work, we get a memory error

In [4]:
array_train = df_train.to_numpy()
array_simple_mean = copy.deepcopy(array_train[:,3:5])
array_simple_median = copy.deepcopy(array_train[:,3:5])

## Set NaN's to be the mean or median

In [25]:
imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean')
imp_median = SimpleImputer(missing_values = np.NaN, strategy = 'median')

In [286]:
imp_mean.fit(array_simple_mean)
transformed_mean_array = imp_mean.transform(array_simple_mean)

imp_median.fit(array_simple_median)
transformed_median_array = imp_mean.transform(array_simple_median)

In [287]:
transformed_mean_array

array([[ 1.40058344,  1.96427653],
       [ 6.        , 49.        ],
       [ 7.        , 77.        ],
       ...,
       [ 1.40058344,  1.96427653],
       [ 1.40058344,  1.96427653],
       [ 1.40058344,  1.96427653]])

In [288]:
transformed_median_array

array([[ 1.40058344,  1.96427653],
       [ 6.        , 49.        ],
       [ 7.        , 77.        ],
       ...,
       [ 1.40058344,  1.96427653],
       [ 1.40058344,  1.96427653],
       [ 1.40058344,  1.96427653]])

In [15]:
simple_no_NaN = array_train[~np.isnan(array_train).any(axis=1)]

In [21]:
vector_3 = copy.deepcopy(simple_no_NaN[:,3])
vector_4 = copy.deepcopy(simple_no_NaN[:,4])

#Normalize vector
col_max_3 = max(vector_3)
col_min_3 = min(vector_3)
vector_3 = (vector_3 - col_min_3)/(col_max_3 - col_min_3)

col_max_4 = max(vector_4)
col_min_4 = min(vector_4)
vector_4 = (vector_4 - col_min_4)/(col_max_4 - col_min_4)

In [23]:
vector_3_dict = {}
vector_4_dict = {}
indices = np.empty(int(len(vector_3)/10))

for i in range(int(len(vector_3)/10)):
    index = np.random.randint(len(vector_3))
    indices[i] = index
    
    value_3 = vector_3[index]
    temp_3 = {index : value_3}
    vector_3_dict.update(temp_3)
    
    value_4 = vector_4[index]
    temp_4 = {index : value_4}
    vector_4_dict.update(temp_4)
    
for j in indices:
    vector_3[int(j)] = np.NaN
    vector_4[int(j)] = np.NaN

In [24]:
vector_3_keys = list(vector_3_dict.keys())
vector_4_keys = list(vector_4_dict.keys())

vector_3 = np.reshape(vector_3, (-1,1))
vector_4 = np.reshape(vector_4, (-1,1))

In [26]:
imp_mean.fit(vector_3)
test_3 = imp_mean.transform(vector_3)
test_3 = test_3.flatten()

Loss_array = []

for i in vector_3_keys:
    value = test_3[i]
    loss = (vector_3_dict[i] - value)**2
    Loss_array.append(loss)
    
imp_mean.fit(vector_4)
test_4 = imp_mean.transform(vector_4)
test_4 = test_4.flatten()
    
for j in vector_4_keys:
    value = test_4[j]
    loss = (vector_4_dict[j] - value)**2
    Loss_array.append(loss)
    
mse = np.average(Loss_array)
print(mse)

0.0004925922832461261


In [31]:
test_4[49497]

0.009275416206993202

In [29]:
np.average(test_4)

0.009275416206993204

In [50]:
imp_median.fit(vector_3)
test_3 = imp_median.transform(vector_3)
test_3 = test_3.flatten()

Loss_array = []

for i in vector_3_keys:
    value = test_3[i]
    loss = (vector_3_dict[i] - value)**2
    Loss_array.append(loss)
    
imp_median.fit(vector_4)
test_4 = imp_median.transform(vector_4)
test_4 = test_4.flatten()
    
for j in vector_4_keys:
    value = test_4[j]
    loss = (vector_4_dict[j] - value)**2
    Loss_array.append(loss)
    
mse = np.average(Loss_array)
print(mse)

0.0004885148549834864


In [54]:
test_4[300239]

0.0047169811320754715

In [52]:
np.median(test_4)

0.0047169811320754715