In [1]:
import numpy as np

# Data Extraction
import pandas as pd

# Machine Learning
import tensorflow as tf
import sklearn

# 2. Implementation <a class="anchor" id="implementation"></a>

## 2.2. Fetch Clean Data

In [2]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
def one_hot(array):
    
    print("> Transforming labels into one-hot vectors...")
    
    onehot_encoder = OneHotEncoder(sparse=False)
    
    # assuming array has already been transformed into integer encodings
    # now, convert to binary (one-hot)
    array = array.reshape(len(array), 1)
    results = onehot_encoder.fit_transform(array)
            
    return results

In [3]:
clean_data_path = "../dataset/clean_data.csv"

In [4]:
from sklearn.model_selection import train_test_split
def split_data(data, labels, train_perc):
    
    test_perc = round(1-train_perc, 2)
    x_train, x_test, y_train, y_test = train_test_split(data, labels, train_size=train_perc, test_size=test_perc, random_state=42)

    return x_train, x_test, y_train, y_test

In [5]:
df = pd.read_csv(clean_data_path, sep=',', encoding='ISO-8859-1', header=None)
clean_data = np.array(df)

# get rid of rows containing "nan" in clean data file
rows_to_delete = []
for i, row in enumerate(clean_data):
    for j, val in enumerate(row):
        if (str(row[j]).strip() == 'nan'):
            print("> Deleting row: " + str(row))
            rows_to_delete.append(i)
            break
clean_data = np.delete(clean_data, rows_to_delete, 0)

# don't include the last column; where the labels are
data = (clean_data[:,:-1])

# reshape from (m,) to (m,1), then convert into one-hot vector (m,k)
y = one_hot((clean_data[:,-1]).reshape((-1, 1)))
print("\n  data matrix shape: " + str(data.shape))
print("  labels (y) shape: " + str(y.shape) + '\n')

train_perc = .7 # percentage of total data used for training
x_train, x_test, y_train, y_test = split_data(data, y, train_perc) # randomly splitting up the data
m = x_train.shape[0] # number of tuples for training
n = data.shape[1] # number of features
k = len(y[0]) # number of classes

print("> m (training samples) = " + str(m) + "\n> n (num. features)= " + str(n) + "\n> k (num. classes) = " + str(k))

> Transforming labels into one-hot vectors...

  data matrix shape: (10, 5)
  labels (y) shape: (10, 5)

> m (training samples) = 7
> n (num. features)= 5
> k (num. classes) = 5


In [6]:
y_rand = one_hot(np.floor(np.random.rand(len(y_test),1)*5).astype(int))
print("\n  y_rand shape: " + str(y_rand.shape))

> Transforming labels into one-hot vectors...

  y_rand shape: (3, 2)
