# Font recognition - improved models with PCA


## Load data and train-validation split

**Data is loaded, observed and treated**

In [13]:
import numpy as np
import pandas as pd

# Load the training features and their font labels from the local data/ folder.
train_data = pd.read_csv(filepath_or_buffer='data/train_data.csv')
train_labels = pd.read_csv(filepath_or_buffer='data/train_labels.csv')


Labels are factorized (each font name is mapped to an integer code), and a full DataFrame is built by appending the encoded values as the last column.

In [14]:
# Map every distinct font name to an integer code; `unique_labels` keeps the
# code -> font-name lookup needed to decode predictions later.
label_encoded, unique_labels = train_labels['Font'].factorize()
labels = pd.DataFrame({'label': label_encoded})
# Features first, encoded label as the last column.
df = pd.concat([train_data, labels], axis=1)

**Train and validation split is conducted**

In [15]:
from sklearn.model_selection import train_test_split
# Every column except the last is a feature; the last column is the encoded label.
X, Y = df.iloc[:, :-1], df.iloc[:, -1]
# Hold out 30% of the rows for validation; the fixed seed keeps the split reproducible.
x_train_df, x_valid_df, y_train_df, y_valid_df = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=0,
)

**Finally, the test data is loaded as well**

In [16]:
# The test set has no label column, so the loaded frame is used directly as the
# test feature matrix (both names refer to the same DataFrame object).
x_test_df = test_data = pd.read_csv('data/test_data.csv')

## Normalization of data

The train, validation and test splits now hold all the needed information. They are converted to NumPy arrays for easier handling with scikit-learn.

In [17]:
# Feature matrices of the three splits, still un-normalized.
x_train_pre_norm, x_valid_pre_norm, x_test_pre_norm = (
    np.array(frame) for frame in (x_train_df, x_valid_df, x_test_df)
)
# Encoded labels of the training and validation splits.
y_train, y_valid = np.array(y_train_df), np.array(y_valid_df)

# All labelled rows (training + validation) as a single feature matrix.
X_np = np.array(X)

`mean` and `std` are computed per column from the full labelled dataset (training and validation rows together)

In [18]:
# Per-column statistics over every labelled row (training and validation rows
# together); they are used below to standardize all splits.
# NOTE(review): validation rows contribute to these statistics, so the
# validation score is not fully independent of the scaling step -- consider
# computing them from x_train_pre_norm only.
mean = X_np.sum(axis=0) / len(X_np)
std = X_np.std(axis=0)

Implement normalization function from Homework 9

In [19]:
def normalize(X, mean, std):
    """Standardize ``X`` column-wise as ``(X - mean) / std``.

    Parameters
    ----------
    X : array-like
        Data to normalize; ``mean`` and ``std`` are broadcast against it.
    mean : array-like or scalar
        Value subtracted from ``X`` (one entry per column in this notebook).
    std : array-like or scalar
        Value the centered data is divided by. Entries equal to zero are
        treated as 1 so that the division never produces NaN or inf.

    Returns
    -------
    The centered and scaled data.
    """
    std = np.asarray(std)
    # A zero std means the column is constant. Dividing by it would yield
    # NaN/inf, which downstream estimators reject, so divide by 1 instead and
    # keep the centered value as is.
    safe_std = np.where(std == 0, 1, std)
    return (X - mean) / safe_std

In [20]:
# Standardize the three splits with the same column statistics.
x_train, x_valid, x_test = (
    normalize(split, mean, std)
    for split in (x_train_pre_norm, x_valid_pre_norm, x_test_pre_norm)
)

## Function to save submission csv

A function will be created that saves predictions as a csv with the correct format

In [21]:
def predictions_as_csv(y_pred, file_name, expected_len=29221,
                       path="submissions/", labels=None):
    """Write predicted label codes to ``<path>/<file_name>.csv`` as a submission.

    The file has two columns: ``ID`` (1-based row number) and ``Font`` (the
    label looked up from the predicted integer code).

    Parameters
    ----------
    y_pred : sequence of int
        Predicted label codes, one per test row.
    file_name : str
        Output file name without the ``.csv`` extension.
    expected_len : int, optional
        Number of predictions a valid submission must contain. Defaults to
        29221, the value previously hard-coded in this function.
    path : str, optional
        Output directory; it is created if it does not exist.
    labels : array-like, optional
        Lookup from integer code to label. Defaults to the notebook-level
        ``unique_labels``.

    Returns
    -------
    int
        1 if the file was written, 0 if ``y_pred`` has the wrong length (in
        which case nothing is written).
    """
    import os

    # Wrong number of predictions: report failure without touching the disk.
    if len(y_pred) != expected_len:
        return 0

    if labels is None:
        labels = unique_labels

    # to_csv does not create missing directories, so make sure the target exists.
    if path:
        os.makedirs(path, exist_ok=True)

    ids = np.arange(1, len(y_pred) + 1)
    pred_label = np.asarray(labels)[np.asarray(y_pred)]
    submission = pd.DataFrame({'ID': ids, 'Font': pred_label})
    submission.to_csv(os.path.join(path, file_name + ".csv"), index=False)
    return 1

## PCA before applying neural network

In [22]:
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import hamming_loss

In [23]:
from sklearn.decomposition import PCA

# PCA with default settings keeps all components.
pca = PCA()
# Start from the standardized pre-PCA arrays instead of overwriting x_train /
# x_valid with a transform of themselves: re-running this cell then gives the
# same result instead of refitting PCA on already-projected data.
x_train = pca.fit_transform(normalize(x_train_pre_norm, mean, std))
x_valid = pca.transform(normalize(x_valid_pre_norm, mean, std))
# Note: x_test is not projected in this cell; pass it through pca.transform
# before giving it to a model trained on the projected features.

## Neural network alpha = 0.5

In [24]:
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import hamming_loss

In [25]:
# MLP with logistic activations and L2 penalty alpha = 0.5; the fixed seed
# keeps the weight initialization reproducible.
model_nNetwork = MLPClassifier(
    activation='logistic',
    alpha=0.5,
    max_iter=300,
    random_state=1,
)
model_nNetwork.fit(x_train, y_train)

MLPClassifier(activation='logistic', alpha=0.5, max_iter=300, random_state=1)

In [26]:
# Fraction of training samples whose predicted label differs from the true one.
y_pred_train = model_nNetwork.predict(x_train)
error = hamming_loss(y_true=y_train, y_pred=y_pred_train)
print('The training error is: ', error, '.', sep='')

The training error is: 0.30257142857142855.


In [27]:
# Fraction of validation samples whose predicted label differs from the true one.
y_pred_valid = model_nNetwork.predict(x_valid)
error = hamming_loss(y_true=y_valid, y_pred=y_pred_valid)
print('The validation error is: ', error, '.', sep='')

The validation error is: 0.36938461538461537.


In [28]:
# Expected score = 1 - validation error. It is recomputed from the validation
# predictions so the value does not depend on which cell last assigned the
# shared `error` variable (the training-error cell overwrites it too).
predicted_score = 1 - hamming_loss(y_valid, y_pred_valid)
predicted_score

0.6306153846153846

## Neural network: deeper ReLU model, alpha = 1.2

In [64]:
# Deeper MLP (four hidden layers) with ReLU activations and a stronger
# L2 penalty (alpha = 1.2); the fixed seed keeps initialization reproducible.
model_nNetwork = MLPClassifier(
    hidden_layer_sizes=(200, 100, 100, 50),
    activation='relu',
    alpha=1.2,
    learning_rate='adaptive',
    max_iter=1000,
    random_state=1,
)
model_nNetwork.fit(x_train, y_train)

MLPClassifier(alpha=1.2, hidden_layer_sizes=(200, 100, 100, 50),
              learning_rate='adaptive', max_iter=1000, random_state=1)

In [65]:
# Fraction of training samples whose predicted label differs from the true one.
y_pred_train = model_nNetwork.predict(x_train)
error = hamming_loss(y_true=y_train, y_pred=y_pred_train)
print('The training error is: ', error, '.', sep='')

The training error is: 0.14516483516483517.


In [66]:
# Fraction of validation samples whose predicted label differs from the true one.
y_pred_valid = model_nNetwork.predict(x_valid)
error = hamming_loss(y_true=y_valid, y_pred=y_pred_valid)
print('The validation error is: ', error, '.', sep='')

The validation error is: 0.2641025641025641.


In [67]:
# Expected score = 1 - validation error. It is recomputed from the validation
# predictions so the value does not depend on which cell last assigned the
# shared `error` variable (the training-error cell overwrites it too).
predicted_score = 1 - hamming_loss(y_valid, y_pred_valid)
predicted_score

0.735897435897436