# Font recognition - improved models


## Load data and train-validation split

**Data is loaded, observed and treated**

In [1]:
import numpy as np
import pandas as pd

train_data = pd.read_csv('data/train_data.csv')
train_labels = pd.read_csv('data/train_labels.csv')


Labels are factorized and a full dataframe is constructed adding the encoded values as the last column

In [2]:
label_encoded, unique_labels = pd.factorize(train_labels['Font'])
labels = pd.DataFrame(label_encoded, columns=['label'])
df = pd.concat([train_data, labels], axis = 1)

**Train and validation split is conducted**

In [3]:
from sklearn.model_selection import train_test_split
X = df.iloc[:, :-1]
Y = df.iloc[:, -1]
x_train_df, x_valid_df, y_train_df, y_valid_df = train_test_split(X, Y, test_size=0.3, random_state = 0)

**Finally, the test data is loaded as well**

In [4]:
test_data = pd.read_csv('data/test_data.csv')
x_test_df = test_data

## Normalization of data

Now df has all the needed information. It will be transformed to a np.array for easier treatment within sklearn package

In [5]:
x_train_pre_norm = np.array(x_train_df)
x_valid_pre_norm = np.array(x_valid_df)
y_train = np.array(y_train_df)
y_valid = np.array(y_valid_df)
x_test_pre_norm = np.array(x_test_df)

X_np = np.array(X)

`mean` and `std` are obtained from full dataset

In [6]:
mean = np.sum(X_np, axis = 0) / X_np.shape[0]
std = np.std(X_np, axis = 0)

Implement normalization function from Homework 9

In [7]:
def normalize(X, mean, std):
    """Normalizes a given array X by columns 
    with the mean and std"""
    X_out = np.zeros(X.shape)
    X_out = (X - mean)/std
    return X_out 

In [8]:
x_train = normalize(x_train_pre_norm, mean, std)
x_valid = normalize(x_valid_pre_norm, mean, std)
x_test = normalize(x_test_pre_norm, mean, std)

## Function to save submission csv

A function will be created that saves predictions as a csv with the correct format

In [9]:
def predictions_as_csv(y_pred, file_name):
    path = "submissions/"
    status = 0
    if len(y_pred) == 29221:
        ids = np.arange(1,len(y_pred)+1,1)
        pred_label = unique_labels[y_pred]
        data = {'ID':ids, 'Font':pred_label} 
        submission = pd.DataFrame(data)
        submission.to_csv(path + file_name + ".csv", index = False)
        status = 1
    
    return status

## Neural network

In [10]:
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import hamming_loss

In [11]:
model_nNetwork = MLPClassifier(random_state=1, max_iter=300)
model_nNetwork.fit(x_train, y_train)

MLPClassifier(max_iter=300, random_state=1)

In [12]:
y_pred_train = model_nNetwork.predict(x_train)
error = hamming_loss(y_train, y_pred_train)
print('The training error is: ' + str(error) + '.')

The training error is: 0.04342857142857143.


In [13]:
y_pred_valid = model_nNetwork.predict(x_valid)
error = hamming_loss(y_valid, y_pred_valid)
print('The validation error is: ' + str(error) + '.')

The validation error is: 0.3304102564102564.


In [14]:
predicted_score = 1- error
predicted_score

0.6695897435897435

**Predictions with test set are computed**

In [15]:
y_pred_test = model_nNetwork.predict(x_test)

In [16]:
predictions_as_csv(y_pred_test, "nNetwork_prediction")

1

## Neural network alpha = 0.5

In [17]:
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import hamming_loss

In [18]:
model_nNetwork = MLPClassifier(random_state=1, max_iter=300, alpha = 0.5, activation = 'logistic')
model_nNetwork.fit(x_train, y_train)

MLPClassifier(activation='logistic', alpha=0.5, max_iter=300, random_state=1)

In [19]:
y_pred_train = model_nNetwork.predict(x_train)
error = hamming_loss(y_train, y_pred_train)
print('The training error is: ' + str(error) + '.')

The training error is: 0.320989010989011.


In [20]:
y_pred_valid = model_nNetwork.predict(x_valid)
error = hamming_loss(y_valid, y_pred_valid)
print('The validation error is: ' + str(error) + '.')

The validation error is: 0.38271794871794873.


In [21]:
predicted_score = 1- error
predicted_score

0.6172820512820513

**Predictions with test set are computed**

In [22]:
y_pred_test = model_nNetwork.predict(x_test)

In [23]:
predictions_as_csv(y_pred_test, "nNetwork_prediction2")

1

## Neural network varying params

A variation of parameters is tried to see f performance can be increased. It takes too much time to try out all compbinations so several of them were tried and results are shown below.

In [24]:
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import hamming_loss

In [25]:
activation_functions = ['logistic', 'tanh', 'relu', 'identity']
solvers = ['adam']
learning_rates = ['adaptive']
alphas = np.linspace(start=1, stop =10, num = 10)

In [26]:
rows_NN_performance = []
columns = ['activation_function', 'solver', 'learning_rate', 'alpha', 'train_error', 'test_error', 'predicted_score']


In [27]:
for activation_function in activation_functions:
    print("\n=============")
    print("Activation function")
    print(activation_function)
    for solver in solvers:
        print("\n======")
        print("Solver")
        print(solver)
        
        for learning_rate in learning_rates:
            print("=")
            print("L rate")
            print(learning_rate)
            
            for alpha in alphas:
                print(alpha)
                model_nNetwork = MLPClassifier(random_state=1, max_iter=600, alpha = alpha, activation = activation_function, solver = solver)
                model_nNetwork.fit(x_train, y_train)
                
                y_pred_train = model_nNetwork.predict(x_train)
                train_error = hamming_loss(y_train, y_pred_train)
                
                y_pred_valid = model_nNetwork.predict(x_valid)
                test_error = hamming_loss(y_valid, y_pred_valid)
                
                predicted_score = 1- test_error
                predicted_score
                
                rows_NN_performance.append(dict(zip(columns,[activation_function, solver, learning_rate, alpha, train_error, test_error, predicted_score ])))                

df = pd.DataFrame(rows_NN_performance)                


Activation function
logistic

Solver
adam
=
L rate
adaptive
1.0
2.0
3.0
4.0
5.0
6.0
7.0
8.0
9.0
10.0

Activation function
tanh

Solver
adam
=
L rate
adaptive
1.0
2.0
3.0
4.0
5.0
6.0
7.0
8.0
9.0
10.0

Activation function
relu

Solver
adam
=
L rate
adaptive
1.0
2.0
3.0
4.0
5.0
6.0
7.0
8.0
9.0
10.0

Activation function
identity

Solver
adam
=
L rate
adaptive
1.0
2.0
3.0
4.0
5.0
6.0
7.0
8.0
9.0
10.0


In [28]:
df

Unnamed: 0,activation_function,solver,learning_rate,alpha,train_error,test_error,predicted_score
0,logistic,adam,adaptive,1.0,0.435033,0.463846,0.536154
1,logistic,adam,adaptive,2.0,0.529516,0.543949,0.456051
2,logistic,adam,adaptive,3.0,0.57611,0.584051,0.415949
3,logistic,adam,adaptive,4.0,0.597077,0.607846,0.392154
4,logistic,adam,adaptive,5.0,0.60989,0.619385,0.380615
5,logistic,adam,adaptive,6.0,0.621099,0.629436,0.370564
6,logistic,adam,adaptive,7.0,0.632066,0.637692,0.362308
7,logistic,adam,adaptive,8.0,0.64633,0.650615,0.349385
8,logistic,adam,adaptive,9.0,0.674066,0.678974,0.321026
9,logistic,adam,adaptive,10.0,0.685319,0.689077,0.310923


## Neural network more hidden layers

In [29]:
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import hamming_loss

In [30]:
model_nNetwork = MLPClassifier(random_state=1, max_iter=500, alpha = 0.5, activation = 'logistic',  hidden_layer_sizes=(10, 6))
model_nNetwork.fit(x_train, y_train)

MLPClassifier(activation='logistic', alpha=0.5, hidden_layer_sizes=(10, 6),
              max_iter=500, random_state=1)

In [31]:
y_pred_train = model_nNetwork.predict(x_train)
error = hamming_loss(y_train, y_pred_train)
print('The training error is: ' + str(error) + '.')

The training error is: 0.4857582417582418.


In [32]:
y_pred_valid = model_nNetwork.predict(x_valid)
error = hamming_loss(y_valid, y_pred_valid)
print('The validation error is: ' + str(error) + '.')

The validation error is: 0.5049743589743589.


In [33]:
predicted_score = 1- error
predicted_score

0.49502564102564106

## Random search

In [34]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import hamming_loss
from scipy.stats import uniform

In [35]:
activation_functions = ['logistic', 'tanh', 'relu', 'identity']
solvers = ['adam','lbfgs', 'sgd']
learning_rates = ['adaptive', 'invscaling']
alphas = np.linspace(start=0.1, stop =3, num = 10).tolist()

In [36]:
distributions = {'activation':('logistic', 'tanh', 'relu', 'identity'), 'solver':('adam','lbfgs', 'sgd'), 'learning_rate':('adaptive', 'invscaling'), 'alpha':alphas}

In [37]:
neural_NN = MLPClassifier(max_iter=1000, random_state = 1)

In [38]:
#clf = GridSearchCV(neural_NN, parameters)
clf = RandomizedSearchCV(neural_NN, distributions)

In [39]:
clf.fit(x_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("

RandomizedSearchCV(estimator=MLPClassifier(max_iter=1000, random_state=1),
                   param_distributions={'activation': ('logistic', 'tanh',
                                                       'relu', 'identity'),
                                        'alpha': [0.1, 0.42222222222222217,
                                                  0.7444444444444444,
                                                  1.0666666666666667,
                                                  1.3888888888888888,
                                                  1.711111111111111,
                                                  2.033333333333333,
                                                  2.3555555555555556,
                                                  2.6777777777777776, 3.0],
                                        'learning_rate': ('adaptive',
                                                          'invscaling'),
                                        'solver': ('adam', 'l

In [40]:
clf.best_params_

{'solver': 'sgd',
 'learning_rate': 'adaptive',
 'alpha': 1.0666666666666667,
 'activation': 'relu'}

In [41]:
y_pred_train = clf.predict(x_train)
error = hamming_loss(y_train, y_pred_train)
print('The training error is: ' + str(error) + '.')

The training error is: 0.20125274725274725.


In [42]:
y_pred_valid = clf.predict(x_valid)
error = hamming_loss(y_valid, y_pred_valid)
print('The validation error is: ' + str(error) + '.')

The validation error is: 0.32743589743589746.


In [43]:
predicted_score = 1- error
predicted_score

0.6725641025641025

In [44]:
sorted(clf.cv_results_.keys())

['mean_fit_time',
 'mean_score_time',
 'mean_test_score',
 'param_activation',
 'param_alpha',
 'param_learning_rate',
 'param_solver',
 'params',
 'rank_test_score',
 'split0_test_score',
 'split1_test_score',
 'split2_test_score',
 'split3_test_score',
 'split4_test_score',
 'std_fit_time',
 'std_score_time',
 'std_test_score']

## Deep neural network

In [45]:
model_nNetwork = MLPClassifier(random_state=1, max_iter=1000, alpha = 1.38, activation = 'relu', learning_rate = 'adaptive', solver = 'sgd', hidden_layer_sizes=(200,100,100,50) )
model_nNetwork.fit(x_train, y_train)

y_pred_train = model_nNetwork.predict(x_train)
error = hamming_loss(y_train, y_pred_train)
print('The training error is: ' + str(error) + '.')

y_pred_valid = model_nNetwork.predict(x_valid)
error = hamming_loss(y_valid, y_pred_valid)
print('The validation error is: ' + str(error) + '.')

predicted_score = 1- error
predicted_score

The training error is: 0.022395604395604396.
The validation error is: 0.2753333333333333.


0.7246666666666667