In [2]:
# Group: Minh Nguyen & Bach Ha

import numpy as np

training_data = np.genfromtxt("training_data.txt", dtype='str')

In [16]:
from collections import Counter

# print(training_data[0][0][0])
CHAR_DICT = Counter("ONETWOTHREEFOURFIVESIXSEVENEIGHTNINEZERO")
DIGIT_DICT = Counter("0123456789")


def count_letter(string):
    """
    count number of letters in a string, expects only chars in CHAR_DICT.keys()
    NOTE: for some reason returning a dictionary achieve better result than 
    returning a list
    :param string: character string as from training_data
    :return: 
    """
    char_dict_from_string = Counter(string)

    for character in CHAR_DICT:
        if character not in char_dict_from_string:
            char_dict_from_string[character] = 0

    #return list(char_dict_from_string.values())
    return char_dict_from_string


def count_digit(string):
    """
    count number of digits in a string, expects only chars in DIGIT_DICT.keys()
    :param string: number string as from training_data
    :return: 
    """
    digit_dict_from_string = Counter(string)

    for digit in DIGIT_DICT:
        if digit not in digit_dict_from_string:
            digit_dict_from_string[digit] = 0

    #return list(digit_dict_from_string.values())
    return digit_dict_from_string


def get_number_string_from_digit_counts(prediction):
    """
    
    :param prediction: 
    :return: 
    """
    digit_dict = dict()

    index = 0
    for digit in DIGIT_DICT.keys():
        digit_dict[digit] = prediction[index]
        index += 1

    number_string = ""
    for digit in sorted(digit_dict.keys()):
        for count in range(int(digit_dict[digit])):
            number_string += digit

    return number_string


def get_prediction(string_list, estimator):
    """
    
    :param string_list: 
    :param estimator: 
    :return: 
    """
    char_count_all_lists = None
    for string in string_list:
        char_count_list = list(count_letter(string).values())
        char_count_list = np.reshape(char_count_list, (1, len(char_count_list)))
        if char_count_all_lists is None:
            char_count_all_lists = char_count_list
        else:
            char_count_all_lists = np.append(char_count_all_lists,
                                             char_count_list, axis=0)

    digit_count_all_lists = estimator.predict(char_count_all_lists)
    number_strings = []
    for digit_count_list in digit_count_all_lists:
        number_strings.append(get_number_string_from_digit_counts(digit_count_list))
    return number_strings


def compare_result(prediction, actual_output):
    """
    
    :param prediction: 
    :param actual_output: 
    :return: 
    """
    for i in range(len(prediction)):
        print("prediction ", get_number_string_from_digit_counts(prediction[i]),
              " actual: ", actual_output[i])

In [9]:
char_count_full = None
digit_count_full = None
for entry in training_data[:50000]:
    digit_count = list(count_digit(entry[0]).values())
    digit_count = np.reshape(digit_count, (1, len(digit_count)))
    char_count = list(count_letter(entry[1]).values())
    char_count = np.reshape(char_count, (1, len(char_count)))

    if char_count_full is None:
        char_count_full = char_count
    else:
        char_count_full = np.append(char_count_full, char_count, axis=0)

    if digit_count_full is None:
        digit_count_full = digit_count
    else:
        digit_count_full = np.append(digit_count_full, digit_count, axis=0)

In [10]:
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import RidgeCV

test_size = 20
train_size = len(char_count_full) - test_size

train_set = char_count_full[:train_size]
train_output = digit_count_full[:train_size]
test_set = char_count_full[train_size: train_size + test_size]
test_true_output = digit_count_full[train_size: train_size + test_size]

# Fit estimators
ESTIMATORS = {
    "Extra trees": ExtraTreesRegressor(n_estimators=10, max_features=len(CHAR_DICT.keys()),
                                       random_state=0),
    "K-nn": KNeighborsRegressor(),
    "Linear regression": LinearRegression(),
    "Ridge": RidgeCV(),
}

y_test_predict = dict()
for name, estimator in ESTIMATORS.items():
    estimator.fit(train_set, train_output)
    y_test_predict[name] = np.round(estimator.predict(test_set))

In [11]:
from scipy.spatial.distance import euclidean

for name, prediction in y_test_predict.items():
    error_average = 0.0
    for i in range(len(prediction)):
        error_average += euclidean(prediction[i], test_true_output[i])
    error_average /= len(prediction)
    print("average error for ", name, " is ", error_average, "\n")

average error for  Linear regression  is  2.17588794126 

average error for  K-nn  is  1.25694789398 

average error for  Extra trees  is  0.752287965328 

average error for  Ridge  is  2.17588794126 



In [17]:
training_data_test_portion = training_data[train_size: train_size + test_size]
compare_result(y_test_predict["Linear regression"], training_data_test_portion[:, 0])

prediction  001335567788  actual:  01333567778
prediction  1223456889  actual:  11234556789
prediction  11334566789  actual:  11335556679
prediction  02235567889  actual:  01257778889
prediction  11334566889  actual:  11355667899
prediction  00234556789  actual:  00124577789
prediction  12344467889  actual:  01244455899
prediction  0223345678  actual:  01223345788
prediction  0123556789  actual:  01122556779
prediction  01123456689  actual:  01235556679
prediction  001234456689  actual:  00011345669
prediction  12234556789  actual:  11222345677
prediction  11233568899  actual:  11125679999
prediction  0013346789  actual:  0011123459
prediction  002233566788  actual:  00223678899
prediction  23345567889  actual:  13345577889
prediction  11234456689  actual:  12234555667
prediction  112344566689  actual:  12345556669
prediction  0123467889  actual:  00112455688
prediction  01123345668  actual:  0233456678


In [15]:
compare_result(y_test_predict["Extra trees"], training_data_test_portion[:, 0])

prediction  0133356778  actual:  01333567778
prediction  11234556778  actual:  11234556789
prediction  11133566899  actual:  11335556679
prediction  02223555789  actual:  01257778889
prediction  11344556678  actual:  11355667899
prediction  00245557889  actual:  00124577789
prediction  02334447889  actual:  01244455899
prediction  02233455778  actual:  01223345788
prediction  01122355699  actual:  01122556779
prediction  01112356689  actual:  01235556679
prediction  00011346689  actual:  00011345669
prediction  11222345569  actual:  11222345677
prediction  1233335699  actual:  11125679999
prediction  0034667889  actual:  0011123459
prediction  00122346788  actual:  00223678899
prediction  22334556889  actual:  13345577889
prediction  1122456699  actual:  12234555667
prediction  12346668999  actual:  12345556669
prediction  00123468899  actual:  00112455688
prediction  0123346678  actual:  0233456678


In [77]:
compare_result(y_test_predict["K-nn"], training_data_test_portion[:, 0])

prediction  01223458999  actual:  02234589999
prediction  02223466799  actual:  02223466799
prediction  00122355889  actual:  00122355889
prediction  00122556889  actual:  00122556889
prediction  01233367789  actual:  01233367789
prediction  12244666789  actual:  22446667899
prediction  01223556899  actual:  01223556899
prediction  12234557789  actual:  12234557789
prediction  02233444569  actual:  03344468889
prediction  0111223456  actual:  01111223456
prediction  01223346679  actual:  01334667889
prediction  12223345789  actual:  12223345789
prediction  01236667899  actual:  01236667899
prediction  01235567899  actual:  01235567899
prediction  0223334466  actual:  01333446688
prediction  00112345788  actual:  00112345788
prediction  01233566779  actual:  01233566779
prediction  12245667779  actual:  14566777889
prediction  01234456688  actual:  01234456688
prediction  0123446778  actual:  01234446778


In [78]:
compare_result(y_test_predict["Ridge"], training_data_test_portion[:, 0])

prediction  02234589999  actual:  02234589999
prediction  02234667899  actual:  02223466799
prediction  00122355889  actual:  00122355889
prediction  00122556889  actual:  00122556889
prediction  01233367789  actual:  01233367789
prediction  22446667899  actual:  22446667899
prediction  01223556899  actual:  01223556899
prediction  12234557789  actual:  12234557789
prediction  02233444689  actual:  03344468889
prediction  01111223456  actual:  01111223456
prediction  01233466789  actual:  01334667889
prediction  12233457889  actual:  12223345789
prediction  01236667899  actual:  01236667899
prediction  01235567899  actual:  01235567899
prediction  01233344668  actual:  01333446688
prediction  00112234578  actual:  00112345788
prediction  01233566779  actual:  01233566779
prediction  12456677789  actual:  14566777889
prediction  01223445668  actual:  01234456688
prediction  01234446778  actual:  01234446778
