In [70]:
# Group: Minh Nguyen & Bach Ha

import numpy as np

training_data = np.genfromtxt("lab/training_data.txt", dtype='str')

In [71]:
from collections import Counter

# print(training_data[0][0][0])
CHAR_DICT = Counter("ONETWOTHREEFOURFIVESIXSEVENEIGHTNINEZERO")
DIGIT_DICT = Counter("0123456789")


def count_letter(string):
    """
    
    :param string: 
    :return: 
    """
    char_dict_from_string = Counter(string)

    for character in CHAR_DICT:
        if character not in char_dict_from_string:
            char_dict_from_string[character] = 0

    return list(char_dict_from_string.values())


def count_digit(string):
    """
    
    :param string: 
    :return: 
    """
    digit_dict_from_string = Counter(string)

    for digit in DIGIT_DICT:
        if digit not in digit_dict_from_string:
            digit_dict_from_string[digit] = 0

    return list(digit_dict_from_string.values())


def get_number_from_digit_counts(prediction):
    """
    
    :param prediction: 
    :return: 
    """
    digit_dict = dict()

    index = 0
    for digit in DIGIT_DICT.keys():
        digit_dict[digit] = prediction[index]
        index += 1

    number_string = ""
    for digit in sorted(digit_dict.keys()):
        for count in range(int(digit_dict[digit])):
            number_string += digit

    return number_string


def get_prediction(string, estimator):
    digit_count_list = estimator.predict()
    return


def compare_result(prediction, actual_output):
    """
    
    :param prediction: 
    :param actual_output: 
    :return: 
    """
    for i in range(len(prediction)):
        print("prediction ", get_number_from_prediction(prediction[i]),
              " actual: ", actual_output[i])



In [73]:
char_count_full = None
digit_count_full = None
for entry in training_data[:10000]:
    digit_count = count_digit(entry[0])
    digit_count = np.reshape(digit_count, (1, len(digit_count)))
    char_count = count_letter(entry[1])
    char_count = np.reshape(char_count, (1, len(char_count)))

    if char_count_full is None:
        char_count_full = char_count
    else:
        char_count_full = np.append(char_count_full, char_count, axis=0)

    if digit_count_full is None:
        digit_count_full = digit_count
    else:
        digit_count_full = np.append(digit_count_full, digit_count, axis=0)

In [74]:
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import RidgeCV

test_size = 20
train_size = len(char_count_full) - test_size

train_set = char_count_full[:train_size]
train_output = digit_count_full[:train_size]
test_set = char_count_full[train_size: train_size + test_size]
test_true_output = digit_count_full[train_size: train_size + test_size]

# Fit estimators
ESTIMATORS = {
    "Extra trees": ExtraTreesRegressor(n_estimators=10, max_features=len(CHAR_DICT.keys()),
                                       random_state=0),
    "K-nn": KNeighborsRegressor(),
    "Linear regression": LinearRegression(),
    "Ridge": RidgeCV(),
}

y_test_predict = dict()
for name, estimator in ESTIMATORS.items():
    estimator.fit(train_set, train_output)
    y_test_predict[name] = np.round(estimator.predict(test_set))

In [75]:
from scipy.spatial.distance import euclidean

for name, prediction in y_test_predict.items():
    error_average = 0.0
    for i in range(len(prediction)):
        error_average += euclidean(prediction[i], test_true_output[i])
    error_average /= len(prediction)
    print("average error for ", name, " is ", error_average, "\n")

average error for  K-nn  is  1.8097105505 

average error for  Linear regression  is  2.55874265432 

average error for  Ridge  is  2.55874265432 

average error for  Extra trees  is  1.69166923943 



In [75]:

training_data_test_portion = training_data[train_size: train_size + test_size]
compare_result(y_test_predict["Linear regression"], training_data_test_portion[:, 0])

prediction  02234589999  actual:  02234589999
prediction  02234667899  actual:  02223466799
prediction  00122355889  actual:  00122355889
prediction  00122556889  actual:  00122556889
prediction  01233367789  actual:  01233367789
prediction  22446667899  actual:  22446667899
prediction  01223556899  actual:  01223556899
prediction  12234557789  actual:  12234557789
prediction  02233444689  actual:  03344468889
prediction  01111223456  actual:  01111223456
prediction  01233466789  actual:  01334667889
prediction  12233457889  actual:  12223345789
prediction  01236667899  actual:  01236667899
prediction  01235567899  actual:  01235567899
prediction  01233344668  actual:  01333446688
prediction  00112234578  actual:  00112345788
prediction  01233566779  actual:  01233566779
prediction  12456677789  actual:  14566777889
prediction  01223445668  actual:  01234456688
prediction  01234446778  actual:  01234446778


In [76]:
compare_result(y_test_predict["Extra trees"], training_data_test_portion[:, 0])

prediction  0223458999  actual:  02234589999
prediction  02223466799  actual:  02223466799
prediction  00122355889  actual:  00122355889
prediction  00122556889  actual:  00122556889
prediction  01233367789  actual:  01233367789
prediction  24466678899  actual:  22446667899
prediction  01223556899  actual:  01223556899
prediction  12234557789  actual:  12234557789
prediction  02223344469  actual:  03344468889
prediction  011122346  actual:  01111223456
prediction  01223346679  actual:  01334667889
prediction  12223345789  actual:  12223345789
prediction  01236667899  actual:  01236667899
prediction  01235567899  actual:  01235567899
prediction  0223334466  actual:  01333446688
prediction  00112345788  actual:  00112345788
prediction  01233566779  actual:  01233566779
prediction  12245667779  actual:  14566777889
prediction  01234456688  actual:  01234456688
prediction  01234446778  actual:  01234446778


In [77]:
compare_result(y_test_predict["K-nn"], training_data_test_portion[:, 0])

prediction  01223458999  actual:  02234589999
prediction  02223466799  actual:  02223466799
prediction  00122355889  actual:  00122355889
prediction  00122556889  actual:  00122556889
prediction  01233367789  actual:  01233367789
prediction  12244666789  actual:  22446667899
prediction  01223556899  actual:  01223556899
prediction  12234557789  actual:  12234557789
prediction  02233444569  actual:  03344468889
prediction  0111223456  actual:  01111223456
prediction  01223346679  actual:  01334667889
prediction  12223345789  actual:  12223345789
prediction  01236667899  actual:  01236667899
prediction  01235567899  actual:  01235567899
prediction  0223334466  actual:  01333446688
prediction  00112345788  actual:  00112345788
prediction  01233566779  actual:  01233566779
prediction  12245667779  actual:  14566777889
prediction  01234456688  actual:  01234456688
prediction  0123446778  actual:  01234446778


In [78]:
compare_result(y_test_predict["Ridge"], training_data_test_portion[:, 0])

prediction  02234589999  actual:  02234589999
prediction  02234667899  actual:  02223466799
prediction  00122355889  actual:  00122355889
prediction  00122556889  actual:  00122556889
prediction  01233367789  actual:  01233367789
prediction  22446667899  actual:  22446667899
prediction  01223556899  actual:  01223556899
prediction  12234557789  actual:  12234557789
prediction  02233444689  actual:  03344468889
prediction  01111223456  actual:  01111223456
prediction  01233466789  actual:  01334667889
prediction  12233457889  actual:  12223345789
prediction  01236667899  actual:  01236667899
prediction  01235567899  actual:  01235567899
prediction  01233344668  actual:  01333446688
prediction  00112234578  actual:  00112345788
prediction  01233566779  actual:  01233566779
prediction  12456677789  actual:  14566777889
prediction  01223445668  actual:  01234456688
prediction  01234446778  actual:  01234446778
