In [2]:
# Group: Minh Nguyen & Bach Ha

import numpy as np

training_data = np.genfromtxt("training_data.txt", dtype='str')

In [50]:
from collections import Counter

# print(training_data[0][0][0])
CHAR_DICT = Counter("ONETWOTHREEFOURFIVESIXSEVENEIGHTNINEZERO")
DIGIT_DICT = Counter("0123456789")


def count_letter(string):
    """
    count number of letters in a string, expects only chars in CHAR_DICT.keys()
    :param string: character string as from training_data
    :return: list of ordered character counts
    """
    char_dict_from_string = dict.fromkeys(CHAR_DICT.keys(), 0)

    for character in string:
        char_dict_from_string[character] += 1

    char_count_list = []
    for character in sorted(char_dict_from_string.keys()):
        char_count_list.append(char_dict_from_string[character])

    return char_count_list


def count_digit(string):
    """
    count number of digits in a string, expects only chars in DIGIT_DICT.keys()
    :param string: number string as from training_data
    :return: list of ordered digit counts
    """
    digit_dict_from_string = dict.fromkeys(DIGIT_DICT.keys(), 0)

    for digit in string:
        digit_dict_from_string[digit] += 1

    digit_count_list = []
    for digit in sorted(digit_dict_from_string.keys()):
        digit_count_list.append(digit_dict_from_string[digit])

    # return list(digit_dict_from_string.values())
    return digit_count_list


def get_number_string_from_digit_counts(digit_count_list):
    """
    get a string of digit from list of digit counts
    :param digit_count_list: 
    :return: 
    """
    number_string = ""
    for i in range(len(digit_count_list)):
        for count in range(int(digit_count_list[i])):
            number_string += str(i)

    return number_string


def get_prediction(string_list, estimator):
    """
    
    :param string_list: 
    :param estimator: 
    :return: 
    """
    char_count_all_lists = None
    for string in string_list:
        char_count_list = count_letter(string)
        char_count_list = np.reshape(char_count_list, (1, len(char_count_list)))
        if char_count_all_lists is None:
            char_count_all_lists = char_count_list
        else:
            char_count_all_lists = np.append(char_count_all_lists,
                                             char_count_list, axis=0)

    digit_count_all_lists = estimator.predict(char_count_all_lists)
    number_strings = []
    for digit_count_list in digit_count_all_lists:
        number_strings.append(get_number_string_from_digit_counts(digit_count_list))
    return number_strings


def compare_result(prediction, actual_output):
    """
    
    :param prediction: 
    :param actual_output: 
    :return: 
    """
    for i in range(len(prediction)):
        print("prediction ", get_number_string_from_digit_counts(prediction[i]),
              " actual: ", actual_output[i])

In [46]:
char_count_full = None
digit_count_full = None
for entry in training_data[:10000]:
    digit_count = count_digit(entry[0])
    digit_count = np.reshape(digit_count, (1, len(digit_count)))
    char_count = count_letter(entry[1])
    char_count = np.reshape(char_count, (1, len(char_count)))

    if char_count_full is None:
        char_count_full = char_count
    else:
        char_count_full = np.append(char_count_full, char_count, axis=0)

    if digit_count_full is None:
        digit_count_full = digit_count
    else:
        digit_count_full = np.append(digit_count_full, digit_count, axis=0)

In [47]:
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import RidgeCV

test_size = 20
train_size = len(char_count_full) - test_size

train_set = char_count_full[:train_size]
train_output = digit_count_full[:train_size]
test_set = char_count_full[train_size: train_size + test_size]
test_true_output = digit_count_full[train_size: train_size + test_size]

# Fit estimators
ESTIMATORS = {
    "Extra trees": ExtraTreesRegressor(n_estimators=10, max_features=len(CHAR_DICT.keys()),
                                       random_state=0),
    "K-nn": KNeighborsRegressor(),
    "Linear regression": LinearRegression(),
    "Ridge": RidgeCV(),
}

y_test_predict = dict()
for name, estimator in ESTIMATORS.items():
    estimator.fit(train_set, train_output)
    y_test_predict[name] = np.round(estimator.predict(test_set))

In [51]:
new_train_set = training_data[len(char_count_full): len(char_count_full) + 10]
print(new_train_set[0, 0])
digit_dict_from_string = dict.fromkeys(DIGIT_DICT.keys(), 0)
print(count_digit(new_train_set[0, 0]))
print(get_number_string_from_digit_counts(count_digit(new_train_set[0, 0])))
print(new_train_set[:, 0])
get_prediction(new_train_set[:, 1],
               ESTIMATORS["Extra trees"])


00367777799
[2, 0, 0, 1, 0, 0, 1, 5, 0, 2]
00367777799
['00367777799' '11223666899' '01233566689' '00023455789' '02233467899'
 '24445566789' '0134556777' '02468888999' '23346677779' '01112333679']


['07779',
 '1226689',
 '01235668',
 '00023455789',
 '23389',
 '4455668',
 '1456777',
 '0688899',
 '366777',
 '0112337']

In [52]:
from scipy.spatial.distance import euclidean

for name, prediction in y_test_predict.items():
    error_average = 0.0
    for i in range(len(prediction)):
        error_average += euclidean(prediction[i], test_true_output[i])
    error_average /= len(prediction)
    print("average error for ", name, " is ", error_average, "\n")

average error for  Linear regression  is  0.0 

average error for  K-nn  is  0.786067467587 

average error for  Extra trees  is  0.805317199614 

average error for  Ridge  is  0.0 



In [53]:
training_data_test_portion = training_data[train_size: train_size + test_size]
compare_result(y_test_predict["Linear regression"], training_data_test_portion[:, 0])

prediction  11155677778  actual:  11155677778
prediction  11133446779  actual:  11133446779
prediction  13334677899  actual:  13334677899
prediction  0014467889  actual:  0014467889
prediction  11245567899  actual:  11245567899
prediction  00223356678  actual:  00223356678
prediction  0034455579  actual:  0034455579
prediction  00012444566  actual:  00012444566
prediction  12237778899  actual:  12237778899
prediction  00123334679  actual:  00123334679
prediction  22223555668  actual:  22223555668
prediction  01122338889  actual:  01122338889
prediction  11334467789  actual:  11334467789
prediction  01344456889  actual:  01344456889
prediction  13333478999  actual:  13333478999
prediction  0113445679  actual:  0113445679
prediction  01456778899  actual:  01456778899
prediction  11346777888  actual:  11346777888
prediction  12233334588  actual:  12233334588
prediction  13445566788  actual:  13445566788


In [54]:
compare_result(y_test_predict["Extra trees"], training_data_test_portion[:, 0])

prediction  13556777789  actual:  11155677778
prediction  1133446779  actual:  11133446779
prediction  13334677899  actual:  13334677899
prediction  0014467889  actual:  0014467889
prediction  11245567899  actual:  11245567899
prediction  00223356678  actual:  00223356678
prediction  00134455579  actual:  0034455579
prediction  00023444566  actual:  00012444566
prediction  11223777889  actual:  12237778899
prediction  00123334679  actual:  00123334679
prediction  12225556689  actual:  22223555668
prediction  01122338888  actual:  01122338889
prediction  1334467789  actual:  11334467789
prediction  01344456889  actual:  01344456889
prediction  01333347899  actual:  13333478999
prediction  0113445679  actual:  0113445679
prediction  01456778899  actual:  01456778899
prediction  113467778889  actual:  11346777888
prediction  122333788  actual:  12233334588
prediction  13445566788  actual:  13445566788


In [55]:
compare_result(y_test_predict["K-nn"], training_data_test_portion[:, 0])

prediction  011355677789  actual:  11155677778
prediction  11133446779  actual:  11133446779
prediction  13334677899  actual:  13334677899
prediction  0014467889  actual:  0014467889
prediction  01245567899  actual:  11245567899
prediction  00223356678  actual:  00223356678
prediction  0134455579  actual:  0034455579
prediction  00012444566  actual:  00012444566
prediction  11223778899  actual:  12237778899
prediction  00123346789  actual:  00123334679
prediction  12223555668  actual:  22223555668
prediction  01122338889  actual:  01122338889
prediction  1334467789  actual:  11334467789
prediction  01344456889  actual:  01344456889
prediction  133347899  actual:  13333478999
prediction  0113445679  actual:  0113445679
prediction  01456778899  actual:  01456778899
prediction  1134677788  actual:  11346777888
prediction  223334788  actual:  12233334588
prediction  3445566788  actual:  13445566788


In [56]:
compare_result(y_test_predict["Ridge"], training_data_test_portion[:, 0])

prediction  11155677778  actual:  11155677778
prediction  11133446779  actual:  11133446779
prediction  13334677899  actual:  13334677899
prediction  0014467889  actual:  0014467889
prediction  11245567899  actual:  11245567899
prediction  00223356678  actual:  00223356678
prediction  0034455579  actual:  0034455579
prediction  00012444566  actual:  00012444566
prediction  12237778899  actual:  12237778899
prediction  00123334679  actual:  00123334679
prediction  22223555668  actual:  22223555668
prediction  01122338889  actual:  01122338889
prediction  11334467789  actual:  11334467789
prediction  01344456889  actual:  01344456889
prediction  13333478999  actual:  13333478999
prediction  0113445679  actual:  0113445679
prediction  01456778899  actual:  01456778899
prediction  11346777888  actual:  11346777888
prediction  12233334588  actual:  12233334588
prediction  13445566788  actual:  13445566788
