In [57]:
# Group: Minh Nguyen & Bach Ha

import numpy as np

# Load the training file with every field read as a string (dtype='str').
# Later cells index each row as entry[0] -> digit string, entry[1] -> letter string.
training_data = np.genfromtxt("training_data.txt", dtype='str')

In [81]:
from collections import Counter

# Alphabets used as feature/target axes. CHAR_DICT holds the 15 distinct letters
# that occur in the English digit names; DIGIT_DICT holds the ten digit characters.
# Only the keys of these counters are read elsewhere in the visible code.
CHAR_DICT = Counter("".join((
    "ONE", "TWO", "THREE", "FOUR", "FIVE",
    "SIX", "SEVEN", "EIGHT", "NINE", "ZERO",
)))
DIGIT_DICT = Counter("".join(str(digit) for digit in range(10)))


def count_letter(string):
    """
    tally how often each letter of CHAR_DICT occurs in the given string
    :param string: character string as from training_data; every character
                   must be a key of CHAR_DICT (anything else raises KeyError)
    :return: list of counts, one per CHAR_DICT key, in sorted key order
    """
    # start every known letter at zero so absent letters still get a slot
    tally = {letter: 0 for letter in CHAR_DICT.keys()}

    for symbol in string:
        tally[symbol] += 1

    return [tally[letter] for letter in sorted(tally)]


def count_digit(string):
    """
    tally how often each digit of DIGIT_DICT occurs in the given string
    :param string: number string as from training_data; every character
                   must be a key of DIGIT_DICT (anything else raises KeyError)
    :return: list of counts, one per DIGIT_DICT key, in sorted key order
    """
    # start every known digit at zero so absent digits still get a slot
    tally = {symbol: 0 for symbol in DIGIT_DICT.keys()}

    for symbol in string:
        tally[symbol] += 1

    return [tally[symbol] for symbol in sorted(tally)]


def get_number_string_from_digit_counts(digit_count_list):
    """
    expand a list of per-digit counts back into a number string
    :param digit_count_list: list of ordered digit counts; position i holds
                             how many times str(i) should appear
    :return: number string as from training_data (digits in ascending order)
    """
    # each count is truncated with int(); zero or negative counts add nothing
    return "".join(
        str(position) * int(count)
        for position, count in enumerate(digit_count_list)
    )


def get_prediction(string_list, estimator):
    """
    run predict() on the given strings using the given estimator,
    then convert each predicted list of digit counts to a number string
    :param string_list: iterable of character strings as from training_data
    :param estimator: object with a predict() function that accepts a 2-D
                      array of letter counts (one row per input string)
    :return: list of number strings as from training_data; an empty list
             when string_list is empty (predict() is not called in that case)
    """
    # Build all feature rows first and convert once; growing an array with
    # np.append inside the loop copies the whole matrix on every iteration.
    feature_rows = [count_letter(string) for string in string_list]
    if not feature_rows:
        # nothing to predict -- previously this handed None to predict()
        return []

    char_count_all_lists = np.array(feature_rows)

    # predictions are real-valued; round to the nearest whole digit count
    digit_count_all_lists = np.round(estimator.predict(char_count_all_lists))

    return [get_number_string_from_digit_counts(digit_count_list)
            for digit_count_list in digit_count_all_lists]



In [88]:
# Build the feature matrix (letter counts) and target matrix (digit counts)
# from the first 10000 rows of training_data, one matrix row per data row.
# Rows are collected in plain lists and converted once at the end: calling
# np.append inside the loop re-copies the whole matrix on every iteration.
char_rows = []
digit_rows = []
for entry in training_data[:10000]:
    digit_rows.append(count_digit(entry[0]))   # column 0: digit string
    char_rows.append(count_letter(entry[1]))   # column 1: letter string

# keep the original "no data -> None" result when the slice is empty
char_count_full = np.array(char_rows) if char_rows else None
digit_count_full = np.array(digit_rows) if digit_rows else None

In [89]:
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import RidgeCV

# Hold out the last `test_size` rows for evaluation; all earlier rows are
# used for fitting.
test_size = 20
train_size = len(char_count_full) - test_size

train_set, test_set = (char_count_full[:train_size],
                       char_count_full[train_size: train_size + test_size])
train_output, test_true_output = (digit_count_full[:train_size],
                                  digit_count_full[train_size: train_size + test_size])

# Candidate regressors, keyed by the name used when reporting results
ESTIMATORS = dict()
ESTIMATORS["Extra trees"] = ExtraTreesRegressor(
    n_estimators=10,
    max_features=len(CHAR_DICT),  # one feature per distinct letter
    random_state=0,
)
ESTIMATORS["K-nn"] = KNeighborsRegressor()
ESTIMATORS["Linear regression"] = LinearRegression()
ESTIMATORS["Ridge"] = RidgeCV()

# Fit every estimator, then store its rounded predictions on the held-out rows
y_test_predict = {}
for name, estimator in ESTIMATORS.items():
    estimator.fit(train_set, train_output)
    raw_prediction = estimator.predict(test_set)
    y_test_predict[name] = np.round(raw_prediction)


In [90]:
from scipy.spatial.distance import euclidean

# For each estimator, print the mean Euclidean distance between every rounded
# prediction row and the true digit-count row at the same position.
for name, prediction in y_test_predict.items():
    distance_sum = sum(
        (euclidean(predicted_row, test_true_output[row_index])
         for row_index, predicted_row in enumerate(prediction)),
        0.0,
    )
    error_average = distance_sum / len(prediction)
    print("average error for ", name, " is ", error_average, "\n")

average error for  Linear regression  is  0.0 

average error for  K-nn  is  0.786067467587 

average error for  Extra trees  is  0.805317199614 

average error for  Ridge  is  0.0 



In [91]:
new_test_size = 10
# Despite its name this is a held-out sample: the rows of training_data that
# come right after the ones used to build char_count_full.
new_train_set = training_data[len(char_count_full):][:new_test_size]

tree_prediction = get_prediction(new_train_set[:, 1], ESTIMATORS["Extra trees"])

# show true digit string vs. the extra-trees prediction, row by row
for i in range(new_test_size):
    actual = new_train_set[i, 0]
    predicted = tree_prediction[i]
    print("actual numbers: %20s\nprediction    : %20s\n" % (actual, predicted))


actual numbers:          00367777799
prediction    :         001356777789

actual numbers:          11223666899
prediction    :           1223666899

actual numbers:          01233566689
prediction    :          01233566689

actual numbers:          00023455789
prediction    :          00023455789

actual numbers:          02233467899
prediction    :          01223346789

actual numbers:          24445566789
prediction    :           4445566789

actual numbers:           0134556777
prediction    :          01345567779

actual numbers:          02468888999
prediction    :            026888899

actual numbers:          23346677779
prediction    :           1235667777

actual numbers:          01112333679
prediction    :          01112333679



In [92]:
linear_prediction = get_prediction(new_train_set[:, 1], ESTIMATORS["Linear regression"])

# show true digit string vs. the linear-regression prediction, row by row
for i in range(new_test_size):
    actual = new_train_set[i, 0]
    predicted = linear_prediction[i]
    print("actual numbers: %20s\nprediction    : %20s\n" % (actual, predicted))


actual numbers:          00367777799
prediction    :          00367777799

actual numbers:          11223666899
prediction    :          11223666899

actual numbers:          01233566689
prediction    :          01233566689

actual numbers:          00023455789
prediction    :          00023455789

actual numbers:          02233467899
prediction    :          02233467899

actual numbers:          24445566789
prediction    :          24445566789

actual numbers:           0134556777
prediction    :           0134556777

actual numbers:          02468888999
prediction    :          02468888999

actual numbers:          23346677779
prediction    :          23346677779

actual numbers:          01112333679
prediction    :          01112333679



In [93]:
knn_prediction = get_prediction(new_train_set[:, 1], ESTIMATORS["K-nn"])

# show true digit string vs. the k-nearest-neighbours prediction, row by row
for i in range(new_test_size):
    actual = new_train_set[i, 0]
    predicted = knn_prediction[i]
    print("actual numbers: %20s\nprediction    : %20s\n" % (actual, predicted))


actual numbers:          00367777799
prediction    :          01346777789

actual numbers:          11223666899
prediction    :          11226668899

actual numbers:          01233566689
prediction    :          01233566689

actual numbers:          00023455789
prediction    :           0023455789

actual numbers:          02233467899
prediction    :          01223346789

actual numbers:          24445566789
prediction    :          24445566789

actual numbers:           0134556777
prediction    :           0134556777

actual numbers:          02468888999
prediction    :          01246888899

actual numbers:          23346677779
prediction    :         123346677779

actual numbers:          01112333679
prediction    :           0111233679



In [94]:
ridge_prediction = get_prediction(new_train_set[:, 1], ESTIMATORS["Ridge"])

# show true digit string vs. the ridge-regression prediction, row by row
for i in range(new_test_size):
    actual = new_train_set[i, 0]
    predicted = ridge_prediction[i]
    print("actual numbers: %20s\nprediction    : %20s\n" % (actual, predicted))


actual numbers:          00367777799
prediction    :          00367777799

actual numbers:          11223666899
prediction    :          11223666899

actual numbers:          01233566689
prediction    :          01233566689

actual numbers:          00023455789
prediction    :          00023455789

actual numbers:          02233467899
prediction    :          02233467899

actual numbers:          24445566789
prediction    :          24445566789

actual numbers:           0134556777
prediction    :           0134556777

actual numbers:          02468888999
prediction    :          02468888999

actual numbers:          23346677779
prediction    :          23346677779

actual numbers:          01112333679
prediction    :          01112333679

