In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier

try:
    # train_test_split lives in model_selection since scikit-learn 0.18;
    # sklearn.cross_validation was removed entirely in 0.20.
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

from helper import read_data

%matplotlib inline

type(read_data)



function

In [2]:
# Base seed: passed as random_state to the decision tree, and offset by the
# run number for each train/test split in evalute().
RANDOM_SEED = 9
# Location of the training file handed to read_data().
TRAIN_FILE_PATH = "./asset/training_data.txt"

# Two-column, pipe-delimited table of vowel phonemes: the left column is the
# code that becomes a dict key in init_phonemes(), the right column its symbol.
# NOTE(review): the codes look like ARPAbet paired with IPA symbols -- confirm.
VOWELS_TABLE = """\
|   AA  |   ɑ   |
|   AE  |   æ   |
|   AH  |   ʌ   |
|   AO  |   ɔ   |
|   AW  |   aʊ  |
|   AY  |   aɪ  |
|   EH  |   ɛ   |
|   ER  |   ɜːr |
|   EY  |   eɪ  |
|   IH  |   ɪ   |
|   IY  |   i   |
|   OW  |   oʊ  |
|   OY  |   ɔɪ  |
|   UH  |   ʊ   |
|   UW  |   u   |
"""

# Two-column, pipe-delimited table of consonant phonemes, in the same layout
# as VOWELS_TABLE: left column is the code (dict key), right column its symbol.
# Row order matters: it fixes the order of the consonant feature columns.
CONSONANT_TABLE = """\
|     P     |  p  |
|     S     |  s  |
|     B     |  b  |
|     SH    |  ʃ  |
|     CH    |  tʃ |
|     T     |  t  |
|     D     |  d  |
|     TH    |  θ  |
|     DH    |  ð  |
|     V     |  v  |
|     F     |  f  |
|     W     |  w  |
|     G     |  g  |
|     Y     |  j  |
|     HH    |  h  |
|     Z     |  z  |
|     JH    |  dʒ |
|     ZH    |  ʒ  |
|     K     |  k  |
|     L     |  l  |
|     M     |  m  |
|     N     |  n  |
|     NG    |  ŋ  |
|     R     |  r  |
"""


In [3]:
def init_phonemes(phonemes_table):
    """Parse a pipe-delimited two-column table into a ``{code: symbol}`` dict.

    All spaces are stripped, the text is split on ``|``, and the empty and
    newline-only cells that fall between rows are discarded. The remaining
    cells are consumed pairwise: first of each pair is the key, second the
    value. Insertion order follows the row order of the table.

    Raises:
        IndexError: if an odd number of cells remains after filtering.
    """
    raw_cells = phonemes_table.replace(" ", "").split("|")
    # `cell not in "\n"` is a substring test: it rejects both "" and "\n".
    cells = [cell for cell in raw_cells if cell not in "\n"]
    return {cells[pos]: cells[pos + 1] for pos in range(0, len(cells), 2)}

# Load the training lines via the project-local helper. Later cells iterate
# over train_data and treat every item as "WORD:PH1 PH2 ..." text.
train_data = read_data(TRAIN_FILE_PATH)

# Parse both phoneme tables into {code: symbol} dicts; their key order defines
# the order of the one-hot feature columns built in get_data().
vowels_dict = init_phonemes(VOWELS_TABLE)
consonants_dict = init_phonemes(CONSONANT_TABLE)

In [4]:
def contain_phonemes(phoneme, phonemes_dict):
    """Return True when ``phoneme`` is a member (key) of ``phonemes_dict``."""
    is_known = phoneme in phonemes_dict
    return is_known

def count_contain_phonemes_num(phonemes_list, phonemes_dict):
    """Count how many entries of ``phonemes_list`` are found in ``phonemes_dict``.

    Duplicates in ``phonemes_list`` are counted once per occurrence.
    """
    return sum(
        1
        for phoneme in phonemes_list
        if contain_phonemes(phoneme, phonemes_dict)
    )

def get_world(line):
    """Return the part of ``line`` that precedes the first ``":"``.

    If the line contains no colon, the whole line is returned.
    """
    head, _sep, _rest = line.partition(":")
    return head

def has_number(string):
    """Return True if at least one character of ``string`` is a digit."""
    for char in string:
        if char.isdigit():
            return True
    return False

def get_stress_index(vowels_list):
    """Return the 1-based position of the first entry containing ``"1"``.

    Raises:
        IndexError: if no entry of ``vowels_list`` contains ``"1"``.
    """
    stressed_positions = []
    for position, vowel in enumerate(vowels_list, start=1):
        if "1" in vowel:
            stressed_positions.append(position)
    return stressed_positions[0]

## Decision Tree

In [5]:
def get_data(line):
    """Turn one ``"WORD:PH1 PH2 ..."`` line into a flat feature row.

    The text after the first ``":"`` is split on single spaces. Tokens that
    contain a digit are treated as vowels, all remaining tokens as consonants.

    Returns a list laid out as::

        [stress_index, vowel_count, consonant_count,
         <one 0/1 flag per key of vowels_dict>,
         <one 0/1 flag per key of consonants_dict>]

    Vowel flags are matched on the first two characters of each vowel token
    (i.e. with the trailing stress digit cut off).
    """
    pronunciation = line.split(":")[1]

    vowels_list = []
    consonants_list = []
    for phoneme in pronunciation.split(" "):
        if has_number(phoneme):
            vowels_list.append(phoneme)
        else:
            consonants_list.append(phoneme)

    # Strip the stress digit once, instead of once per dictionary key.
    vowel_codes = [vowel[0:2] for vowel in vowels_list]
    vowels_vector = [1 if code in vowel_codes else 0 for code in vowels_dict]
    consonants_vector = [1 if code in consonants_list else 0 for code in consonants_dict]

    counts = [get_stress_index(vowels_list), len(vowels_list), len(consonants_list)]
    return counts + vowels_vector + consonants_vector

In [6]:
# One feature row per training line (built by get_data), indexed by the text
# before the ":" in that line. The column list must mirror get_data's return
# layout: three summary columns, then the vowel keys, then the consonant keys.
df = pd.DataFrame([get_data(line) for line in train_data], 
                  index=[get_world(line) for line in train_data],
                  columns=["stress_index", "vowels_size", "constans_size"] + list(vowels_dict) + list(consonants_dict))
# Preview the first rows as the cell output.
df.head()
# df.describe()

Unnamed: 0,stress_index,vowels_size,constans_size,AA,AE,AH,AO,AW,AY,EH,...,HH,Z,JH,ZH,K,L,M,N,NG,R
COED,1,2,2,0,0,0,0,0,0,1,...,0,0,0,0,1,0,0,0,0,0
PURVIEW,1,2,3,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
HEHIR,1,2,3,0,0,0,0,0,0,1,...,1,0,0,0,0,0,0,0,0,1
MUSCLING,1,3,4,0,0,1,0,0,0,0,...,0,0,0,0,0,1,1,0,1,0
NONPOISONOUS,2,4,6,1,0,1,0,0,0,0,...,0,1,0,0,0,0,0,1,0,0


In [7]:
# df[(df["vowels_size"] == 1) | (df["vowels_size"] == 5)].size

# df.groupby(['vowels_size', 'stress_index']).agg(
#     {'constans_size':'count'}).groupby(level=0).apply(lambda x : 
# x * 100 / float(x.sum())).add_suffix('_Count').reset_index().columns[1]

# df.groupby(['vowels_size', 'stress_index']).agg(
#     {'constans_size':'count'}).groupby(level=0).apply(lambda x :                          
#                                                       x * 100 / float(x.sum())).groupby("vowels_size").plot.pie(subplots=True)

# Rank every column by its correlation with stress_index (DataFrame.corr with
# its default method), highest first. stress_index correlates perfectly with
# itself, so it ends up at position 0 of the list.
# NOTE(review): the sort uses the signed value, not the absolute value, so
# strongly negatively correlated columns land at the END of the ranking --
# confirm this is intended before slicing the "top" features.
feature_list = list(df.corr().stress_index.to_frame().sort_values(by='stress_index', ascending=False).index)
feature_list

['stress_index',
 'vowels_size',
 'constans_size',
 'IY',
 'AA',
 'OW',
 'AH',
 'N',
 'EH',
 'SH',
 'V',
 'K',
 'UW',
 'R',
 'P',
 'IH',
 'EY',
 'M',
 'T',
 'Y',
 'S',
 'D',
 'ZH',
 'JH',
 'CH',
 'AO',
 'Z',
 'OY',
 'F',
 'L',
 'UH',
 'TH',
 'DH',
 'AE',
 'G',
 'AY',
 'AW',
 'B',
 'NG',
 'W',
 'HH',
 'ER']

In [8]:
def experiment_once(train, test, feature_list, depth=5, log=False):
    """Fit one gini decision tree on ``train`` and score it on both splits.

    Args:
        train, test: frames that carry a ``stress_index`` column (the target)
            plus every column named in ``feature_list``.
        feature_list: names of the columns used as model inputs.
        depth: ``max_depth`` passed to the tree.
        log: when True, print ``train, test, train - test`` for this run.

    Returns:
        ``(train_score, test_score)`` as produced by the classifier's
        ``score`` method. Per the scikit-learn docs this is mean accuracy
        (higher is better), even though callers name the values ``*_err``.
    """
    features_train = train[feature_list]
    features_test = test[feature_list]
    target_train = train.stress_index
    target_test = test.stress_index

    model = DecisionTreeClassifier(criterion="gini", max_depth=depth, random_state=RANDOM_SEED)
    model = model.fit(features_train, target_train)

    train_score = model.score(features_train, target_train)
    test_score = model.score(features_test, target_test)
    if log:
        print("{}, {}, {}".format(train_score, test_score, train_score - test_score))
    return train_score, test_score

def evalute(feature_list, times=10, depth=5, log=False):
    """Repeat an 80/20 split-and-fit ``times`` times and print the averages.

    Each run splits the module-level ``df`` with ``random_state`` set to
    ``RANDOM_SEED + run`` and delegates to ``experiment_once``. After the
    loop it prints ``avg:`` followed by the mean train score, the mean test
    score and the mean gap between them. Nothing is returned.

    Raises:
        ZeroDivisionError: if ``times`` is 0 (after ``avg:`` was printed).
    """
    train_sum = 0
    test_sum = 0
    for run in range(times):
        train, test = train_test_split(df, test_size=0.2, random_state=RANDOM_SEED + run)
        train_score, test_score = experiment_once(train, test, feature_list, depth, log)
        train_sum += train_score
        test_sum += test_score

    print("avg:")
    averages = (train_sum / times, test_sum / times, (train_sum - test_sum) / times)
    print("{}, {}, {}".format(*averages))

# feature_list[0] is 'stress_index' itself (the target), so the slice starts
# at 1 and takes the next 19 columns of the ranking. Ten runs are made with
# max_depth=100; the printed train scores sit well above the test scores.
evalute(feature_list[1:20], 10, depth=100, log=True)

0.892725, 0.745, 0.147725
0.8913, 0.7512, 0.1401
0.891725, 0.7452, 0.14652500000000002
0.8917, 0.7486, 0.1431
0.891475, 0.7549, 0.136575
0.891525, 0.7496, 0.14192499999999997
0.89125, 0.7476, 0.14364999999999994
0.8917, 0.7495, 0.1422
0.890725, 0.7554, 0.13532500000000003
0.89315, 0.7461, 0.14705000000000001
avg:
0.8917275, 0.7493100000000001, 0.14241749999999992
