In [1]:
import pandas as pd
import numpy as np

In [33]:
train_data = pd.read_csv("data/train.csv")
test_data = pd.read_csv("data/test.csv")
sample_data = pd.read_csv("data/sample.csv")
author_df = pd.DataFrame(train_data)["author"]
train_df = pd.DataFrame(train_data)
test_df = pd.DataFrame(test_data)
sample_df = pd.DataFrame(sample_data)

CHAR_ALLOW = [ " "]
ALPHA_ALLOW = ["a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o", "p",
                       "q", "r", "s", "t", "u", "v", "w", "x", "y","z", "ö"]
DF = train_df[0:500]
TEST_DF = train_df[501:600]

In [3]:
def process_char(char):
    char = char.lower()
    if char.isnumeric():
        return " "
    if char.isalpha():
        if char in ALPHA_ALLOW:
            return char
        else: return "ö"
    else: return " "

def process_text(text):
    cleaned = ""
    for c in text:
        cleaned += process_char(c)
    return cleaned

In [9]:
WORDS = set()
for i, r in DF.iterrows():
    new = process_text(r["text"])
    new = new.split(" ")
    WORDS.update(new)
WORDS = list(WORDS)
WORD_TO_IDX = {}
IDX_TO_WORD = {}
for index, word in enumerate(WORDS):
    WORD_TO_IDX[word] = index
    IDX_TO_WORD[index] = word
LEN_WORDS = len(WORDS)

In [19]:
def transition_matrix(text):
    t = np.zeros((LEN_WORDS, LEN_WORDS))
    text = text.split(" ")
    for i in range(len(text)-1):
        curr_word = text[i]
        curr_idx = WORD_TO_IDX[curr_word]
        next_word = text[i+1]
        next_idx = WORD_TO_IDX[next_word]
        t[curr_idx][next_idx] += 1
    
    row_sums = np.sum(t, 1)
    
    for i in range(LEN_WORDS):
        row_sum = row_sums[i]
        if(row_sum == 0):
            row_sum = 1
        t[i, :] = t[i, :]/row_sum
    
    return t

In [22]:
def transition_matrix_by_author(df):
    text_by_author = {}
    for index, row in df.iterrows():
        author = row["author"]
        text = process_text(row["text"])
        if author not in text_by_author.keys():
            text_by_author[author] = ""
        text_by_author[author] = text_by_author[author] + text
    data = []
    for author in text_by_author.keys():
        d = {"author": author, "text": text_by_author[author], "transition_matrix": transition_matrix(text_by_author[author])}
        data.append(d)
    return data

In [31]:
def log_likelihood(matrix, text):
    text = process_text(text).split(" ")
    len_text = len(text) 
    log_likelihood = np.zeros(0)
    for i in range(0, len(text)-1):
        current_word = text[i]
        next_word = text[i+1]
        step_prob = 0
        if current_word in WORD_TO_IDX.keys() and next_word in WORD_TO_IDX.keys():
            step_prob = matrix[WORD_TO_IDX[current_word] , WORD_TO_IDX[next_word]]    
        log_likelihood = np.append(log_likelihood, step_prob) 

    log_likelihood = np.log(log_likelihood)
    likelihood_neglect_special_case = 0
    inf_count = 0

    for i in range(len(log_likelihood)):
        if (log_likelihood[i]!= float("-inf")):
            likelihood_neglect_special_case = likelihood_neglect_special_case+log_likelihood[i] 
        else:
            inf_count = inf_count+1 

    log_likelihood_acc = np.where(log_likelihood == float("-inf"), 0,  log_likelihood)
    log_likelihood_acc = np.cumsum(log_likelihood_acc)
    return likelihood_neglect_special_case/(len_text-inf_count), log_likelihood_acc

In [32]:
def predict_author(text, matrices):
    min_val = -10000000000
    author = ""
    for d in data:
        t = d["transition_matrix"]
        likelikhood = log_likelihood(t, text)[1]
        if likelihood > min_val:
            min_val = likelihood
            author = d["author"]
            
    return {"author": author, "likelihood": min_val}
    

def test(df, matrices):
    count = len(df.index)
    correct = 0
    for i, r in df.iterrows():
        prediction = predict_author(r["text"], matrices)
        if prediction == r["author"]:
            count += 1
    accuracy = correct/count
    print(accuracy)
        
data = transition_matrix_by_author(DF)
print(test(TEST_DF, data))


author:  EAP , log likelihood:  (-1.6700258686722194, array([ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
       -1.88569129, -1.88569129, -1.88569129, -1.88569129, -1.88569129,
       -1.88569129, -1.88569129, -1.88569129, -6.16927785, -6.16927785,
       -6.68010347, -6.68010347, -6.68010347, -6.68010347, -6.68010347,
       -6.68010347]))
author:  HPL , log likelihood:  (-2.0317292866564696, array([  0.        ,   0.        ,   0.        ,   0.        ,
         0.        ,   0.        ,   0.        ,   0.        ,
         0.        ,   0.        ,   0.        ,   0.        ,
         0.        ,   0.        ,   0.        ,   0.        ,
         0.        ,   0.        ,   0.   

  del sys.path[0]
