In [20]:
import numpy as np
import pandas as pd
import string
import gzip
import re

## Data preprocessing

In [21]:
# data = pd.read_csv('data/ukrainian_reviews.csv', sep='|')
# del data['opinion_rating']

In [22]:
# data.head()

In [23]:
# data.to_csv('data/data.txt', sep='\t', index=False, header=False)

In [24]:
def load_data(path):
    df = []
    for line in gzip.open(path, 'rb'):
        df.append(eval(line))
    return pd.DataFrame.from_dict(df)

In [25]:
df = load_data('data/qa_Appliances.json.gz')
df = df['question'] + ' ' + df['answer']

In [26]:
df.to_csv('data/qa_data.txt', sep='\t', index=False, header=False)

In [27]:
# training_data_file = 'data/ua_data.txt'
training_data_file = 'data/qa_data.txt'

## Training

### Helper functions

In [28]:
def remove_punctuation(sentence):
    return sentence.translate(str.maketrans('','', string.punctuation))

In [29]:
def add2dict(dictionary, key, value):
    if key not in dictionary:
        dictionary[key] = []
    dictionary[key].append(value)

In [30]:
def list2probabilitydict(given_list):
    probability_dict = {}
    given_list_length = len(given_list)
    for item in given_list:
        probability_dict[item] = probability_dict.get(item, 0) + 1
    for key, value in probability_dict.items():
        probability_dict[key] = value / given_list_length
    return probability_dict

In [31]:
initial_word = {}
second_word = {}
transitions = {}

### Training function

In [32]:
# Trains a Markov model based on the data in training_data_file
def train_markov_model():
    for line in open(training_data_file):
        tokens = remove_punctuation(line.rstrip().lower()).split()
        tokens_length = len(tokens)
        for i in range(tokens_length):
            token = tokens[i]
            if i == 0:
                initial_word[token] = initial_word.get(token, 0) + 1
            else:
                prev_token = tokens[i - 1]
                if i == tokens_length - 1:
                    add2dict(transitions, (prev_token, token), 'END')
                if i == 1:
                    add2dict(second_word, prev_token, token)
                else:
                    prev_prev_token = tokens[i - 2]
                    add2dict(transitions, (prev_prev_token, prev_token), token)
    
    # Normalize the distributions
    initial_word_total = sum(initial_word.values())
    for key, value in initial_word.items():
        initial_word[key] = value / initial_word_total
        
    for prev_word, next_word_list in second_word.items():
        second_word[prev_word] = list2probabilitydict(next_word_list)
        
    for word_pair, next_word_list in transitions.items():
        transitions[word_pair] = list2probabilitydict(next_word_list)
    
    print('Training successful.')

In [33]:
# # Trains a Markov model based on the data in training_data_file
# i = 0
# for line in open(training_data_file, encoding='utf-8'):
#     if i != 5:
#         tokens = remove_punctuation(line.rstrip().lower()).split()
#         tokens_length = len(tokens)
#         for i in range(tokens_length):
#             token = tokens[i]
#             if len(token) < 13 or (len(token)<2 and (token not in ['a', 'i'])):
#                 if i == 0:
#                     initial_word[token] = initial_word.get(token, 0) + 1
#                 else:
#                     prev_token = tokens[i - 1]
#                     if i == tokens_length - 1:
#                         if len(prev_token) < 2:
#                             print(prev_token)
#                         add2dict(transitions, (prev_token, token), 'END')
#                     if i == 1:
#                         add2dict(second_word, prev_token, token)
#                     else:
#                         prev_prev_token = tokens[i - 2]
#                         add2dict(transitions, (prev_prev_token, prev_token), token)
#             else:
#                 pass
#         i += 1
#     else:
#         break

# # Normalize the distributions
# initial_word_total = sum(initial_word.values())
# for key, value in initial_word.items():
#     initial_word[key] = value / initial_word_total

# for prev_word, next_word_list in second_word.items():
#     second_word[prev_word] = list2probabilitydict(next_word_list)

# for word_pair, next_word_list in transitions.items():
#     transitions[word_pair] = list2probabilitydict(next_word_list)

# print('Training successful.')

In [34]:
train_markov_model()

Training successful.


## Testing

### Helper functions

In [35]:
def sample_word(dictionary):
    p0 = np.random.random()
    cumulative = 0
    for key, value in dictionary.items():
        cumulative += value
        if p0 < cumulative:
            return key
    assert(False)

### Test functions

In [39]:
number_of_sentences = 2

In [42]:
# Function to generate sample text
def generate():
    for i in range(number_of_sentences):
        sentence = []
        # Initial word
        word0 = sample_word(initial_word)
        sentence.append(word0)
        # Second word
        word1 = sample_word(second_word[word0])
        sentence.append(word1)
        # Subsequent words untill END
        while True:
            word2 = sample_word(transitions[(word0, word1)])
            if word2 == 'END':
                break
            sentence.append(word2)
            word0 = word1
            word1 = word2
            print(word2)
        print(i, ' '.join(sentence))

### Testing arena

In [43]:
generate()

a
problem
may
be
adequate
0 ive noticed a problem may be adequate
work
whirlpool
w10130913
water
pump
3363394
3348015
the
one
star
reviews
the
basket
3
14
opening
thanks
this
whole
unit
rodney
the
ice
bin
in
less
than
6
months
1 will this work whirlpool w10130913 water pump 3363394 3348015 the one star reviews the basket 3 14 opening thanks this whole unit rodney the ice bin in less than 6 months
