In [1]:
import ast
import gzip
import string

import numpy as np
import pandas as pd

## Data preprocessing

In [2]:
# Read the pipe-separated review corpus and discard the rating column in one step.
data = pd.read_csv('data/ukrainian_reviews.csv', sep='|').drop(columns=['opinion_rating'])

In [3]:
# Preview the first five rows of the trimmed frame.
data.head(n=5)

Unnamed: 0,opinion_text
0,"хороше місце щоб провести вечір, кальяни смачн..."
1,взуттям задоволена. беру не перший раз. мінус...
2,"відмінний магазин та чемний продавець, єдиним ..."
3,"замовляв дві штуки, а відправили 1"
4,замовляю вже втретє в цьому магазині і планую ...


In [4]:
# Dump the frame as tab-separated text with neither a header row nor the index.
data.to_csv('data/data.txt', header=False, index=False, sep='\t')

In [5]:
def load_data(path):
    """Load a gzip-compressed file holding one Python dict literal per line.

    Parameters
    ----------
    path : str
        Location of the ``.gz`` file.

    Returns
    -------
    pandas.DataFrame
        One row per non-blank line; columns are the dict keys.

    Raises
    ------
    ValueError, SyntaxError
        If a line is not a plain Python literal.
    """
    records = []
    # The context manager closes the gzip handle even if parsing fails.
    # Text mode with UTF-8 matches how eval() decoded the raw bytes before.
    with gzip.open(path, 'rt', encoding='utf-8') as handle:
        for line in handle:
            if not line.strip():
                continue  # tolerate blank lines instead of failing on them
            # SECURITY: this used to be eval(line), which executes arbitrary
            # code embedded in the data file. literal_eval accepts only
            # literals (dict/list/str/number/...), which is all a record needs.
            records.append(ast.literal_eval(line))
    return pd.DataFrame(records)

In [6]:
# Only the free-text answers of the Q&A dump are needed for training.
df = load_data('data/qa_Appliances.json.gz')['answer']

In [7]:
# Write the answers as plain text with neither a header row nor the index.
df.to_csv('data/qa_data.txt', header=False, index=False, sep='\t')

In [8]:
# Corpus read by train_markov_model(). To train on the review corpus instead,
# point this at 'data/data.txt'.
training_data_file = 'data/qa_data.txt'

## Training

### Helper functions

In [9]:
def remove_punctuation(sentence):
    """Return ``sentence`` with every character of ``string.punctuation`` deleted.

    Only the ASCII punctuation listed in ``string.punctuation`` is removed;
    all other characters are returned unchanged.
    """
    # Mapping a character to None in a translation table deletes it.
    deletion_table = str.maketrans(dict.fromkeys(string.punctuation))
    return sentence.translate(deletion_table)

In [10]:
def add2dict(dictionary, key, value):
    """Append ``value`` to the list stored under ``key``, creating the list if needed.

    ``dictionary`` is modified in place; nothing is returned.
    """
    dictionary.setdefault(key, []).append(value)

In [11]:
def list2probabilitydict(given_list):
    """Convert a list of items into ``{item: relative frequency}``.

    Each distinct item maps to its number of occurrences divided by the
    length of ``given_list``. An empty list yields an empty dict.
    """
    total = len(given_list)
    counts = {}
    for element in given_list:
        if element in counts:
            counts[element] += 1
        else:
            counts[element] = 1
    return {element: count / total for element, count in counts.items()}

In [12]:
# Model state, filled in by train_markov_model().
initial_word = dict()  # first token -> count, replaced by its probability after training
second_word = dict()   # first token -> list of second tokens, then {token: probability}
transitions = dict()   # (token, token) -> list of following tokens or 'END', then {token: probability}

### Training function

In [13]:
# Trains a Markov model based on the data in training_data_file
def train_markov_model():
    """Fit the second-order Markov model on ``training_data_file``.

    Each line of the file is lower-cased, stripped of punctuation and split
    on whitespace. The module-level dictionaries are then filled:

    * ``initial_word``: probability of each token being first on a line.
    * ``second_word``: for each first token, the distribution of second tokens.
    * ``transitions``: for each pair of consecutive tokens, the distribution
      of the token that follows, with ``'END'`` marking the end of a line.

    Lines with a single token contribute only to ``initial_word``.

    The three dictionaries are emptied first, so the function can be re-run
    (for example after changing ``training_data_file``) without mixing raw
    counts into the probabilities left by a previous run.
    """
    # Reset in place: after a previous run these hold probabilities, not
    # counts/lists, and accumulating on top of them would corrupt the model.
    initial_word.clear()
    second_word.clear()
    transitions.clear()

    # The context manager guarantees the file is closed once it has been read.
    with open(training_data_file, encoding='utf-8') as training_file:
        for line in training_file:
            tokens = remove_punctuation(line.rstrip().lower()).split()
            last_index = len(tokens) - 1
            for i, token in enumerate(tokens):
                if i == 0:
                    initial_word[token] = initial_word.get(token, 0) + 1
                    continue
                prev_token = tokens[i - 1]
                if i == last_index:
                    # The final pair of a line is followed by the end marker.
                    add2dict(transitions, (prev_token, token), 'END')
                if i == 1:
                    add2dict(second_word, prev_token, token)
                else:
                    add2dict(transitions, (tokens[i - 2], prev_token), token)

    # Normalize the distributions
    initial_word_total = sum(initial_word.values())
    for key, value in initial_word.items():
        initial_word[key] = value / initial_word_total

    for prev_word, next_word_list in second_word.items():
        second_word[prev_word] = list2probabilitydict(next_word_list)

    for word_pair, next_word_list in transitions.items():
        transitions[word_pair] = list2probabilitydict(next_word_list)

    print('Training successful.')

In [14]:
# Populate initial_word, second_word and transitions from training_data_file.
train_markov_model()

Training successful.


## Testing

### Helper functions

In [15]:
def sample_word(dictionary):
    """Draw one key at random from a ``{key: probability}`` mapping.

    A uniform number in ``[0, 1)`` is drawn and the keys are walked in
    iteration order, accumulating their probabilities; the first key whose
    cumulative probability exceeds the drawn number is returned.

    Parameters
    ----------
    dictionary : dict
        Maps each candidate to its probability. The values are expected to
        sum to 1.

    Returns
    -------
    The sampled key.

    Raises
    ------
    ValueError
        If ``dictionary`` is empty.
    """
    if not dictionary:
        raise ValueError('cannot sample from an empty distribution')
    p0 = np.random.random()
    cumulative = 0
    for key, value in dictionary.items():
        cumulative += value
        if p0 < cumulative:
            return key
    # Floating-point rounding can leave the total a hair below 1.0, so p0 may
    # exceed it. Give the leftover mass to the last key instead of failing
    # (the previous assert would also vanish under ``python -O``).
    return key

### Test functions

In [16]:
# How many sentences generate() prints per call.
number_of_sentences = 10

In [17]:
# Function to generate sample text
def generate():
    """Print ``number_of_sentences`` sentences sampled from the trained model.

    Each sentence starts with a word drawn from ``initial_word``, continues
    with a word drawn from ``second_word`` and is then extended from
    ``transitions`` until the ``'END'`` marker is drawn. Every sentence is
    printed on its own line, prefixed with its index.

    Raises
    ------
    RuntimeError
        If the model has not been trained yet (``initial_word`` is empty).
    """
    if not initial_word:
        raise RuntimeError('Model is empty; run train_markov_model() first.')

    for i in range(number_of_sentences):
        # Initial word
        word0 = sample_word(initial_word)
        sentence = [word0]
        # Second word. A word seen only as a one-word line has no entry here,
        # so the sentence simply stops instead of raising KeyError.
        second_choices = second_word.get(word0)
        if second_choices:
            word1 = sample_word(second_choices)
            sentence.append(word1)
            # Subsequent words until END
            while True:
                next_choices = transitions.get((word0, word1))
                if not next_choices:
                    break  # unseen pair: nothing to continue with
                word2 = sample_word(next_choices)
                if word2 == 'END':
                    break
                sentence.append(word2)
                word0, word1 = word1, word2
        print(i, ' '.join(sentence))

### Testing arena

In [18]:
# Print number_of_sentences sentences sampled from the trained model.
generate()

0 i have cooked with it also hope this helpedi love this machine i would say id you want it to replace my filter in our water and use a weak bleachwater solution wipe all services that we love out parts we have done for both which units do u turn on small motor keeps a constant real time update on temp and a few months of quiet operation its beginning to get temperatures right it doesnt accumulate too much i always try to repair or replace parts before we replaced it once and didnt replace the entire laminate material is placed on top of the biggest that will listen how much grease you generate its safe to drink with use of original mounting holes are different it does yes see httpwwwappliancepartsproscomgerackasmlowerwd28x10284ap4980665html i bought them for connecting the water supply line on httpwwwappliancepartsproscomwhirlpoolwaterfilterhousing2186443ap3085317html takes you to use and still like new amazon makes a converter on the back of the paneldrum light fresh hold steam for s