# 1. Data Exploration

### Information on the Dataset

Data Fields for SNOW T15 and SNOW T23 ⛄<br>
Resource: https://huggingface.co/datasets/snow_simplified_japanese_corpus <br>
Paper: https://aclanthology.org/L18-1072.pdf

- <strong>ID</strong>: sentence ID.
- <strong>original_ja</strong>: original Japanese sentence.
- <strong>simplified_ja</strong>: simplified Japanese sentence.
- <strong>original_en</strong>: original English sentence.
- <strong>proper_noun</strong>: (included ONLY in SNOW T23) Terms that the workers have extracted as proper nouns. The authors instructed workers not to rewrite proper nouns, leaving the determination of proper nouns to the workers.

# 2. Baseline Model

In the SNOW T15 dataset it states: <br>
<i>Core vocabulary is restricted to 2,000 words where it is selected by accounting for several factors such as meaning preservation, variation, simplicity and the UniDic word segmentation criterion.</i>

#### Step 1: Take a sample from the SNOW T15 dataset and extract 2,000 simplified terms.

In [1]:
""" Required installations """

!pip install mecab-python3
#These wheels include a copy of the MeCab library, but not a dictionary. 
#In order to use MeCab you'll need to install a dictionary. unidic-lite is a good one to start with:
!pip install unidic-lite

# normalization tool
!pip install neologdn

!pip install openpyxl

# To be able to see in Japanese!
!pip install japanize_matplotlib



In [2]:
import os
import pandas as pd
import numpy as np

# Preprocessing
import MeCab
import neologdn
import collections
from nltk import FreqDist
from nltk.corpus import stopwords

import time
import logging
import collections
import logging
import time
from gensim.models.word2vec import Word2Vec
logging.basicConfig()
logging.root.setLevel(logging.INFO)

# Visualization
import matplotlib.pyplot as plt
import japanize_matplotlib


In [3]:
def get_data(file):
    """
    Load one raw-data workbook and return it as a DataFrame.

    The workbook is read with ``pandas.read_excel`` from the 'data/2_RawData'
    directory, which is located by cutting the notebook's relative path off
    the end of this file's absolute path.
    The English and proper-noun columns are discarded when present, and the
    two Japanese columns are renamed to 'original' and 'simplified'.
    """

    # FIXME:  Make sure to
    # 1. Change these when you transfer to .py file
    # 2. Put these global variables somewhere else
    CURRENT_PATH = 'notebooks/Untitled.ipynb'
    DATA_PATH = 'data/2_RawData'

    # Remove as many trailing characters as CURRENT_PATH is long, then append
    # the data directory to what is left.
    this_file = os.path.abspath(__file__)
    root_prefix = this_file[:-len(CURRENT_PATH)]
    data_dir = root_prefix + DATA_PATH

    frame = pd.read_excel(os.path.join(data_dir, file))

    # errors='ignore' lets workbooks without these columns pass through.
    frame = frame.drop(columns=['#英語(原文)', '#固有名詞'], errors='ignore')
    frame = frame.rename(
        columns={"#日本語(原文)": "original", "#やさしい日本語": "simplified"}
    )
    return frame

In [22]:
# FIXME: Set df in __init__ 
def term_frequency(df, col='original'):
    """
    Count how often each term occurs in one text column of a corpus.

    Every value of ``df[col]`` is tokenised with MeCab. A token is skipped when
    the first field of its feature string (the part of speech) is one of
    "助動詞", "助詞", "補助記号", or when its surface form is contained in
    NLTK's 'japanese' stopword list. All remaining surface forms are counted.

    Args:
        df: DataFrame holding the corpus.
        col: name of the column whose text is tokenised.

    Returns:
        collections.Counter mapping each surface form to its frequency.
    """
    # FIXME : Need to find a way to implement japanese_stopword.txt when this file is used externally
    # Build the lookup sets once: membership is tested for every token.
    jp_stopwords = set(stopwords.words('japanese'))
    ignored_pos = {"助動詞", "助詞", "補助記号"}

    all_terms = collections.Counter()
    t = MeCab.Tagger("-O wakati")
    for text in df[col]:
        # The first node is skipped and the loop ends on the node that has no
        # successor, so MeCab's sentence start/end markers are never counted.
        node = t.parseToNode(text).next
        while node.next:
            part_of_speech = node.feature.split(',')[0]
            if part_of_speech not in ignored_pos and node.surface not in jp_stopwords:
                all_terms[node.surface] += 1
            node = node.next
    return all_terms

In [8]:
def get_simplified_terms(df, n_most_common):
    """
    Build a list of terms that characterise the simplified sentences.

    Sentence pairs whose 'original' and 'simplified' texts are identical are
    dropped first. Term frequencies are computed for both columns and the
    original counts are subtracted from the simplified counts. Terms whose
    resulting count is not negative are kept, ordered by that count from
    highest to lowest, and at most ``n_most_common`` of them are returned.
    """
    # Keep only the pairs in which the sentence was actually rewritten
    changed_pairs = df[df['original'] != df['simplified']]

    counts_original = term_frequency(changed_pairs, 'original')
    counts_simplified = term_frequency(changed_pairs, 'simplified')

    # Counter.subtract works in place and keeps zero and negative entries
    counts_simplified.subtract(counts_original)

    table = pd.DataFrame(dict(counts_simplified).items(), columns=['word', 'count'])
    kept = table[table['count'] >= 0]
    ranked = kept.sort_values(by='count', ascending=False)
    return ranked['word'].tolist()[:n_most_common]

In [23]:
# Load 'SNOW_T15_10000.xlsx' and count the simplified terms returned when up
# to 2,000 are requested (fewer come back if fewer terms qualify).
df = get_data('SNOW_T15_10000.xlsx')
len(get_simplified_terms(df, 2000))

1780

#### Step 2: Using the list of 2,000 simplified terms from Step 1, find the nearest term

In [None]:
"""
tf_list = 2000 simplified term frequency retrieved from data
pos_list = specified list of POS (Parts-Of-Speech)

1. Go through each row in the original data
2. Check if the word's POS is in the pos_list
3. Check if the word is in the tf_list
    if yes, continue to the next word
4. If 3. is no: check the similarity of the word with every term in the tf_list
5. Replace the word with the most similar term if the maximum similarity exceeds a specified threshold

"""

"""
PSEUDO CODE

threshold = minimum similarity
for sentence in data:
    for word in sentence:
        if word.pos in pos_list:
            if word in tf_list:
                continue
            else:
                similarities = []
                for tf in tf_list:
                    similarities.append(wv.similarity(word, tf))
                replace word with the tf of max(similarities) if max(similarities) > threshold
        else: continue
"""

In [None]:
def replace_terms(data, term_list=term_list, wv=wv):
    """
    Baseline simplifier: replace nouns, verbs and pronouns that are not in
    ``term_list`` with their nearest Word2Vec neighbour.

    For every sentence in ``data``:
    1. Normalise it with neologdn and turn the digits 0-9 into kanji numerals.
    2. Tokenise it with MeCab.
    3. Keep a token unchanged when its part of speech is not one of
       "名詞", "動詞", "代名詞", or when its word form is already in ``term_list``.
    4. Otherwise take the top hit of ``wv.most_similar(word)`` and use it when
       its similarity is above 0.5. Words missing from the model raise
       KeyError, which is logged as a warning; the word is then kept.
    5. Join the tokens without separators and store the result under the
       sentence's index label.

    NOTE(review): the replacement is the nearest neighbour in the whole
    vocabulary of ``wv``; it is not restricted to ``term_list``. Confirm
    whether that is intended.

    input:
    data, pd.Series of sentences
    term_list, list of simplified terms
    wv, word2vec model.wv

    output:
    prediction, pd.Series (a copy of ``data`` holding the rewritten sentences)
    """
    logging.root.setLevel(logging.INFO)

    start = time.time()
    # Report an unexpected container type without aborting (isinstance
    # instead of assert, which is stripped when Python runs with -O).
    if isinstance(data, pd.Series):
        logging.info("Data file type OK")
    else:
        print("Data file type is NOT a pd.series")

    pos_list = ("名詞", "動詞", "代名詞") # POS (part of speech) that will possibly be removed
    threshold = 0.5 # Threshold of similarity, over which a term will be replaced
    # Position of the word form inside MeCab's comma-separated feature string.
    # Assumes a UniDic-style dictionary such as unidic-lite.
    word_field = 8
    t = MeCab.Tagger()
    # One translation table converts every digit in a single pass.
    digit_to_kanji = str.maketrans({
        "0": "零",
        "1": "一",
        "2": "二",
        "3": "三",
        "4": "四",
        "5": "五",
        "6": "六",
        "7": "七",
        "8": "八",
        "9": "九",
    })
    # Set lookup: membership is tested for every candidate token.
    known_terms = set(term_list)
    prediction = data.copy()

    # Iterate over every sentence in the dataset
    for idx, row in data.items():
        row = neologdn.normalize(row)
        logging.debug(f"Currrent sentence: {row}")
        row = row.translate(digit_to_kanji)
        sentence = []
        # Iterate over every word in the sentence (first node skipped, loop
        # ends on the node without a successor)
        node = t.parseToNode(row).next
        while node.next:
            features = node.feature.split(',')
            part_of_speech = features[0]
            # Tokens with a short feature list (e.g. unknown words) have no
            # field 8; fall back to the surface form instead of raising.
            if len(features) > word_field:
                word = features[word_field]
            else:
                word = node.surface

            replacement = word
            if part_of_speech in pos_list and word not in known_terms:
                try:
                    closest_word, similarity = wv.most_similar(word)[0]
                except KeyError as e:
                    logging.warning(f"{e}. Term will not be replaced.")
                else:
                    if similarity > threshold:
                        replacement = closest_word
            sentence.append(replacement)
            node = node.next
        logging.debug(sentence)
        prediction[idx] = "".join(sentence)

    assert len(data) == len(prediction)  # Make sure prediction and data have the same size
    end = time.time()
    logging.info(end-start)
    return prediction
            
            
# Run the baseline on X150 (defined outside the cells shown here — presumably
# a sample of sentences; verify).
predictions = replace_terms(X150)

# 3. Evaluation Metric

### 3.1) WER SCORE

In [None]:
def wer_score(predicted, simplified, debug=True):
    '''
    Word error rate (WER) between a predicted sentence and the given
    simplified sentence.

    Both strings are split on whitespace. The word-level edit distance
    (substitution, insertion and deletion each cost 1) is divided by the
    number of words in ``predicted``.

    Args:
        predicted: whitespace-separated text; its word count is the divisor.
        simplified: whitespace-separated text to compare against.
        debug: kept for backward compatibility; it has no effect on the result.

    Returns:
        float, 0.0 when both word sequences are identical. When ``predicted``
        contains no words the edit distance is returned undivided (the word
        count of ``simplified``) instead of raising ZeroDivisionError.
    '''
    r = predicted.split()
    h = simplified.split()

    # Levenshtein distance with two rolling rows. previous[j] is the cost of
    # turning the first i-1 words of r into the first j words of h.
    previous = list(range(len(h) + 1))
    for i, r_word in enumerate(r, start=1):
        current = [i]  # deleting i words of r to reach an empty h prefix
        for j, h_word in enumerate(h, start=1):
            if r_word == h_word:
                current.append(previous[j - 1])
            else:
                current.append(1 + min(
                    previous[j - 1],  # substitution
                    current[j - 1],   # insertion
                    previous[j],      # deletion
                ))
        previous = current

    errors = previous[-1]
    # Guard the divisor so an empty prediction does not divide by zero.
    return errors / max(len(r), 1)
    
    
def wer_jp(original, simplified):
    """
    Character-level error rate for Japanese text.

    Japanese has no spaces between words, so every character of both inputs
    is treated as one "word": the characters are joined with spaces and the
    two strings are passed to ``wer_score``.

    Args:
        original: text whose character count is the divisor of the score.
        simplified: text to compare against.

    Returns:
        The score rounded to three decimals. The same value is printed.
    """
    ori = ' '.join(original)
    simpi = ' '.join(simplified)
    score = round(wer_score(ori, simpi), 3)
    print('WER Score ', score)
    return score


def evaluate_wer_score(df):
    """
    Add a 'WER_score' column holding the ``wer_jp`` score of every row.

    The first column is passed to ``wer_jp`` as the original text and the
    second column as the simplified text. Rows are read by position, so the
    DataFrame may carry any index (for example after filtering).

    Args:
        df: DataFrame whose first two columns hold the texts to compare.
            It is modified in place.

    Returns:
        None.
    """
    df['WER_score'] = [
        wer_jp(original_text, simplified_text)
        for original_text, simplified_text in zip(df.iloc[:, 0], df.iloc[:, 1])
    ]

# 3. Preprocessing

# 3.1) Data Organization and Clean Up!

In [None]:
# All the imported libraries go here for Section 2