In [1]:
import pandas as pd
pd.options.mode.copy_on_write = True
import matplotlib.pyplot as plt
import seaborn as sns


import sys
sys.path.append('..')
import src.data.util as du
import src.text.tokenizer as tk

# Data setup

In [2]:
df = pd.read_csv('../results/complexity_utf-16_4243_20_10_bibles_90_lcm.csv', index_col=False)
bibles = pd.read_csv('../dataset/bibles_90_lcm.csv')

raw_gzip = df[(df.metric == "do-nothing") & (df.algorithm == "gzip")]
raw_bz2 = df[(df.metric == "do-nothing") & (df.algorithm == "bz2")]

In [3]:
df.head()

Unnamed: 0,language,wals,metric,algorithm,value,run_id
0,ANCIENT_GREEK,[grc],del-verses,gzip,0.807393,0
1,ANCIENT_GREEK,[grc],del-verses,gzip,0.809476,1
2,ANCIENT_GREEK,[grc],del-verses,gzip,0.804954,2
3,ANCIENT_GREEK,[grc],del-verses,gzip,0.809315,3
4,ANCIENT_GREEK,[grc],del-verses,gzip,0.811832,4


# Helper functions

In [4]:
def get_indexes(df):
    langs = df.drop_duplicates("language").sort_values("value").language
    ret = {
        col : i
    for i, col in enumerate(langs)
    }
    return ret


def index_difference(id1, id2):
    ids = id1.keys() & id2.keys()
    ret = {
        i : (id1[i] - id2[i])
        for i in ids
    }
    return ret


def compute_numtypes_numtokens(df):
    langs = du.by_field(df, 'language')
    united = {
        lang : du.df_to_str(val)
        for lang, val in langs.items()
    }

    d = dict(language=[], tokens=[], types=[])
    for lang, text in united.items():
        d['language'].append(lang)
        tokens = tk.tokens(text)
        d['tokens'].append(len(tokens))
        d['types'].append(len(tk.types(tokens)))
    return d
       

def fetch_algorithm(df, metric, algo):
    df2 = df[(df.algorithm == algo) & (df.metric == metric)]
    by = ["language"]
    m = df2.groupby(by=by).value.transform("mean")
    s = df2.groupby(by=by).value.transform("std")
    df2['mean'] = m
    df2['std'] = s
    return df2.drop_duplicates('language').sort_values('language').reset_index().drop(columns='index')

# H1'
The language complexity of a translated text should be greater than their counterpart in the source language.

In [5]:
raw_gzip[raw_gzip.value == raw_gzip.value.min()].drop_duplicates("language")

Unnamed: 0,language,wals,metric,algorithm,value,run_id
3410,NHEENGATU,[yrl],do-nothing,gzip,281693.0,0


In [6]:
raw_bz2[raw_bz2.value == raw_bz2.value.min()].drop_duplicates("language")

Unnamed: 0,language,wals,metric,algorithm,value,run_id
3680,NHEENGATU,[yrl],do-nothing,bz2,171128.0,0


# H2'
a) There exists a **positive** correlation between morphological complexity and **the number of types** in a sample.

b) There exists a **negative** correlation between morphological complexity and **the number of tokens** in a sample.

In [7]:
d = compute_numtypes_numtokens(bibles)

tdf = pd.DataFrame(d).sort_values('language')

repwords_gzip = fetch_algorithm(df, 'rep-words', 'gzip')
repwords_bz2 = fetch_algorithm(df, 'rep-words', 'bz2')

print("Gzip results:")
print("a) %0.4f" % tdf.types.corr(repwords_gzip.value, method='pearson'))
print("b) %0.4f" % tdf.tokens.corr(repwords_gzip.value, method='pearson'))

print("\nBz2 results:")
print("a) %0.4f" % tdf.types.corr(repwords_bz2.value, method='pearson'))
print("b) %0.4f" % tdf.tokens.corr(repwords_bz2.value, method='pearson'))

Gzip results:
a) 0.9171
b) -0.7652

Bz2 results:
a) 0.8632
b) -0.7748


# H3'
The results are equivalent wheter using **Gzip** or **Bz2**.

In [8]:
gzipi = get_indexes(raw_gzip)
bz2i = get_indexes(raw_bz2)

gzip_series = pd.Series([gzipi[x] for x in sorted(gzipi) ])
bz2_series = pd.Series([bz2i[x] for x in sorted(bz2i)])

print("Correlation between Gzip and Bz2 is %0.4f" % gzip_series.corr(bz2_series, method='pearson'))

Correlation between Gzip and Bz2 is 0.9896


# H4'

In [9]:
def deviation_across_languages(df):
    df2 = df.loc[:, :]
    by = ["metric", "algorithm"]
    m = df.groupby(by=by).value.transform("mean")
    s = df.groupby(by=by).value.transform("std")
    df2['mean'] = m
    df2['std'] = s
    return df2.sort_values('language').reset_index().drop(columns='index')

In [10]:
daldf = deviation_across_languages(df)

In [11]:
do_nothing_none = daldf['do-nothing', 'none')
do_nothing_gzip = fetch_algorithm(df, 'do-nothing', 'gzip')
do_nothing_bz2  = fetch_algorithm(df, 'do-nothing', 'bz2')

SyntaxError: closing parenthesis ')' does not match opening parenthesis '[' (2945950908.py, line 1)