## Calculating mutual information and Shannon entropy


In [1]:
import ast
import glob
import os

import numpy as np
import pandas as pd

# Notebook-level switches.
# `explode`: no executable line visible in this notebook reads it; it only appears
# in commented-out code further down.
# NOTE(review): presumably a leftover from an earlier explode-vs-drop-multilabels
# setup — confirm before removing.
explode=True
# `all_registers`: when True, the preprocessing cell explodes the multi-label
# register, sub-register and genre predictions into one row per label.
all_registers=True

In [2]:
def read_data(path):
    """Load every ``*.tsv`` file found under ``path`` into one DataFrame.

    The directory tree rooted at ``path`` is walked recursively; each
    tab-separated file is read with ``pd.read_csv`` and the frames are
    concatenated with a fresh 0..n-1 index.

    Parameters
    ----------
    path : str
        Root directory to search.

    Returns
    -------
    pandas.DataFrame
        Rows of all files, concatenated in sorted directory / file order so
        that repeated runs produce the same row order.

    Raises
    ------
    ValueError
        If no ``.tsv`` file exists under ``path``.
    """
    frames = []

    for subdir, dirnames, _ in os.walk(path):
        # Sorting in place makes os.walk descend in a deterministic order.
        dirnames.sort()
        # Escape the directory part so names containing glob metacharacters
        # ('[', '*', '?') are matched literally instead of being silently skipped.
        pattern = os.path.join(glob.escape(subdir), '*.tsv')
        for file in sorted(glob.glob(pattern)):
            frames.append(pd.read_csv(file, sep='\t'))
            print(f'Read {file} succesfully.', flush=True)

    if not frames:
        # pd.concat([]) would raise an opaque "No objects to concatenate".
        raise ValueError(f"No .tsv files found under {path!r}")

    # ignore_index=True already yields a clean RangeIndex; no reset_index needed.
    combined_df = pd.concat(frames, ignore_index=True)

    print("All data read.", flush=True)
    return combined_df

# Commented-out alternative that reads a single result file instead of the whole directory:
#df = pd.read_csv("/scratch/project_2009199/register-vs-genre/results/en_1/020824_large_multiL-large_03_04.tsv", sep='\t')
# Read and concatenate every .tsv file found under the results directory.
df = read_data("/scratch/project_2009199/register-vs-genre/results/")

# Register hierarchy: each upper-case main register maps to the list of labels
# that belong to it (the main label itself followed by its lower-case sub-labels).
labels_all_hierarchy_with_other = {
    # "MT": ["MT"],   # currently disabled
    "LY": ["LY"],
    "SP": ["SP", "it", "os"],
    "ID": ["ID"],
    "NA": ["NA", "ne", "sr", "nb", "on"],
    "HI": ["HI", "re", "oh"],
    "IN": ["IN", "en", "ra", "dtp", "fi", "lt", "oi"],
    "OP": ["OP", "rv", "ob", "rs", "av", "oo"],
    "IP": ["IP", "ds", "ed", "oe"],
}

# Inverse view of the hierarchy: any label (main or sub) -> its main register.
reverse_lookup = {
    member: parent
    for parent, members in labels_all_hierarchy_with_other.items()
    for member in members
}



def separate_sub_labels_from_incorrect_main_labels(df, hierarchy=None):
    """Blank out sub-register labels that do not belong to the row's main register.

    After exploding a multi-label prediction such as ``["NA", "OP", "ob"]``
    every main label is paired with every sub-label, which produces invalid
    pairs like ``NA`` + ``ob``. For each row, a string sub-label that is not
    listed under the row's main register in ``hierarchy`` is replaced by NaN.
    Non-string sub-labels (e.g. NaN) are passed through untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``register_prediction`` (one main label per
        row) and ``subregister_prediction``. The ``subregister_prediction``
        column is overwritten in place.
    hierarchy : dict[str, list[str]], optional
        Mapping main register -> allowed labels. Defaults to the module-level
        ``labels_all_hierarchy_with_other``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the cleaned ``subregister_prediction``.

    Notes
    -----
    A main label that is missing from ``hierarchy`` has no allowed sub-labels,
    so any string sub-label paired with it becomes NaN.
    """
    if hierarchy is None:
        hierarchy = labels_all_hierarchy_with_other

    cleaned = [
        sub if not isinstance(sub, str) or sub in hierarchy.get(main, ()) else np.nan
        for main, sub in zip(df["register_prediction"], df["subregister_prediction"])
    ]
    # Positional assignment: the index may contain duplicates after explode().
    df["subregister_prediction"] = cleaned

    return df


# Parse the stringified label lists exactly once. ast.literal_eval only accepts
# Python literals, so a malformed or malicious cell in the TSV files cannot run
# arbitrary code the way eval() would.
parsed_registers = df["register_prediction"].apply(ast.literal_eval)

# Full parsed prediction, kept to check later whether a label such as
# ["NA", "OP", "ob"] was divided correctly.
df["original_register"] = parsed_registers
# Lower-case labels are sub-registers ...
df["subregister_prediction"] = parsed_registers.apply(
    lambda labels: [label for label in labels if label.islower()]
)
# ... and the keys of the hierarchy are the main registers.
df["register_prediction"] = parsed_registers.apply(
    lambda labels: [label for label in labels if label in labels_all_hierarchy_with_other]
)
df["genre_prediction"] = df["genre_prediction"].apply(ast.literal_eval)

if all_registers:
    # One row per (main register, sub-register) pair.
    df = df.explode("register_prediction").explode("subregister_prediction")
    # .copy() so the in-place column assignment in the helper below works on an
    # independent frame instead of a slice (avoids SettingWithCopyWarning).
    df = df[df['register_prediction'].notna()].copy()
    print(df[["register_prediction","subregister_prediction","original_register"]].head(n = 20))
    df = separate_sub_labels_from_incorrect_main_labels(df)
    print(df[["register_prediction","subregister_prediction","original_register"]].head(n = 20))
    df = df.explode("genre_prediction")
# No else branch is needed: "register_prediction" has already been reduced to the
# main registers above. The previous else branch called eval() on those already
# parsed lists, which raised a TypeError whenever all_registers was False.

Read /scratch/project_2009199/register-vs-genre/results/en_7/020824_large_multiL-large_03_04.tsv succesfully.
Read /scratch/project_2009199/register-vs-genre/results/en_9/020824_large_multiL-large_03_04.tsv succesfully.
Read /scratch/project_2009199/register-vs-genre/results/en_5/020824_large_multiL-large_03_04.tsv succesfully.
Read /scratch/project_2009199/register-vs-genre/results/en_8/020824_large_multiL-large_03_04.tsv succesfully.
Read /scratch/project_2009199/register-vs-genre/results/en_2/020824_large_multiL-large_03_04.tsv succesfully.
Read /scratch/project_2009199/register-vs-genre/results/en_3/020824_large_multiL-large_03_04.tsv succesfully.
Read /scratch/project_2009199/register-vs-genre/results/en_6/020824_large_multiL-large_03_04.tsv succesfully.
Read /scratch/project_2009199/register-vs-genre/results/en_1/020824_large_multiL-large_03_04.tsv succesfully.
Read /scratch/project_2009199/register-vs-genre/results/en_4/020824_large_multiL-large_03_04.tsv succesfully.
All data r

In [3]:

#df["full_register_prediction"] = df['register_prediction'].fillna('') + '-' + df['subregister_prediction'].fillna('')

def _join_register_labels(row):
    """Build the combined register label for one row.

    Returns "<main>-<sub>" when both labels are present, the single label that
    is present when only one is, and NaN when neither is.
    """
    main = row['register_prediction']
    sub = row['subregister_prediction']
    has_main = pd.notna(main)
    has_sub = pd.notna(sub)

    if has_main and has_sub:
        return f"{main}-{sub}"
    if has_main:
        return main
    if has_sub:
        return sub
    return np.nan


df['combined'] = df.apply(_join_register_labels, axis=1)
display(df)
#if explode:
#    df = df.explode("register_prediction").explode("genre_prediction").dropna()
#else: #drop multilabels
#    df = df[df['register_prediction'].apply(lambda x: len(x) == 1)]
#    df = df[df['genre_prediction'].apply(lambda x: len(x) == 1)]
#    df = df.explode("register_prediction").explode("genre_prediction").dropna()   # this flattens! does not add any rows here since we did the two rows above!
#display(df)

Unnamed: 0.1,Unnamed: 0,id,labels,text,register_prediction,genre_prediction,original_register,subregister_prediction,combined
0,0,33159231,['OP'],ATC Operations Los Angeles puts you in charge ...,IN,Literature & Fiction,"[IN, OP]",,IN
0,0,33159231,['OP'],ATC Operations Los Angeles puts you in charge ...,OP,Literature & Fiction,"[IN, OP]",,OP
1,1,33159278,['NA'],Moving level With Juan Manoel Fangio on FIVE W...,,Engineering & Transportation,"[NA, sr]",sr,NA-sr
2,2,33159452,[],Zombie Bitcoin Defense Free Download PC Game s...,IN,Literature & Fiction,"[IN, IP, ds]",,IN
2,2,33159452,[],Zombie Bitcoin Defense Free Download PC Game s...,IN,Politics & Social Sciences,"[IN, IP, ds]",,IN
...,...,...,...,...,...,...,...,...,...
1001584,111641,21956818,['IN'],The Spoke NYC gallery is known for its strikin...,,Literature & Fiction,"[NA, IN, dtp]",,
1001584,111641,21956818,['IN'],The Spoke NYC gallery is known for its strikin...,IN,Literature & Fiction,"[NA, IN, dtp]",dtp,IN-dtp
1001585,111642,21956886,['IN'],At BABEENI our excellent customer service alwa...,IN,,"[IN, IP, ds]",,IN
1001585,111642,21956886,['IN'],At BABEENI our excellent customer service alwa...,IP,,"[IN, IP, ds]",ds,IP-ds


In [4]:
def cooccurrence_matrix(df):
    """Estimate the joint distribution of register and genre labels.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``combined`` (register label) and
        ``genre_prediction`` (genre label), one label pair per row.

    Returns
    -------
    numpy.ndarray
        float64 array of shape ``(n_registers, n_genres)``. Entry ``[i, j]`` is
        the fraction of rows carrying the i-th register and j-th genre, with
        labels ordered as returned by ``np.unique`` (sorted). The entries sum
        to 1 for non-empty input.
    """
    # return_inverse gives, for every row, the position of its label in the
    # sorted unique list, i.e. the matrix index, without a Python-level lookup.
    regs, reg_idx = np.unique(df["combined"].to_numpy(), return_inverse=True)
    gens, gen_idx = np.unique(df["genre_prediction"].to_numpy(), return_inverse=True)
    regs = regs.tolist()
    gens = gens.tolist()
    print("regs: ", regs)
    print("gens: ", gens)

    print(range(len(regs)))

    # Integer counts are exact; a float32 accumulator stops incrementing at 2**24.
    counts = np.zeros((len(regs), len(gens)), dtype=np.int64)
    # np.add.at accumulates correctly when the same (i, j) pair occurs repeatedly.
    np.add.at(counts, (reg_idx.reshape(-1), gen_idx.reshape(-1)), 1)

    total = len(df)
    if total == 0:
        # Nothing to normalise; return the (empty) matrix as floats.
        return counts.astype(np.float64)
    return counts / total


m = cooccurrence_matrix(df)

# Sanity check: the matrix must carry a total probability mass of 1 (within
# 0.001), whether it is accumulated over the axis-0 sums or the axis-1 sums.
genre_mass = sum(m.sum(axis=0))
register_mass = sum(m.sum(axis=1))
assert 0.999 < genre_mass < 1.001 and 0.999 < register_mass < 1.001
            

regs:  ['HI', 'HI-re', 'ID', 'IN', 'IN-dtp', 'IN-en', 'IN-fi', 'IN-lt', 'IN-ra', 'IP', 'IP-ds', 'LY', 'NA', 'NA-nb', 'NA-ne', 'NA-sr', 'OP', 'OP-av', 'OP-ob', 'OP-rs', 'OP-rv', 'SP', 'SP-it']
gens:  ['Cookbooks, Food & Wine', 'Engineering & Transportation', 'Literature & Fiction', 'Medicine & Health Sciences', 'None', 'Politics & Social Sciences', 'Science & Math']
range(0, 23)


In [5]:
# Total probability mass, accumulated once via the axis-0 sums (per genre) and
# once via the axis-1 sums (per register).
genre_marginal = m.sum(axis=0)
register_marginal = m.sum(axis=1)
print(sum(genre_marginal))
print(sum(register_marginal))

1.0000000000000002
0.9999999999999998


In [9]:
def entropy(m):
    """Shannon entropies (in bits) of a joint probability table.

    Parameters
    ----------
    m : array-like, shape (n_registers, n_genres)
        Joint probabilities; rows index registers, columns index genres.

    Returns
    -------
    tuple of float
        ``(ent_reg, ent_gen, ent_reg_gen)``: entropy of the register marginal
        (row sums), of the genre marginal (column sums) and of the joint
        distribution, all computed with log base 2.
    """
    m = np.asarray(m, dtype=np.float64)

    def _entropy(p):
        # Zero-probability outcomes contribute nothing (0 * log 0 := 0).
        # Without this filter 0 * log2(0) evaluates to NaN.
        p = p[p > 0]
        # "+ 0.0" normalises a possible -0.0 to 0.0.
        return float(-np.sum(p * np.log2(p))) + 0.0

    ent_reg = _entropy(m.sum(axis=1))
    ent_gen = _entropy(m.sum(axis=0))
    ent_reg_gen = _entropy(m.ravel())

    return ent_reg, ent_gen, ent_reg_gen


def mutual_info(m):
    """Mutual information between the row and column variables of ``m``.

    ``m`` is a 2-D joint probability table. The result uses the natural
    logarithm (``np.log``), i.e. it is expressed in nats. Cells equal to zero
    are skipped.
    """
    row_marginal = m.sum(axis=1)
    col_marginal = m.sum(axis=0)

    total = 0
    for r, row in enumerate(m):
        for c, joint in enumerate(row):
            if joint == 0:
                continue
            independent = row_marginal[r] * col_marginal[c]
            total += joint * np.log(joint / independent)
    return total
            
    
    
#print(entropy(m))
# mutual_info uses np.log (natural logarithm), so the printed value is in nats.
print(mutual_info(m))

0.10867582104214793


In [10]:
# Cross-check of the hand-written mutual information using scikit-learn
# (easier to explain in the paper).

from sklearn.feature_selection import mutual_info_classif
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()

# Integer-encode both label columns; the encoder is re-fitted for each column.
for encoded_name, source_name in (
    ('feature1_encoded', 'combined'),
    ('feature2_encoded', 'genre_prediction'),
):
    df[encoded_name] = le.fit_transform(df[source_name])

# Registers act as the single discrete feature, genres as the target.
X = df[['feature1_encoded']]
y = df['feature2_encoded']

# discrete_features=True: treat the encoded registers as categories.
mi = mutual_info_classif(X, y, discrete_features=True)

print(f'Mutual Information between feature1 and feature2: {mi[0]}')



Mutual Information between feature1 and feature2: 0.10867582104214837


## Results in nats (natural logarithm, log base e)

For full data with explode = True: 

| register entropy | genre entropy | combined entropy |
|:--------:|:--------:|:--------:|
| 1.4490346714406952 | 1.5454385825583463 | 2.9342604164030393 |

For full data with explode = False (all multilabels removed):

| register entropy | genre entropy | combined entropy |
|:--------:|:--------:|:--------:|
| 1.3558207979577448 | 1.5650504056294658 | 2.8449235031620446 |


## Results in bits

| register entropy | genre entropy | combined entropy |
|:--------:|:--------:|:--------:|
| 2.0905151345636597 | 2.2295965790553947 | 4.2332429514214525|

## With subregisters
explode = True

| register entropy | genre entropy | combined entropy |
|:--------:|:--------:|:--------:|
|3.319881309280197 | 2.227648279045393 | 5.40375038782302|


With NA-sr, SP-it, SP setup:

| register entropy | genre entropy | combined entropy |
|:--------:|:--------:|:--------:|
|3.3704736853134243 | 2.2294419538485077 | 5.44312957107989|
