In [2]:
import pandas as pd
from collections import Counter
import os
from pathlib import Path
import numpy as np
import json
# Root directory for the dataset files read and written by this notebook.
d_path = Path("data")

In [9]:
# Re-serialize every processed JSON file in place with 4-space indentation.
# Only *.json entries are visited so stray files or sub-directories in the
# folder are not fed to json.load; sorting makes the order deterministic.
for json_path in sorted((d_path / "processed").glob("*.json")):
    with open(json_path, "r") as src:
        arr = json.load(src)

    # The content is fully loaded before the file is reopened (and thereby
    # truncated) for writing.
    with open(json_path, "w") as dst:
        json.dump(arr, dst, indent=4)

In [3]:
# Read the processed "ae" test file back into plain Python objects.
ae_test_file = d_path / "processed" / "ae_processed_test.json"
with ae_test_file.open("r") as handle:
    arr = json.load(handle)

In [7]:
# Save the first four entries of `arr` as a small example input file.
example_path = d_path / "example_input.json"
with example_path.open("w") as handle:
    json.dump(arr[:4], handle, indent=4)

In [2]:
# List the data directory once, sorted: os.listdir returns entries in
# arbitrary order, but later cells zip these three lists together and label
# the resulting groups by position, so the order must be stable and must line
# up across splits.
# NOTE(review): alignment assumes the three files of a group share a filename
# prefix that sorts the same way in every split — confirm against the data dir.
data_files = sorted(os.listdir(d_path))
trains = [f for f in data_files if f.endswith("train.json")]
vals = [f for f in data_files if f.endswith("validation.json")]
tests = [f for f in data_files if f.endswith("test.json")]

In [3]:
# For each (train, validation, test) file triple, collect the set of values
# found in the "category" column across the three files.
cats = []
for split_files in zip(trains, vals, tests):
    frames = [pd.read_json(d_path / name) for name in split_files]
    combined = pd.concat(frames)
    cats.append(set(combined["category"]))

# NOTE(review): labels are attached by position, so this assumes the file
# lists are ordered ae, lp, sc — confirm against the directory listing.
sub_cats = {label: cats[pos] for pos, label in enumerate(("ae", "lp", "sc"))}

In [15]:
# Build one frame per split from all matching files, reading "id" as a string.
# ignore_index=True assigns fresh, unique row labels; without it every file
# keeps its own labels, so (when each file has its own 0-based index) a lookup
# like .loc[0] matches one row per file instead of a single row.
train_df = pd.concat(
    [pd.read_json(d_path / f, dtype={"id": str}) for f in trains],
    ignore_index=True,
)
val_df = pd.concat(
    [pd.read_json(d_path / f, dtype={"id": str}) for f in vals],
    ignore_index=True,
)
test_df = pd.concat(
    [pd.read_json(d_path / f, dtype={"id": str}) for f in tests],
    ignore_index=True,
)

In [37]:
# Display the group label -> category set mapping built above.
sub_cats

{'ae': {'anime',
  'boardgames',
  'gaming',
  'literature',
  'movies',
  'music',
  'musicfans',
  'rpg',
  'scifi',
  'sound'},
 'lp': {'bicycles',
  'cooking',
  'diy',
  'fitness',
  'freelancing',
  'gardening',
  'health',
  'lifehacks',
  'martialarts',
  'outdoors',
  'parenting',
  'pets',
  'sports',
  'sustainability',
  'travel',
  'woodworking',
  'workplace',
  'writers'},
 'sc': {'academia',
  'buddhism',
  'christianity',
  'english',
  'expatriates',
  'genealogy',
  'hermeneutics',
  'hinduism',
  'history',
  'interpersonal',
  'islam',
  'judaism',
  'law',
  'linguistics',
  'money',
  'philosophy',
  'politics',
  'skeptics',
  'vegetarianism'}}

In [5]:
# Show the profile of the first training row. Positional access is used
# because row labels can repeat after concatenating several files, in which
# case a label lookup for 0 would return more than one row.
train_df.iloc[0]["profile"]

[{'category': 'movies',
  'id': '0_0_0_0',
  'text': "What happened to the natives? In Aguirre: The Wrath of God, two natives approach the raft, and are taken aboard.  After the priest on the raft attempts to convert them by showing them the bible and saying that it is the word of God, one of the natives tries to listen to it, and complains he can't hear anything.  This makes the people on the raft upset, and they gather around, yelling at the natives, who crouch down, presumably in fright.Then, in the next scene, they are gone for good.  Were we supposed to infer that they were killed?  The movie didn't shy away from showing other deaths.  But it's not obvious that they could have gone anywhere.Did I miss something?"},
 {'category': 'anime',
  'id': '0_0_0_1',
  'text': "Do we know what happens to Spike at the end of Cowboy Bebop? At the end of the last episode of Cowboy Bebop, Spike collapses.  It's not clear, but it looks as though he may be dead.  Is there a way to know whether he 

In [17]:
def add_subcats(row, sub_cats):
    """Map each post in ``row["profile"]`` to its group label(s).

    Args:
        row: Mapping-like record whose "profile" entry is an iterable of
            posts, each exposing a "category" key.
        sub_cats: Mapping with the keys "ae", "lp" and "sc", each holding a
            container of category names.

    Returns:
        A list with one label per (post, matching group) pair, in profile
        order and, within a post, in the order "ae", "lp", "sc". Posts whose
        category belongs to no group contribute nothing.
    """
    return [
        group
        for post in row["profile"]
        for group in ("ae", "lp", "sc")
        if post["category"] in sub_cats[group]
    ]

# Tag every row of each split with the group labels of its profile posts.
for split_df in (train_df, val_df, test_df):
    split_df["category_array"] = split_df.apply(add_subcats, axis=1, args=(sub_cats,))

# Row counts per split.
len(train_df), len(val_df), len(test_df)

(24333, 2503, 2830)

In [18]:
def check_3_categories(row):
    """Return True when the row's labels are exactly "ae", "lp" and "sc".

    Args:
        row: Mapping-like record with a "category_array" iterable of labels.

    Returns:
        bool: True only if the distinct labels equal {"ae", "lp", "sc"};
        a missing or an additional label yields False.
    """
    required = {"ae", "lp", "sc"}
    seen = set(row["category_array"])
    return seen == required

# Keep only rows whose profile covers all three groups. The explicit .copy()
# makes each result an independent frame, so adding columns to it in later
# cells does not trigger pandas' SettingWithCopyWarning.
train_df = train_df[train_df.apply(check_3_categories, axis=1)].copy()
val_df = val_df[val_df.apply(check_3_categories, axis=1)].copy()
test_df = test_df[test_df.apply(check_3_categories, axis=1)].copy()

# Row counts per split after filtering.
len(train_df), len(val_df), len(test_df)

(11077, 1113, 1351)

In [19]:
def entropy(category_array):
    """Base-2 Shannon entropy of the label frequencies in ``category_array``.

    Args:
        category_array: Iterable of hashable labels.

    Returns:
        The value of ``-sum(p * log2(p))`` over the relative frequency ``p``
        of each distinct label.
    """
    frequencies = Counter(category_array)
    counts = np.array(list(frequencies.values()))
    total = counts.sum()
    probs = counts / total
    weighted = probs * np.log2(probs)
    return -np.sum(weighted)

# Store, for every row of each split, the entropy of its group labels.
for split_df in (train_df, val_df, test_df):
    split_df["entropy"] = split_df["category_array"].apply(entropy)

In [22]:
# Remove the helper column. Reassigning (instead of inplace=True) avoids
# mutating a filtered frame in place, and errors="ignore" lets this cell be
# re-run after the column is already gone instead of raising KeyError.
train_df = train_df.drop(columns=["category_array"], errors="ignore")
val_df = val_df.drop(columns=["category_array"], errors="ignore")
test_df = test_df.drop(columns=["category_array"], errors="ignore")

In [24]:
# Split each frame by the group its own "category" value belongs to,
# producing one frame per (group, split) pair.
ae_train, lp_train, sc_train = (
    train_df[train_df["category"].isin(sub_cats[group])] for group in ("ae", "lp", "sc")
)

ae_val, lp_val, sc_val = (
    val_df[val_df["category"].isin(sub_cats[group])] for group in ("ae", "lp", "sc")
)

ae_test, lp_test, sc_test = (
    test_df[test_df["category"].isin(sub_cats[group])] for group in ("ae", "lp", "sc")
)

In [33]:
# Write every (group, split) frame to
# data/processed/<group>_processed_<split>.json as a list of records.
processed_dir = d_path / "processed"
# Create the output folder if needed so the first run on a fresh checkout
# does not fail on a missing directory.
processed_dir.mkdir(parents=True, exist_ok=True)

frames_by_group = {
    "ae": (ae_train, ae_val, ae_test),
    "lp": (lp_train, lp_val, lp_test),
    "sc": (sc_train, sc_val, sc_test),
}
for group, group_frames in frames_by_group.items():
    for split_name, frame in zip(("train", "validation", "test"), group_frames):
        frame.to_json(
            processed_dir / f"{group}_processed_{split_name}.json",
            orient="records",
            indent=2,
        )

In [34]:
# Take the first ten rows of the "ae" test frame as a small sample.
t = ae_test.head(10)

In [35]:
# Preview the first rows of the sample frame.
t.head()

Unnamed: 0,id,question,profile,rubric_aspects,narrative,category,entropy
0,0_2_0,Are the two music notes a reference?,"[{'category': 'anime', 'id': '0_2_0_0', 'text'...",[{'aspect': 'Visual appearance of the music no...,"In episode 2 of Idolish7, 11:08 the following ...",anime,0.725376
4,0_2_4,How come Neiru hasn't revived her sister yet?,"[{'category': 'skeptics', 'id': '0_2_4_0', 'te...",[{'aspect': 'Neiru's head start with the Wonde...,"In Wonder Egg Priority, when Neiru is first in...",anime,1.077291
9,0_2_9,Was Saitama sweating with worry when he saw Ga...,"[{'category': 'anime', 'id': '0_2_9_0', 'text'...",[{'aspect': 'Interpretation of Saitama's facia...,"In One Punch Man manga chapter #159, Saitama a...",anime,0.687355
11,0_2_11,In the 3rd game when Eva calls Hideyoshi as a ...,"[{'category': 'law', 'id': '0_2_11_0', 'text':...",[{'aspect': 'Purpose of calling a 'witness' fr...,"In the anime (episode 18, part of the 3rd arc ...",anime,1.2798
12,0_2_12,Why does Miyuki lose when Miyuki is hardly in ...,"[{'category': 'academia', 'id': '0_2_12_0', 't...","[{'aspect': 'Miyuki's Loss Confirmation', 'evi...",S01E08 story of the last sub-episode - Miyuki ...,anime,1.2798


In [36]:
# Write the sample frame `t` to its own records-oriented JSON file.
sample_path = d_path / "processed" / "test.json"
t.to_json(sample_path, orient="records", indent=2)