# Load Dataset

In [30]:
import pandas as pd
from io import StringIO
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer
from datasets import Dataset


In [2]:
# The dataset is stored as JSON Lines: one JSON record per line of the file
data_path = "../data/jutsus.jsonl"
df = pd.read_json(path_or_buf=data_path, lines=True)

# Peek at the first rows to confirm the load worked
df.head()


# import pandas as pd
# import json

# # Manually read the file and parse each line into a list of dictionaries
# data_path = "../data/jutsus.jsonl"
# with open(data_path, 'r') as file:
#     data = [json.loads(line) for line in file if line.strip()]  # Skip empty lines

# # Convert the list of dictionaries into a DataFrame
# df = pd.DataFrame(data)

# # Display the first few rows
# df.head()


Unnamed: 0,jutsu_name,jutsu_type,jutsu_description
0,Azure Stream,"Hiden, Ninjutsu, Kenjutsu",Suigetsu creates a moving body of water beneat...
1,Asura Attack,"Kekkei Genkai, Ninjutsu, Dōjutsu","With the body modifications of the Asura Path,..."
2,100% Single Punch,Taijutsu,Tsunade gathers large amounts of chakra in her...
3,10 Hit Combo,Taijutsu,Lars punches the opponent before striking them...
4,Attack of the Twin Demons,"Kekkei Genkai, Ninjutsu",This technique allows Ukon to inhabit his brot...


In [3]:
# data_path = "../data/jutsus.jsonl"

# with open(data_path, 'r') as file:
#     lines = file.readlines()
#     for line in lines[:5]:  # Print the first 5 lines to inspect
#      print(line)


In [4]:
def simplify_justsu(jutsu):
    """Collapse a raw jutsu type value into one coarse class.

    The classes are checked in a fixed priority order (Genjutsu, then
    Ninjutsu, then Taijutsu) and the first one contained in ``jutsu`` wins.

    Args:
        jutsu: Raw type value, e.g. ``"Hiden, Ninjutsu, Kenjutsu"``. Any
            object supporting ``in`` works; strings are matched by substring.

    Returns:
        ``"Genjutsu"``, ``"Ninjutsu"`` or ``"Taijutsu"``, or ``None`` when no
        class matches or the value is missing (``None`` / ``NaN``).
    """
    for category in ("Genjutsu", "Ninjutsu", "Taijutsu"):
        try:
            if category in jutsu:
                return category
        except TypeError:
            # Missing cells (None / float NaN) do not support ``in``; treat
            # them as unclassified instead of aborting the whole ``apply``.
            return None
    # Nothing matched: make the "unclassified" result explicit.
    return None

In [5]:
# Reduce each technique's raw type string to one simplified class
# (None when simplify_justsu finds no match).
df['jutsu_type_Simplified'] = df['jutsu_type'].apply(simplify_justsu)
df

Unnamed: 0,jutsu_name,jutsu_type,jutsu_description,jutsu_type_Simplified
0,Azure Stream,"Hiden, Ninjutsu, Kenjutsu",Suigetsu creates a moving body of water beneat...,Ninjutsu
1,Asura Attack,"Kekkei Genkai, Ninjutsu, Dōjutsu","With the body modifications of the Asura Path,...",Ninjutsu
2,100% Single Punch,Taijutsu,Tsunade gathers large amounts of chakra in her...,Taijutsu
3,10 Hit Combo,Taijutsu,Lars punches the opponent before striking them...,Taijutsu
4,Attack of the Twin Demons,"Kekkei Genkai, Ninjutsu",This technique allows Ukon to inhabit his brot...,Ninjutsu
...,...,...,...,...
2920,Absorption Sphere,Ninjutsu,"Using the Jutsu Absorption Arm, the user creat...",Ninjutsu
2921,Absolute: Fang Passing Fang,"Taijutsu, Collaboration Techniques",Kiba and Akamaru perform the Fang Passing Fang...,Taijutsu
2922,1000 Metre Punch,Taijutsu,The user focuses a large amount of chakra into...,Taijutsu
2923,16 Hit Combo,Taijutsu,"A very effective move, Ino uses this as a quic...",Taijutsu


In [6]:
# Class distribution of the simplified labels: the dataset is skewed
df['jutsu_type_Simplified'].value_counts()


jutsu_type_Simplified
Ninjutsu    2258
Taijutsu     398
Genjutsu     101
Name: count, dtype: int64

In [7]:
# Model input: technique name and description joined into a single string.
df['text'] = df['jutsu_name'] + ". " + df['jutsu_description']
# Target: the simplified jutsu class.
df['jutsu'] = df['jutsu_type_Simplified']
# Keep only the two modelling columns and discard rows missing either one
# (e.g. a type that matched none of the simplified classes).
df = df[['text', 'jutsu']].dropna()

In [8]:
# Preview the reduced frame (text + jutsu columns only).
df.head()

Unnamed: 0,text,jutsu
0,Azure Stream. Suigetsu creates a moving body o...,Ninjutsu
1,Asura Attack. With the body modifications of t...,Ninjutsu
2,100% Single Punch. Tsunade gathers large amoun...,Taijutsu
3,10 Hit Combo. Lars punches the opponent before...,Taijutsu
4,Attack of the Twin Demons. This technique allo...,Ninjutsu


In [9]:
#cleaning purpose

from bs4 import BeautifulSoup
class Cleaner():
    """Strip HTML markup from a piece of text while keeping paragraph breaks."""

    def __init__(self):
        pass

    def put_line_breaks(self, text):
        """Insert a newline after every closing paragraph tag.

        Done before the tags are stripped so that consecutive paragraphs do
        not get glued together in the extracted text.

        Args:
            text: Raw (possibly HTML) string.

        Returns:
            ``text`` with a newline appended after each ``</p>``.
        """
        # The closing tag uses a forward slash; a backslash here is both an
        # invalid string escape and a pattern that never occurs in HTML.
        return text.replace("</p>", "</p>\n")

    def remove_html_tags(self, text):
        """Return the text content of ``text`` as parsed by BeautifulSoup (lxml parser)."""
        clean_text = BeautifulSoup(text, "lxml").text
        return clean_text

    def clean(self, text):
        """Run the full pipeline: add paragraph breaks, strip tags, trim whitespace.

        Args:
            text: Raw (possibly HTML) string.

        Returns:
            The tag-free text with leading/trailing whitespace removed.
        """
        text = self.put_line_breaks(text)
        text = self.remove_html_tags(text)
        text = text.strip()
        return text

In [10]:
# Names of the column holding the raw text (cleaned in a later cell) and the
# column holding the class label (encoded in a later cell).
text_column_name = 'text'
label_column_name = 'jutsu'


In [11]:
# Quick look at the frame before cleaning.
df.head(3)

Unnamed: 0,text,jutsu
0,Azure Stream. Suigetsu creates a moving body o...,Ninjutsu
1,Asura Attack. With the body modifications of t...,Ninjutsu
2,100% Single Punch. Tsunade gathers large amoun...,Taijutsu


In [12]:
# Clean Text
# Run every row's text through Cleaner.clean and store the result in a new
# 'text_cleaned' column; the source column is left as is.
cleaner = Cleaner()
df['text_cleaned'] = df[text_column_name].apply(cleaner.clean)

  clean_text = BeautifulSoup(text, "lxml").text


In [13]:
# Confirm the 'text_cleaned' column was added.
df.head(3)

Unnamed: 0,text,jutsu,text_cleaned
0,Azure Stream. Suigetsu creates a moving body o...,Ninjutsu,Azure Stream. Suigetsu creates a moving body o...
1,Asura Attack. With the body modifications of t...,Ninjutsu,Asura Attack. With the body modifications of t...
2,100% Single Punch. Tsunade gathers large amoun...,Taijutsu,100% Single Punch. Tsunade gathers large amoun...


In [14]:
# Encode Labels
# Fit a LabelEncoder on the label column so each class name gets an integer id.
le = preprocessing.LabelEncoder()
le.fit(df[label_column_name].tolist())

In [15]:
# Map each integer class id to its label name. Read the fitted classes through
# the encoder's public `classes_` attribute rather than its internal __dict__.
label_dict = dict(enumerate(le.classes_.tolist()))
label_dict

{0: 'Genjutsu', 1: 'Ninjutsu', 2: 'Taijutsu'}

In [16]:
# Store the integer-encoded class id of every row in a new 'label' column.
df['label'] = le.transform(df[label_column_name].tolist())

In [22]:
# Spot-check a few random rows (no seed is passed, so the rows differ between runs).
df.sample(4)

Unnamed: 0,text,jutsu,text_cleaned,label
2229,Fire Release: Big Flame Bullet. This technique...,Ninjutsu,Fire Release: Big Flame Bullet. This technique...,1
16,Cat Genjutsu. The user uses a genjutsu where t...,Genjutsu,Cat Genjutsu. The user uses a genjutsu where t...,0
1799,Lava Release: Scorching Flow Peak Rock. Rōshi ...,Ninjutsu,Lava Release: Scorching Flow Peak Rock. Rōshi ...,1
835,Truth-Seeking: Pure Shadow. Obito encases hims...,Ninjutsu,Truth-Seeking: Pure Shadow. Obito encases hims...,1


In [24]:
# Training and testing split.
# Stratify on the label so both splits keep the (skewed) class proportions,
# and pin random_state so the split is identical after a kernel restart.
test_size = 0.2
df_train, df_test = train_test_split(df,
                                     test_size=test_size,
                                     stratify=df['label'],
                                     random_state=42)

In [26]:
# Check the class balance of the training split.
df_train['jutsu'].value_counts()

jutsu
Ninjutsu    1806
Taijutsu     318
Genjutsu      81
Name: count, dtype: int64

In [28]:
# Pretrained checkpoint name handed to AutoTokenizer.from_pretrained.
model_name = "distilbert/distilbert-base-uncased"

In [31]:
# Load the tokenizer for the checkpoint named in model_name.
tokenizer = AutoTokenizer.from_pretrained(model_name)

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [32]:
def preprocess_function(tokenizer, examples):
    """Tokenize the cleaned text of a batch of examples.

    Args:
        tokenizer: Callable invoked as ``tokenizer(texts, truncation=True)``.
        examples: Mapping whose ``'text_cleaned'`` entry holds the text(s).

    Returns:
        Whatever ``tokenizer`` returns for that input.
    """
    cleaned_texts = examples['text_cleaned']
    encoded = tokenizer(cleaned_texts, truncation=True)
    return encoded

In [33]:
# Convert the pandas splits into Hugging Face datasets
train_dataset = Dataset.from_pandas(df_train)
test_dataset = Dataset.from_pandas(df_test)


# Tokenize both splits in batches with the shared tokenizer
def _tokenize_batch(examples):
    """Bind the notebook's tokenizer so `map` can pass just the batch."""
    return preprocess_function(tokenizer, examples)


tokenized_train = train_dataset.map(_tokenize_batch, batched=True)
tokenized_test = test_dataset.map(_tokenize_batch, batched=True)

Map: 100%|██████████| 2205/2205 [00:00<00:00, 10744.92 examples/s]
Map: 100%|██████████| 552/552 [00:00<00:00, 10776.20 examples/s]
