In [None]:
import pandas as pd
from bs4 import BeautifulSoup
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from transformers import  AutoTokenizer
from datasets import Dataset

In [2]:
data_path = '../data/jutsus.jsonl'
df = pd.read_json(data_path, lines=True)
df.head()

Unnamed: 0,jutsu_name,jutsu_type,jutsu_description
0,Adamantine Sealing Chains: Spiral Formation,"Hiden, Ninjutsu, Fūinjutsu, Barrier Ninjutsu, ...",Kushina uses her chains to form a barrier whil...
1,Adamantine Power: Acala,"Kekkei Genkai, Ninjutsu, Taijutsu",Hashirama kicks the opponent away and raises s...
2,Adamantine Prison Wall,"Ninjutsu, Clone Techniques, Bukijutsu","After using Transformation: Adamantine Staff, ..."
3,Adamantine Seal: Monkey Yang Suppression,"Ninjutsu, Fūinjutsu, Cooperation Ninjutsu","After placing fūinjutsu tags in an area, the u..."
4,Acrobat,"Taijutsu, Kenjutsu","The Acrobat (荒繰鷺伐刀, Akurobatto) is a kenjutsu ..."


In [10]:
def get_jutsu_type(jutsu_type):
    if 'Ninjutsu' in jutsu_type:
        return 'Ninjutsu'
    if 'Taijutsu' in jutsu_type:
        return 'Taijutsu'
    if 'Genjutsu' in jutsu_type:
        return 'Genjutsu'
    return None

In [11]:
df['jutsu_type_simplefied'] = df['jutsu_type'].apply(get_jutsu_type)
df.head()

Unnamed: 0,jutsu_name,jutsu_type,jutsu_description,jutsu_type_simplefied
0,Adamantine Sealing Chains: Spiral Formation,"Hiden, Ninjutsu, Fūinjutsu, Barrier Ninjutsu, ...",Kushina uses her chains to form a barrier whil...,Ninjutsu
1,Adamantine Power: Acala,"Kekkei Genkai, Ninjutsu, Taijutsu",Hashirama kicks the opponent away and raises s...,Ninjutsu
2,Adamantine Prison Wall,"Ninjutsu, Clone Techniques, Bukijutsu","After using Transformation: Adamantine Staff, ...",Ninjutsu
3,Adamantine Seal: Monkey Yang Suppression,"Ninjutsu, Fūinjutsu, Cooperation Ninjutsu","After placing fūinjutsu tags in an area, the u...",Ninjutsu
4,Acrobat,"Taijutsu, Kenjutsu","The Acrobat (荒繰鷺伐刀, Akurobatto) is a kenjutsu ...",Taijutsu


In [17]:
df.dropna(inplace=True)
df['jutsu_text'] = df['jutsu_name'] + df['jutsu_description']
df.head()

Unnamed: 0,jutsu_name,jutsu_type,jutsu_description,jutsu_type_simplefied,jutsu_text
0,Adamantine Sealing Chains: Spiral Formation,"Hiden, Ninjutsu, Fūinjutsu, Barrier Ninjutsu, ...",Kushina uses her chains to form a barrier whil...,Ninjutsu,Adamantine Sealing Chains: Spiral FormationKus...
1,Adamantine Power: Acala,"Kekkei Genkai, Ninjutsu, Taijutsu",Hashirama kicks the opponent away and raises s...,Ninjutsu,Adamantine Power: AcalaHashirama kicks the opp...
2,Adamantine Prison Wall,"Ninjutsu, Clone Techniques, Bukijutsu","After using Transformation: Adamantine Staff, ...",Ninjutsu,Adamantine Prison WallAfter using Transformati...
3,Adamantine Seal: Monkey Yang Suppression,"Ninjutsu, Fūinjutsu, Cooperation Ninjutsu","After placing fūinjutsu tags in an area, the u...",Ninjutsu,Adamantine Seal: Monkey Yang SuppressionAfter ...
4,Acrobat,"Taijutsu, Kenjutsu","The Acrobat (荒繰鷺伐刀, Akurobatto) is a kenjutsu ...",Taijutsu,"AcrobatThe Acrobat (荒繰鷺伐刀, Akurobatto) is a ke..."


In [18]:
df_dataset = df[['jutsu_text', 'jutsu_type_simplefied']]
df_dataset.head()

Unnamed: 0,jutsu_text,jutsu_type_simplefied
0,Adamantine Sealing Chains: Spiral FormationKus...,Ninjutsu
1,Adamantine Power: AcalaHashirama kicks the opp...,Ninjutsu
2,Adamantine Prison WallAfter using Transformati...,Ninjutsu
3,Adamantine Seal: Monkey Yang SuppressionAfter ...,Ninjutsu
4,"AcrobatThe Acrobat (荒繰鷺伐刀, Akurobatto) is a ke...",Taijutsu


In [50]:
class Cleaner():

    def __init__(self):
        pass 

    def replace_paragraph(self, text: str):
        return text.replace('<\p>', '<\p>\n')
    
    def remove_html_tags(self, text: str):
        return BeautifulSoup(text, 'lxml').text
    
    def clean(self, text: str):
        text = self.replace_paragraph(text)
        text = self.remove_html_tags(text)
        return text.strip()

In [51]:
cleaner = Cleaner()
df_dataset['jutsu_text_cleaned'] = df_dataset['jutsu_text'].apply(cleaner.clean) 
df_dataset.head()

  return BeautifulSoup(text, 'lxml').text
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_dataset['jutsu_text_cleaned'] = df_dataset['jutsu_text'].apply(cleaner.clean)


Unnamed: 0,jutsu_text,jutsu_type_simplefied,labels,jutsu_text_cleaned
0,Adamantine Sealing Chains: Spiral FormationKus...,Ninjutsu,1,Adamantine Sealing Chains: Spiral FormationKus...
1,Adamantine Power: AcalaHashirama kicks the opp...,Ninjutsu,1,Adamantine Power: AcalaHashirama kicks the opp...
2,Adamantine Prison WallAfter using Transformati...,Ninjutsu,1,Adamantine Prison WallAfter using Transformati...
3,Adamantine Seal: Monkey Yang SuppressionAfter ...,Ninjutsu,1,Adamantine Seal: Monkey Yang SuppressionAfter ...
4,"AcrobatThe Acrobat (荒繰鷺伐刀, Akurobatto) is a ke...",Taijutsu,2,"AcrobatThe Acrobat (荒繰鷺伐刀, Akurobatto) is a ke..."


In [35]:
encoder = LabelEncoder()
encoder.fit(df_dataset['jutsu_type_simplefied'])
label_dict = {index: label for index, label in enumerate(encoder.classes_)}
df_dataset['labels'] = encoder.transform(df_dataset['jutsu_type_simplefied'])
df_dataset.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_dataset['labels'] = encoder.transform(df_dataset['jutsu_type_simplefied'])


Unnamed: 0,jutsu_text,jutsu_type_simplefied,labels
0,Adamantine Sealing Chains: Spiral FormationKus...,Ninjutsu,1
1,Adamantine Power: AcalaHashirama kicks the opp...,Ninjutsu,1
2,Adamantine Prison WallAfter using Transformati...,Ninjutsu,1
3,Adamantine Seal: Monkey Yang SuppressionAfter ...,Ninjutsu,1
4,"AcrobatThe Acrobat (荒繰鷺伐刀, Akurobatto) is a ke...",Taijutsu,2


In [37]:
text = 'jutsu_text'
label = 'labels'
dataset = df_dataset[[text, label]]
dataset.head()

Unnamed: 0,jutsu_text,labels
0,Adamantine Sealing Chains: Spiral FormationKus...,1
1,Adamantine Power: AcalaHashirama kicks the opp...,1
2,Adamantine Prison WallAfter using Transformati...,1
3,Adamantine Seal: Monkey Yang SuppressionAfter ...,1
4,"AcrobatThe Acrobat (荒繰鷺伐刀, Akurobatto) is a ke...",2


In [44]:
train_dataset, test_dataset = train_test_split(dataset, test_size=0.2, stratify=dataset[label])

In [49]:
train_dataset = Dataset.from_pandas(train_dataset)
test_dataset = Dataset.from_pandas(test_dataset)

In [None]:
tokenizer = AutoTokenizer.from_pretrained('distilbert/distilbert-base-uncased')

In [None]:
def tokenization(tokenizer, example):
    return tokenizer(example, padding="max_length", trancation=True)

train_dataset_tokenized = train_dataset.map(lambda x: tokenization(tokenizer, x[0]), batched=True)
test_dataset_tokenized = test_dataset.maps(lambda x: tokenization(tokenizer, x[0]), batched=True)

In [16]:
df.isna().sum()

jutsu_name               0
jutsu_type               0
jutsu_description        0
jutsu_type_simplefied    0
dtype: int64