In [49]:
import pandas as pd
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer
from datasets import Dataset

## Load our crawled dataset :

In [10]:
dataset_path = "../data/crawled_data/jutsus.jsonl"
df = pd.read_json(dataset_path , lines = True)
df.head()

Unnamed: 0,jutsu_name,jutsu_type,jutsu_description
0,10 Hit Combo,Taijutsu,Lars punches the opponent before striking them...
1,Amaterasu,"Kekkei Genkai, Ninjutsu, Dōjutsu",This article is about the ninjutsu Amaterasu. ...
2,Amaterasu: Wailing Sky,"Kekkei Genkai, Dōjutsu, Ninjutsu","Using Susanoo, Indra sends Amaterasu to the sk..."
3,Amenominaka,"Kekkei Mōra, Ninjutsu, Space–Time Ninjutsu, Dō...","Using her Rinne Sharingan, Kaguya Ōtsutsuki re..."
4,Amaterasu: Flame Wrapping Fire,"Kekkei Genkai, Ninjutsu, Dōjutsu",The manipulated flames of Amaterasu are used t...


In [11]:
def simplify_jutsu(jutsu):
    if "Taijutsu" in jutsu:
        return "Taijutsu"
    if "Ninjutsu" in jutsu:
        return "Ninjutsu"
    if "Genjutsu" in jutsu:
        return "Genjutsu"

In [12]:
df['jutsu_type_simplified'] = df['jutsu_type'].apply(simplify_jutsu)
df.head()

Unnamed: 0,jutsu_name,jutsu_type,jutsu_description,jutsu_type_simplified
0,10 Hit Combo,Taijutsu,Lars punches the opponent before striking them...,Taijutsu
1,Amaterasu,"Kekkei Genkai, Ninjutsu, Dōjutsu",This article is about the ninjutsu Amaterasu. ...,Ninjutsu
2,Amaterasu: Wailing Sky,"Kekkei Genkai, Dōjutsu, Ninjutsu","Using Susanoo, Indra sends Amaterasu to the sk...",Ninjutsu
3,Amenominaka,"Kekkei Mōra, Ninjutsu, Space–Time Ninjutsu, Dō...","Using her Rinne Sharingan, Kaguya Ōtsutsuki re...",Ninjutsu
4,Amaterasu: Flame Wrapping Fire,"Kekkei Genkai, Ninjutsu, Dōjutsu",The manipulated flames of Amaterasu are used t...,Ninjutsu


In [13]:
df['jutsu_type_simplified'].value_counts()

jutsu_type_simplified
Ninjutsu    2044
Taijutsu     637
Genjutsu      82
Name: count, dtype: int64

### - Skewed dataset !

In [14]:
df['text'] = df['jutsu_name'] + ". " + df['jutsu_description']
df['jutsus'] = df['jutsu_type_simplified']
df = df[['text', 'jutsus']]
df = df.dropna()

In [15]:
df.head()

Unnamed: 0,text,jutsus
0,10 Hit Combo. Lars punches the opponent before...,Taijutsu
1,Amaterasu. This article is about the ninjutsu ...,Ninjutsu
2,"Amaterasu: Wailing Sky. Using Susanoo, Indra s...",Ninjutsu
3,"Amenominaka. Using her Rinne Sharingan, Kaguya...",Ninjutsu
4,Amaterasu: Flame Wrapping Fire. The manipulate...,Ninjutsu


### some cleaning text column :

In [16]:
from bs4 import BeautifulSoup
class Cleaner():
    def __init__(self):
        pass 
    
    def put_line_breaks(self, text):
        return text.replace("<\p>", "<\p>\n")
    
    def remove_html_tags(self, text):
        clean_text = BeautifulSoup(text, "lxml").text
        return clean_text

    def clean(self, text):
        text = self.put_line_breaks(text)
        text = self.remove_html_tags(text)
        text = text.strip()
        return text

  return text.replace("<\p>", "<\p>\n")
  return text.replace("<\p>", "<\p>\n")


In [29]:
text_column_name = 'text'
label_column_name = 'jutsus'

In [31]:
cleaner = Cleaner()
df['text_cleaned'] = df[text_column_name].apply(cleaner.clean)
df.head()


  clean_text = BeautifulSoup(text, "lxml").text


Unnamed: 0,text,jutsus,text_cleaned
0,10 Hit Combo. Lars punches the opponent before...,Taijutsu,10 Hit Combo. Lars punches the opponent before...
1,Amaterasu. This article is about the ninjutsu ...,Ninjutsu,Amaterasu. This article is about the ninjutsu ...
2,"Amaterasu: Wailing Sky. Using Susanoo, Indra s...",Ninjutsu,"Amaterasu: Wailing Sky. Using Susanoo, Indra s..."
3,"Amenominaka. Using her Rinne Sharingan, Kaguya...",Ninjutsu,"Amenominaka. Using her Rinne Sharingan, Kaguya..."
4,Amaterasu: Flame Wrapping Fire. The manipulate...,Ninjutsu,Amaterasu: Flame Wrapping Fire. The manipulate...


### Encode Label  :

In [35]:
le = preprocessing.LabelEncoder()
le.fit(df[label_column_name].tolist())

In [34]:
df[label_column_name]

0       Taijutsu
1       Ninjutsu
2       Ninjutsu
3       Ninjutsu
4       Ninjutsu
          ...   
2926    Ninjutsu
2927    Taijutsu
2928    Taijutsu
2929    Taijutsu
2930    Ninjutsu
Name: jutsus, Length: 2763, dtype: object

In [36]:
label_dict = {index:label_name for index, label_name in enumerate(le.__dict__['classes_'].tolist())}
label_dict

{0: 'Genjutsu', 1: 'Ninjutsu', 2: 'Taijutsu'}

In [37]:
df['label'] = le.transform(df[label_column_name].tolist())
df.head()

Unnamed: 0,text,jutsus,text_cleaned,label
0,10 Hit Combo. Lars punches the opponent before...,Taijutsu,10 Hit Combo. Lars punches the opponent before...,2
1,Amaterasu. This article is about the ninjutsu ...,Ninjutsu,Amaterasu. This article is about the ninjutsu ...,1
2,"Amaterasu: Wailing Sky. Using Susanoo, Indra s...",Ninjutsu,"Amaterasu: Wailing Sky. Using Susanoo, Indra s...",1
3,"Amenominaka. Using her Rinne Sharingan, Kaguya...",Ninjutsu,"Amenominaka. Using her Rinne Sharingan, Kaguya...",1
4,Amaterasu: Flame Wrapping Fire. The manipulate...,Ninjutsu,Amaterasu: Flame Wrapping Fire. The manipulate...,1


In [39]:
df_train , df_test = train_test_split(
    df,
    test_size =.2,
    random_state = 42,
    stratify=df['label']
)

####    by adding  stratify :  ensure that the training and testing sets have approximately the same distribution of classes as the original dataset.

In [42]:
df['jutsus'].value_counts()

jutsus
Ninjutsu    2044
Taijutsu     637
Genjutsu      82
Name: count, dtype: int64

In [41]:
df_train['jutsus'].value_counts()

jutsus
Ninjutsu    1635
Taijutsu     509
Genjutsu      66
Name: count, dtype: int64

In [43]:
model_name = "distilbert/distilbert-base-uncased"

In [46]:
# Load the tokenizer for the  model  
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [47]:
def preprocess_function(tokenizer,examples):
    return tokenizer(examples['text_cleaned'],truncation=True)

In [50]:
# Conver Pandas to a hugging face dataset
train_dataset = Dataset.from_pandas(df_train)
test_dataset = Dataset.from_pandas(df_test)

In [51]:
# tokenize the dataset
tokenized_train = train_dataset.map(lambda examples: preprocess_function(tokenizer, examples),
                                    batched=True)
tokenized_test = test_dataset.map(lambda examples: preprocess_function(tokenizer, examples),
                                                                      batched=True)

Map: 100%|██████████| 2210/2210 [00:00<00:00, 3295.16 examples/s]
Map: 100%|██████████| 553/553 [00:00<00:00, 3425.92 examples/s]
