# Pokémon dataset exploration

In [1]:
import nltk
import numpy as np
import pandas as pd
import re
import pickle
from nltk.corpus import stopwords
# from nltk.stem.porter import PorterStemmer
from nltk.stem import RSLPStemmer

In [2]:
df = pd.read_json("../datasets/pokedex-db.json")
df

Unnamed: 0,id,name,description,type-main,type-second
0,1,Bulbasaur,"While it is young, it uses the nutrients that ...",Grass,Poison
1,32,Nidoran-M,"Small but brave, this Pokémon will hold its gr...",Poison,
2,35,Clefairy,Its adorable behavior and cry make it highly p...,Fairy,
3,38,Ninetales,Very smart and very vengeful. Grabbing one of ...,Fire,
4,31,Nidoqueen,It pacifies offspring by placing them in the g...,Poison,Ground
...,...,...,...,...,...
900,5,Charmeleon,"If it becomes agitated during battle, it spout...",Fire,
901,4,Charmander,"From the time it is born, a flame burns at the...",Fire,
902,2,Ivysaur,Exposure to sunlight adds to its strength. Sun...,Grass,Poison
903,3,Venusaur,A bewitching aroma wafts from its flower. The ...,Grass,Poison


In [10]:
df_pokemon = df[["id", "name"]]

In [3]:
df.rename(columns = {'description':'text'}, inplace = True)

In [17]:
df

Unnamed: 0,id,name,text,type-main,type-second
0,1,Bulbasaur,"While it is young, it uses the nutrients that ...",Grass,Poison
1,32,Nidoran-M,"Small but brave, this Pokémon will hold its gr...",Poison,
2,35,Clefairy,Its adorable behavior and cry make it highly p...,Fairy,
3,38,Ninetales,Very smart and very vengeful. Grabbing one of ...,Fire,
4,31,Nidoqueen,It pacifies offspring by placing them in the g...,Poison,Ground
...,...,...,...,...,...
900,5,Charmeleon,"If it becomes agitated during battle, it spout...",Fire,
901,4,Charmander,"From the time it is born, a flame burns at the...",Fire,
902,2,Ivysaur,Exposure to sunlight adds to its strength. Sun...,Grass,Poison
903,3,Venusaur,A bewitching aroma wafts from its flower. The ...,Grass,Poison


In [4]:
def to_lower(text:str)-> str:
    """
    Get input sentence and returns all words as lowercase
    """
    return text.lower()

def remove_symbols(text:str)-> str:
    """
    Get the input text and replaces all symbols from english language for spaces.
    This procedure aims to later remove the extra spaces when tokenizing words.
    """
    # text = "".join([char for char in text if char not in string.punctuation])
    text = re.sub(r'([!"#$%&\'()*+,-./:;<=>?@\[\\\]^_`{|}~])', ' ', text)
    return text

def word_tokenize(text:str)->list:
    """
    Break the input string text into word tokens, removing spaces between them.
    """
    words = nltk.word_tokenize(text)
    return words

def remove_stopwords(words:list)->list:
    """
    Remove stopwords from english text from a list of words.
    """
    stop_words = stopwords.words('english')
    filtered_words = [word for word in words if word not in stop_words]
    return filtered_words

def remove_numbers(words:list)->list:
    """
    Remove tokens that are only composed by numbers, since they present almost none semantic information.
    """
    words = [w for w in words if not w.isdecimal()]
    return words

def apply_stemming(words:list)->list:
    """
    Apply a stemmer technique in the tokenized words.
    TODO: Implement other stemming techniques as option.
    """
    stemmer = RSLPStemmer()
    stemmed = [stemmer.stem(word) for word in words]
    return stemmed

def remove_single_letters(words:list)->list:
    """
    Similar to numbers, remove words with only one letter.
    """
    words = [w for w in words if len(w) > 1]
    return words

def process_sentence(text:str,
                     process_symbols:bool = True,
                     process_stopwords:bool = True,
                     process_numbers:bool = True,
                     process_single_letters:bool = True)->list:
    """
    Get a raw sentence and applies all preprocessing stages.
    """
    text = to_lower(text)
    
    if process_symbols:
        text = remove_symbols(text)
    
    words = word_tokenize(text)
    # print("without symbols", len(words))
    
    if process_stopwords:
        words = remove_stopwords(words)
    # print("without stopwords", len(words))
    
    if process_numbers:
        words = remove_numbers(words)
    # print("without numbers", len(words))
    
    if process_single_letters:
        words = remove_single_letters(words)
    
    words = apply_stemming(words)
    
    return words

def process_dataset(dataset: pd.core.frame.DataFrame,
                    process_symbols:bool = True,
                    process_stopwords:bool = True,
                    process_numbers:bool = True,
                    process_single_letters:bool = True,
                    debug:bool = True) -> list:
    """
    Apply the sentence preprocessing over a complete pd.DataFrame dataset.
    """
    processed = []
    total = len(dataset)
    for index, row in dataset.iterrows():
        if debug:
            print(f"Processing {index+1}/{total}", end="\r")
        processed.append(process_sentence(row["text"],
                         process_symbols,
                         process_stopwords,
                         process_numbers,
                         process_single_letters))
    
    return processed

def create_word_corpus(dataset:list,
                       min_percentage:float = 0.01,
                       max_percentage:float = 0.8)->tuple:
    """
    Creates a word_corpus selection based on a processed dataset.
    It also returns the term_count for the corpus, allowing for later document encoding
    """
    raw_word_corpus = [w for item in dataset for w in item]
    raw_word_corpus = list(set(raw_word_corpus))

    term_count = {w:0 for w in raw_word_corpus}
    for text in dataset:
        for word in text:
            term_count[word] += 1
    sorted_term_count = {k: v for k, v in sorted(term_count.items(), key=lambda item: item[1], reverse=True)}
    
    term_df = pd.DataFrame(list(sorted_term_count.items()), columns=["word", "count"])
    term_df["average_count"] = term_df["count"]/len(dataset)
    
    word_corpus_term_count = term_df[term_df["average_count"].between(min_percentage, max_percentage)]
    word_corpus = word_corpus_term_count["word"].to_list()

    return word_corpus, word_corpus_term_count

In [5]:
dataset = process_dataset(df)


Processing 905/905

In [7]:
corpus, corpus_df = create_word_corpus(dataset,
                                       min_percentage=0.003,
                                       max_percentage=0.8)
corpus_df

Unnamed: 0,word,count,average_count
1,body,281,0.310497
2,pow,169,0.186740
3,prey,143,0.158011
4,one,122,0.134807
5,energy,118,0.130387
...,...,...,...
1899,scoop,3,0.003315
1900,amplify,3,0.003315
1901,situation,3,0.003315
1902,cleaning,3,0.003315


In [13]:
corpus_df.to_csv("../datasets/pokedex-word-corpus-1903.csv", index=False)

In [37]:
with open('../datasets/pokedex-processed.pickle', 'wb') as f:
    pickle.dump(dataset, f, pickle.HIGHEST_PROTOCOL)

In [23]:
.clen(corpus[0])

5090

In [27]:
corpus_df["average_count"].describe()

count    5090.000000
mean        0.005629
std         0.017912
min         0.001105
25%         0.001105
50%         0.002210
75%         0.004420
max         0.944751
Name: average_count, dtype: float64

In [41]:
classes = []
classes.append(df["type-main"].to_list())
classes.append(df["type-second"].to_list())
classes = np.asarray(classes)
classes.shape

(2, 905)

In [43]:
np.save("../datasets/pokedex-classes", classes)

In [38]:
dataset

Unnamed: 0,id,name,text,type-main,type-second
0,1,Bulbasaur,"While it is young, it uses the nutrients that ...",Grass,Poison
1,32,Nidoran-M,"Small but brave, this Pokémon will hold its gr...",Poison,
2,35,Clefairy,Its adorable behavior and cry make it highly p...,Fairy,
3,38,Ninetales,Very smart and very vengeful. Grabbing one of ...,Fire,
4,31,Nidoqueen,It pacifies offspring by placing them in the g...,Poison,Ground
...,...,...,...,...,...
900,5,Charmeleon,"If it becomes agitated during battle, it spout...",Fire,
901,4,Charmander,"From the time it is born, a flame burns at the...",Fire,
902,2,Ivysaur,Exposure to sunlight adds to its strength. Sun...,Grass,Poison
903,3,Venusaur,A bewitching aroma wafts from its flower. The ...,Grass,Poison
