# Trabajo Pokemon - Minería de Textos
Tipo de Pokemon: **Bicho**

Integrantes: Lucía Parreño Legorburo, Eduardo Hortelano Pérez, Pablo de la Iglesia Otero

In [14]:
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m110.1 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [27]:
import requests
from bs4 import BeautifulSoup
import json
import pandas as pd
import spacy
from collections import Counter
import sklearn as sk
from sklearn.neighbors import KNeighborsClassifier

# Both settings must describe the same Pokemon type, e.g. "Tipo Bicho" and "bug".
# searched_type is matched against the link titles scraped from WikiDex, so it must be written in Spanish.
searched_type = "Tipo Bicho"
# power_type is compared with the type names returned by PokeAPI, so it must be written in English.
power_type = "bug"

# Pre-computed CSV files handed to main(): url_power is read instead of querying the API when
# power_type is "bug"; url_moves holds the move descriptions used to build the vocabulary and train the model.
url_power = "https://raw.githubusercontent.com/lucia-pl/MT_Pokemon/refs/heads/docs/power_moves_bug.csv"
url_moves = "https://raw.githubusercontent.com/lucia-pl/MT_Pokemon/refs/heads/docs/pokemon_moves.csv"

# spaCy English pipeline used to tokenize and lemmatize the move descriptions.
nlp_en = spacy.load("en_core_web_sm")

# CARGA DE DATOS

Carga de pokemon a través de Wikidex:

In [16]:
def all_pokemon_from_type(generacion: str, pokemon_type: str):
    """Scrape the WikiDex list of one generation and keep the Pokemon of a given type.

    Args:
        generacion: Spanish ordinal used in the WikiDex page name (e.g. "primera").
        pokemon_type: Link title identifying the type on WikiDex (e.g. "Tipo Bicho").

    Returns:
        A list with the title of the first link of every table row that also
        contains a link whose title equals ``pokemon_type``.
    """
    url = f"https://www.wikidex.net/wiki/Lista_de_Pok%C3%A9mon_de_la_{generacion}_generaci%C3%B3n"
    # A timeout keeps the notebook from hanging forever if WikiDex does not answer.
    response = requests.get(url, timeout=10)
    bs = BeautifulSoup(response.text, "html.parser")

    result = []
    for row in bs.find_all("tr"):
        # Only rows that link to the requested type are of interest.
        if row.find("a", {"title": pokemon_type}) is None:
            continue

        # The first link of the row is taken as the Pokemon; it may lack a title
        # attribute, in which case the row is skipped instead of raising KeyError.
        name = row.find("a").get("title")
        if name is not None and name != pokemon_type:
            result.append(name)
    return result

def type_selection(pokemon_type: str, generations=("primera", "segunda")):
    """Collect every Pokemon of a type across several generations.

    Args:
        pokemon_type: Type title as used by ``all_pokemon_from_type`` (e.g. "Tipo Bicho").
        generations: Spanish ordinals of the generations to scan. Defaults to
            the first and second generation.

    Returns:
        A flat list of names, in the order the generations were scanned.
    """
    result = []
    for generation in generations:
        result.extend(all_pokemon_from_type(generation, pokemon_type))
    return result


Carga de movimientos a través de la API:

In [17]:
def get_descripcion_en(effect_entries):
    """Return the short English effect text of a move.

    Args:
        effect_entries: Iterable of dicts, each with a ``language`` dict
            (holding ``name``) and a ``short_effect`` string.

    Returns:
        The stripped ``short_effect`` of the first entry whose language name is
        "en", or a fixed "not found" message when there is none.
    """
    english_entries = (item for item in effect_entries if item['language']['name'] == 'en')
    first_match = next(english_entries, None)
    if first_match is None:
        return "Descripción en inglés no encontrada."
    return first_match['short_effect'].strip()

def obtener_movimientos_api():
    """Download name, English description and power of the moves listed by PokeAPI.

    The move index is requested once (up to 1000 entries) and then every move
    is fetched individually. A power of ``None`` is stored as 0. Moves whose
    request or parsing fails are skipped.

    This function is not called by the notebook's main flow: it issues one
    request per move, which is slow and can overload the API, so a CSV
    produced from it is used instead.

    Returns:
        Three parallel lists: names, descriptions and powers.
    """
    lista_nombres = []
    lista_descripciones = []
    lista_potencias = []

    response = requests.get("https://pokeapi.co/api/v2/move/?limit=1000", timeout=10)
    all_moves = response.json()['results']

    for move in all_moves:
        try:
            datos = requests.get(move['url'], timeout=10).json()

            nombre = datos['name']
            potencia = datos['power']
            descripcion = get_descripcion_en(datos['effect_entries'])
        except Exception:
            # Best effort: skip moves that cannot be fetched or parsed. Unlike a
            # bare except, this lets KeyboardInterrupt/SystemExit stop the loop.
            continue

        if potencia is None:
            potencia = 0

        # Appending only after all three values exist keeps the lists aligned.
        lista_nombres.append(nombre)
        lista_descripciones.append(descripcion)
        lista_potencias.append(potencia)

    return lista_nombres, lista_descripciones, lista_potencias

def crear_db():
    """Build the moves table from the lists returned by ``obtener_movimientos_api``.

    Returns:
        A DataFrame with the columns ``nombre`` (move name), ``descripcion``
        (move description), ``clase`` (0 when the power is None or 0, otherwise 1)
        and ``potencia`` (power).
    """
    nombres, descripciones, potencias = obtener_movimientos_api()

    # A move without power is labelled 0 (defensive); any other move is 1 (offensive).
    etiquetas = [0 if (valor is None or valor == 0) else 1 for valor in potencias]

    return pd.DataFrame({
        'nombre': nombres,
        'descripcion': descripciones,
        'clase': etiquetas,
        'potencia': potencias
    })

def obtener_movimientos_pokemon(nombre_pokemon):
    """Fetch every move a Pokemon can learn from PokeAPI.

    Args:
        nombre_pokemon: Pokemon name; it is lower-cased and stripped before
            being placed in the request URL.

    Returns:
        A DataFrame with the columns ``nombre``, ``descripcion``, ``clase``
        (0 when the power is None or 0, otherwise 1) and ``potencia`` (the
        power as returned by the API, which may be None). Moves whose request
        or parsing fails are skipped.
    """
    lista_nombres = []
    lista_descripciones = []
    lista_potencias = []
    lista_clase = []
    nombre_pokemon = nombre_pokemon.lower().strip()

    url_pokemon = f"https://pokeapi.co/api/v2/pokemon/{nombre_pokemon}"
    datos_pokemon = requests.get(url_pokemon, timeout=10).json()

    for item in datos_pokemon['moves']:
        try:
            datos_move = requests.get(item['move']['url'], timeout=10).json()

            nombre = datos_move['name']
            potencia = datos_move['power']
            descripcion = get_descripcion_en(datos_move['effect_entries'])
        except Exception:
            # Best effort: skip moves that cannot be fetched or parsed. Unlike a
            # bare except, this lets KeyboardInterrupt/SystemExit stop the loop.
            continue

        clase = 0 if (potencia is None or potencia == 0) else 1

        # Appending only after every value exists keeps the lists aligned.
        lista_nombres.append(nombre)
        lista_descripciones.append(descripcion)
        lista_potencias.append(potencia)
        lista_clase.append(clase)

    datos_df = {
        'nombre': lista_nombres,
        'descripcion': lista_descripciones,
        'clase': lista_clase,
        'potencia': lista_potencias,
    }

    return pd.DataFrame(datos_df)

# EQUIPO DEFENSIVO

In [18]:
def all_stats(name, team_stats):
    """Look up selected base stats of a Pokemon on PokeAPI.

    Args:
        name: Pokemon name placed in the request URL.
        team_stats: Names of the stats to return (e.g. "defense", "hp").

    Returns:
        A list with one base stat per entry of ``team_stats``, in the same
        order; a stat name missing from the response yields ``None``.
    """
    url = f'https://pokeapi.co/api/v2/pokemon/{name}'
    # Same 10 second timeout as the other PokeAPI calls, so a stalled request cannot hang the notebook.
    response = requests.get(url, timeout=10).json()
    stat_map = {s["stat"]["name"]: s["base_stat"] for s in response["stats"]}

    return [stat_map.get(stat) for stat in team_stats]

def stats_pokemon(pokemon_array, team_stats):
    """Fetch the requested stats for each Pokemon of a list.

    Args:
        pokemon_array: Pokemon names.
        team_stats: Stat names forwarded to ``all_stats``.

    Returns:
        One ``all_stats`` result per Pokemon, in the same order as ``pokemon_array``.
    """
    return [all_stats(pokemon, team_stats) for pokemon in pokemon_array]

def dictionary_sort(list_bicho, list_stats):
    """Pair each Pokemon with its stats and rank them from highest to lowest.

    Args:
        list_bicho: Pokemon names.
        list_stats: Stats of each Pokemon, aligned by position with ``list_bicho``;
            each entry must hold at least three values.

    Returns:
        A dict mapping each name to a one-element list wrapping its stats,
        ordered by the first stat, then the second, then the third, all descending.
    """
    wrapped = {}
    for position in range(len(list_bicho)):
        # The stats are wrapped in a list, so consumers read them as value[0][k].
        wrapped[list_bicho[position]] = [list_stats[position]]

    def ranking_key(entry):
        stats = entry[1][0]
        return (stats[0], stats[1], stats[2])

    ranked = list(wrapped.items())
    ranked.sort(key=ranking_key, reverse=True)
    return dict(ranked)

def defensive_team(pokemon_type: str):
    """Pick the six best defensive Pokemon of a type.

    Candidates come from ``type_selection`` and are ranked with
    ``dictionary_sort`` by defense, then hp, then attack.

    Args:
        pokemon_type: Type title passed to ``type_selection``.

    Returns:
        A dict with the first six entries of the ranking (name -> wrapped stats).
    """
    ranking_stats = ["defense", "hp", "attack"]
    candidates = type_selection(pokemon_type)
    candidate_stats = stats_pokemon(candidates, ranking_stats)
    ranking = dictionary_sort(candidates, candidate_stats)

    top_six = list(ranking.items())[:6]
    return dict(top_six)

# EQUIPO OFENSIVO

In [19]:
def all_type_power_moves(pokemon_type, limit=500):
    """Download the damaging moves of one type from PokeAPI, strongest first.

    Every move of the index is fetched individually; moves of another type or
    without power are discarded, and moves whose request or parsing fails are
    skipped.

    The main flow only calls this for types other than "bug" (a pre-computed
    CSV is read for "bug"), because one request per move is slow and can
    overload the API.

    Args:
        pokemon_type: English type name; it is lower-cased and stripped.
        limit: Maximum number of moves requested from the move index.

    Returns:
        A DataFrame with the columns ``nombre``, ``potencia`` and ``ataque_base``
        (a copy of the power), sorted by ``potencia`` descending with a fresh index.
    """
    pokemon_type = pokemon_type.lower().strip()

    names_list = []
    power_list = []

    response = requests.get(f"https://pokeapi.co/api/v2/move/?limit={limit}", timeout=10)
    all_moves = response.json()['results']

    for move in all_moves:
        try:
            datos = requests.get(move['url'], timeout=10).json()
            tipo = datos['type']['name']
            power = datos['power']
            name = datos['name']
        except Exception:
            # Best effort: skip moves that cannot be fetched or parsed. Unlike a
            # bare except, this lets KeyboardInterrupt/SystemExit stop the loop.
            continue

        if tipo != pokemon_type or power is None:
            continue

        names_list.append(name)
        power_list.append(power)

    df = pd.DataFrame({
        "nombre": names_list,
        "potencia": power_list,
        # The base attack is defined as the move power itself.
        "ataque_base": list(power_list)
    })

    return df.sort_values(by="potencia", ascending=False).reset_index(drop=True)

def max_power(df):
    """Return the power of the strongest move in a moves table.

    The maximum of the ``potencia`` column is computed directly, so the result
    does not depend on the frame being sorted or on its index labels.

    Args:
        df: DataFrame with a numeric ``potencia`` column.

    Returns:
        The highest power as an ``int``.

    Raises:
        ValueError: If the frame has no rows.
    """
    potencias = df['potencia']
    if potencias.empty:
        raise ValueError("max_power requires at least one move")
    return int(potencias.max())

def offensive_team(pokemon_type: str, m_power):
    """Pick the six best offensive Pokemon of a type.

    Candidates come from ``type_selection``. For each one the special-attack,
    attack and special-attack stats are fetched, the third value is multiplied
    by ``m_power``, and the candidates are ranked with ``dictionary_sort``.

    Args:
        pokemon_type: Type title passed to ``type_selection``.
        m_power: Factor applied to the third stat (the maximum move power).

    Returns:
        A dict with the first six entries of the ranking (name -> wrapped stats).
    """
    ranking_stats = ["special-attack", "attack", "special-attack"]
    candidates = type_selection(pokemon_type)
    candidate_stats = stats_pokemon(candidates, ranking_stats)

    # Turn the third stat into a power score, updating each row in place.
    for position in range(len(candidate_stats)):
        row = candidate_stats[position]
        row[2] = row[2] * m_power

    ranking = dictionary_sort(candidates, candidate_stats)
    return dict(list(ranking.items())[:6])

# SINERGIA DE EQUIPOS

In [20]:
def get_types(pokemon):
    """Look up the types of a Pokemon on PokeAPI.

    Args:
        pokemon: Pokemon name placed in the request URL.

    Returns:
        A list with the name of each type listed in the response.
    """
    url = f'https://pokeapi.co/api/v2/pokemon/{pokemon}'
    # Same 10 second timeout as the other PokeAPI calls, so a stalled request cannot hang the notebook.
    response = requests.get(url, timeout=10).json()
    return [slot.get("type").get("name") for slot in response.get("types")]

def max_types(pokemons):
    """Keep the six Pokemon that have the most types.

    Args:
        pokemons: Pokemon names; the types of each are fetched with ``get_types``.

    Returns:
        A tuple ``(names, types)`` with the six selected names and, aligned by
        position, the list of types of each. Pokemon with the same number of
        types keep their input order.
    """
    annotated = []
    for name in pokemons:
        found = get_types(name)
        annotated.append((name, len(found), found))

    # Most types first; the sort is stable, so ties keep their input order.
    by_type_count = sorted(annotated, key=lambda entry: entry[1], reverse=True)
    selected = by_type_count[:6]

    names = [entry[0] for entry in selected]
    kinds = [entry[2] for entry in selected]
    return names, kinds

def synergy(pokemon_type, m_power, defensive, offensive):
    """Merge the defensive and offensive teams and keep the members with most types.

    Args:
        pokemon_type: Not used by this function.
        m_power: Not used by this function.
        defensive: Dict whose keys are the defensive team names.
        offensive: Dict whose keys are the offensive team names.

    Returns:
        A tuple ``(team, types)`` as produced by ``max_types`` for the merged,
        de-duplicated list of names (defensive names first).
    """
    candidates = []
    for name in list(defensive.keys()) + list(offensive.keys()):
        # Keep only the first occurrence of a name that is in both teams.
        if name not in candidates:
            candidates.append(name)

    selection = max_types(candidates)
    team, types = selection
    return team, types

# NLP y Clasificador

Vectorización de los dataframes:

In [21]:
def text_to_tokens(text, nlp):
    """Extract the lemmas of the relevant words of a text.

    Args:
        text: Text to analyse.
        nlp: Callable (a spaCy pipeline) that turns the text into tokens.

    Returns:
        The lemmas of the tokens that are not stop words, are tagged as NOUN
        or ADJ and are purely alphabetic, in order of appearance.
    """
    lemmas = []
    for token in nlp(text):
        if token.is_stop:
            continue
        if token.pos_ not in ["NOUN", "ADJ"]:
            continue
        if token.is_alpha:
            lemmas.append(token.lemma_)
    return lemmas


def count_words_in_series(series, nlp):
    """Count how often each relevant lemma appears across several texts.

    Args:
        series: Iterable of texts.
        nlp: Pipeline forwarded to ``text_to_tokens``.

    Returns:
        A plain dict mapping each lemma to its number of occurrences.
    """
    tally = Counter()
    for text in series:
        for word in text_to_tokens(text, nlp):
            tally[word] += 1
    return dict(tally)

def get_vocabulary(series, nlp, min_app=3):
    """Build the vocabulary used to vectorize texts.

    Args:
        series: Iterable of texts.
        nlp: Pipeline forwarded to ``count_words_in_series``.
        min_app: Minimum number of occurrences a word needs to be kept.

    Returns:
        The list of words that appear at least ``min_app`` times.
    """
    frequencies = count_words_in_series(series, nlp)

    vocabulary = []
    for word, occurrences in frequencies.items():
        if occurrences >= min_app:
            vocabulary.append(word)
    return vocabulary

def text_to_vector(text, vectorize_words, nlp):
    """Turn a text into a vector of relative word frequencies.

    Args:
        text: Text to vectorize.
        vectorize_words: Vocabulary; the output has one position per word.
        nlp: Pipeline forwarded to ``text_to_tokens``.

    Returns:
        A list with, for each vocabulary word, its count in the text divided by
        the total count of vocabulary words found. When none is found, the list
        of zero counts is returned as is.
    """
    frequencies = dict(Counter(text_to_tokens(text, nlp)))
    raw_counts = [frequencies.get(word, 0) for word in vectorize_words]

    denominator = sum(raw_counts)
    if denominator == 0:
        # Nothing to normalise: avoid dividing by zero.
        return raw_counts

    return [count / denominator for count in raw_counts]

def vectorize_dataframe(df, nlp, vocabulary):
    """Replace the descriptions of a moves table by their vocabulary vectors.

    Args:
        df: DataFrame with the columns ``descripcion`` and ``clase``.
        nlp: Pipeline forwarded to ``text_to_vector``.
        vocabulary: Words that become the feature columns.

    Returns:
        A DataFrame with the ``clase`` column followed by one column per
        vocabulary word, holding the value ``text_to_vector`` gives to that
        word for each row.
    """
    vectors = [text_to_vector(text, vocabulary, nlp) for text in df["descripcion"]]

    # All columns are gathered first and the frame is built in a single call;
    # inserting them one at a time fragments the frame on large vocabularies.
    columns = {"clase": df["clase"]}
    for position, word in enumerate(vocabulary):
        columns[word] = [vector[position] for vector in vectors]

    return pd.DataFrame(columns)

Entrenamiento del modelo y clasificación:

In [22]:
def knn_model(movs_df, n_neighbors=3):
    """Train a k-nearest-neighbours classifier on a vectorized moves table.

    Args:
        movs_df: DataFrame whose ``clase`` column is the target and whose
            remaining columns are the features.
        n_neighbors: Number of neighbours used by the classifier.

    Returns:
        The fitted ``KNeighborsClassifier``.

    Note:
        The printed score is computed on the same rows used for fitting, so it
        is a training accuracy, not an estimate on unseen data.
    """
    model = KNeighborsClassifier(n_neighbors=n_neighbors)
    df_x = movs_df.drop(columns=['clase'])
    df_y = movs_df['clase']
    model.fit(df_x, df_y)
    print(f"Model's accuracy score: {model.score(df_x, df_y):.2f}")
    return model

def classification(model, m_vector):
    """Predict the class of every row of a vectorized moves table.

    Args:
        model: Fitted classifier exposing ``predict``.
        m_vector: DataFrame with a ``clase`` column plus the feature columns.

    Returns:
        A copy of ``m_vector`` with an extra ``clase_predicha`` column holding
        the predictions; the input frame is left untouched.
    """
    # The true label is dropped so only the features reach the model.
    features = m_vector.drop(columns=['clase'])

    df_result = m_vector.copy()
    df_result['clase_predicha'] = model.predict(features)

    return df_result

# EQUIPO FINAL

Elección de movimientos de pokemon:

In [23]:
def choose_moves(pokemon):
    """Choose up to four moves for a Pokemon: the three strongest plus one defensive.

    The defensive move is the first learnable move whose ``clase`` is 0. The
    other slots go to the moves with the highest ``potencia``. The defensive
    move is excluded from that ranking, so it can never appear twice when the
    Pokemon has fewer than three damaging moves.

    Args:
        pokemon: Pokemon name passed to ``obtener_movimientos_pokemon``.

    Returns:
        A DataFrame with the strongest moves first and the defensive move last;
        missing values are replaced by 0.
    """
    df_pokemon = obtener_movimientos_pokemon(pokemon)
    defensive_move = df_pokemon[df_pokemon['clase'] == 0].head(1)

    ranked = df_pokemon.sort_values(by='potencia', ascending=False)
    # Drop the chosen defensive move before taking the top three to avoid a duplicate row.
    strongest = ranked.drop(index=defensive_move.index).head(3).reset_index(drop=True)

    new_df = pd.concat([strongest, defensive_move], ignore_index=True)
    new_df = new_df.fillna(0)

    return new_df

Dataframe con las estadísticas del equipo final:


*   Nombre del Pokemon
*   Tipos
* Movimientos
* Clase Original del Movimiento
* Clase Predicha del Movimiento



In [24]:
def final_team(syn_team, syn_types, model, global_vocabulary, nlp):
    """Build the styled summary table of the final team.

    For each member the moves are picked with ``choose_moves``, vectorized with
    ``vectorize_dataframe`` and classified with ``classification``.

    Args:
        syn_team: Names of the team members.
        syn_types: Types of each member, aligned by position with ``syn_team``.
        model: Fitted classifier used to predict the class of each move.
        global_vocabulary: Vocabulary used to vectorize the move descriptions.
        nlp: Pipeline used to tokenize the move descriptions.

    Returns:
        A pandas Styler over a table with the columns Pokemon, Type, Moves,
        Original Class and Predicted Class. Multi-valued cells hold one value
        per line, and the style keeps those line breaks when rendered.
    """
    def as_labels(classes):
        # Class 0 is shown as "Defensive"; any other value as "Offensive".
        return "\n".join(["Defensive" if value == 0 else "Offensive" for value in classes])

    team_frame = pd.DataFrame(columns=['Pokemon', 'Type', 'Moves', 'Original Class', 'Predicted Class'])
    team_frame['Pokemon'] = syn_team
    team_frame['Type'] = [' and '.join(kinds) for kinds in syn_types]

    move_cells = []
    original_cells = []
    predicted_cells = []

    for member in syn_team:
        member_moves = choose_moves(member)

        move_cells.append("\n".join(member_moves['nombre'].tolist()))
        original_cells.append(as_labels(member_moves['clase'].tolist()))

        vectorized = vectorize_dataframe(member_moves, nlp, global_vocabulary)
        predicted = classification(model, vectorized)
        predicted_cells.append(as_labels(predicted['clase_predicha'].tolist()))

    team_frame['Moves'] = move_cells
    team_frame['Original Class'] = original_cells
    team_frame['Predicted Class'] = predicted_cells

    styled_team = team_frame.style.set_properties(**{'white-space': 'pre-wrap'})

    return styled_team



---


# MAIN


---



In [25]:
def main(schr_type, pwr_type, url_m, url_p, nlp):
    """Run the whole pipeline and return the styled final team.

    Steps: load the power moves (from ``url_p`` when ``pwr_type`` is 'bug',
    otherwise from the API), train the move classifier on the CSV at ``url_m``,
    build and display the defensive and offensive teams, then combine them into
    the final team.

    If any step fails, the error is reported and the last stored BUG team is
    loaded from CSV files and shown instead.

    Args:
        schr_type: Type title used to scrape the Pokemon list (e.g. "Tipo Bicho").
        pwr_type: English type name used for the power moves (e.g. "bug").
        url_m: URL of the CSV with the move descriptions and classes.
        url_p: URL of the CSV with the pre-computed power moves.
        nlp: Pipeline used to tokenize the move descriptions.

    Returns:
        A pandas Styler with the final team, or with the stored BUG team when
        the pipeline failed.
    """
    try:
        if pwr_type != 'bug':
            power_moves = all_type_power_moves(pwr_type)
        else:
            power_moves = pd.read_csv(url_p)

        pokemon_moves = pd.read_csv(url_m)
        p_move = max_power(power_moves)

        global_vocabulary = get_vocabulary(pokemon_moves["descripcion"], nlp, min_app=3)
        pokemon_moves_vectorized = vectorize_dataframe(pokemon_moves, nlp, global_vocabulary)
        model = knn_model(pokemon_moves_vectorized)

        defensive = defensive_team(schr_type)
        offensive = offensive_team(schr_type, p_move)

        # Each team value is a one-element list wrapping the stats, hence v[0][k].
        df_defensive = pd.DataFrame([
            {'pokemon': k, 'defense': v[0][0], 'hp': v[0][1], 'attack': v[0][2]} for k, v in defensive.items()
        ])

        print("DEFENSIVE TEAM")
        display(df_defensive)

        df_offensive = pd.DataFrame([
            {'pokemon': k, 'special-attack': v[0][0], 'attack': v[0][1], 'power': v[0][2]} for k, v in offensive.items()
        ])
        print("OFFENSIVE TEAM")
        display(df_offensive)

        synergy_team, synergy_types = synergy(schr_type, p_move, defensive, offensive)
        team = final_team(synergy_team, synergy_types, model, global_vocabulary, nlp)

        return team

    except Exception as exc:
        # Catching Exception (not a bare except) keeps the fallback for any
        # failure while letting KeyboardInterrupt/SystemExit stop the notebook.
        print("Error: API requests have been overloaded")
        # Show the real cause: the failure is not necessarily an API overload.
        print(f"Details: {type(exc).__name__}: {exc}")
        print("Last team build stored will be shown. This only works for BUG type pokemons.")

        print("DEFENSE TEAM")
        url_def = "https://raw.githubusercontent.com/lucia-pl/MT_Pokemon/refs/heads/docs/defense_team_bug.csv"
        bug_def = pd.read_csv(url_def)
        display(bug_def)

        print("OFFENSIVE TEAM")
        url_off = "https://raw.githubusercontent.com/lucia-pl/MT_Pokemon/refs/heads/docs/offensive_team_bug.csv"
        bug_off = pd.read_csv(url_off)
        display(bug_off)

        url_error = "https://raw.githubusercontent.com/lucia-pl/MT_Pokemon/refs/heads/docs/Final_team_BUG.csv"
        bug_error = pd.read_csv(url_error)
        bug_error_team = bug_error.style.set_properties(**{'white-space': 'pre-wrap'})

        return bug_error_team

In [28]:
# Build the team for the configured type; the trailing expression shows the styled table as the cell output.
team = main(searched_type, power_type, url_moves, url_power, nlp_en)
team

Model's accuracy score: 0.79
DEFENSIVE TEAM


Unnamed: 0,pokemon,defense,hp,attack
0,Shuckle,230,20,10
1,Forretress,140,75,90
2,Scizor,100,70,130
3,Pinsir,100,65,125
4,Pineco,90,50,65
5,Scyther,80,70,110


OFFENSIVE TEAM


Unnamed: 0,pokemon,special-attack,attack,power
0,Venomoth,90,65,10800
1,Butterfree,90,45,10800
2,Yanma,75,65,9000
3,Parasect,60,95,7200
4,Ariados,60,90,7200
5,Forretress,60,90,7200


Unnamed: 0,Pokemon,Type,Moves,Original Class,Predicted Class
0,Shuckle,bug and rock,steel-roller double-edge meteor-beam string-shot,Offensive Offensive Offensive Defensive,Offensive Offensive Offensive Defensive
1,Forretress,bug and steel,explosion self-destruct hyper-beam counter,Offensive Offensive Offensive Defensive,Offensive Offensive Defensive Defensive
2,Scizor,bug and steel,hyper-beam giga-impact steel-beam swords-dance,Offensive Offensive Offensive Defensive,Defensive Defensive Offensive Defensive
3,Scyther,bug and flying,hyper-beam giga-impact skull-bash swords-dance,Offensive Offensive Offensive Defensive,Defensive Defensive Offensive Defensive
4,Venomoth,bug and poison,hyper-beam giga-impact solar-beam whirlwind,Offensive Offensive Offensive Defensive,Defensive Defensive Offensive Defensive
5,Butterfree,bug and flying,hyper-beam giga-impact solar-beam whirlwind,Offensive Offensive Offensive Defensive,Defensive Defensive Offensive Defensive
