# **Procesamiento de datos**

## **1. Recopilación de datos**



In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split

In [2]:
df = pd.read_csv('https://raw.githubusercontent.com/mn-ariel/course_machine_learning/main/data%20processing/dataset/pokemon.csv')

In [3]:
df.head()

Unnamed: 0,Name,Variation,Type1,Type2,Total,HP,Attack,Defense,Sp.Atk,Sp.Def,Speed,Attack.especial
0,Bulbasaur,,Grass,Poison,318.0,45.0,49.0,49,65,65,45.0,1.0
1,Ivysaur,,Grass,Poison,405.0,60.0,62.0,63,80,80,60.0,0.0
2,Venusaur,,Grass,Poison,525.0,80.0,82.0,83,100,100,80.0,1.0
3,Venusaur,Mega,Grass,Poison,625.0,80.0,100.0,123,122,120,80.0,0.0
4,Charmander,,Fire,,309.0,39.0,52.0,43,60,50,65.0,1.0


## **2. Limpieza de datos**

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1071 entries, 0 to 1070
Data columns (total 12 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Name             1071 non-null   object 
 1   Variation        197 non-null    object 
 2   Type1            1071 non-null   object 
 3   Type2            579 non-null    object 
 4   Total            964 non-null    float64
 5   HP               964 non-null    float64
 6   Attack           927 non-null    float64
 7   Defense          1071 non-null   int64  
 8   Sp.Atk           1071 non-null   int64  
 9   Sp.Def           1071 non-null   int64  
 10  Speed            940 non-null    float64
 11  Attack.especial  317 non-null    float64
dtypes: float64(5), int64(3), object(4)
memory usage: 100.5+ KB


In [5]:
df.columns

Index(['Name', 'Variation', 'Type1', 'Type2', 'Total', 'HP', 'Attack',
       'Defense', 'Sp.Atk', 'Sp.Def', 'Speed', 'Attack.especial'],
      dtype='object')

In [6]:
def format_columns(df):
    df.columns = df.columns.str.lower().str.replace(" ","_", regex=True)
    df.columns = df.columns.str.replace(".","_", regex=True)
    df.columns = df.columns.str.replace("-","_", regex=True)
    df.columns = df.columns.str.replace(":","_", regex=True)

format_columns(df)
df.columns

Index(['name', 'variation', 'type1', 'type2', 'total', 'hp', 'attack',
       'defense', 'sp_atk', 'sp_def', 'speed', 'attack_especial'],
      dtype='object')

In [7]:
df.isnull().any()

name               False
variation           True
type1              False
type2               True
total               True
hp                  True
attack              True
defense            False
sp_atk             False
sp_def             False
speed               True
attack_especial     True
dtype: bool

In [8]:
df.isna().sum()

name                 0
variation          874
type1                0
type2              492
total              107
hp                 107
attack             144
defense              0
sp_atk               0
sp_def               0
speed              131
attack_especial    754
dtype: int64

In [9]:
def null_column(cols, quantity):
    for col in cols:
        if df[col].dtype != 'O':
            print('Columna',col,'de tipo CONTINUO y con valores NULOS de:',(df[col].isna().sum()),'que es', ((df[col].isna().sum())/quantity)*100,'%')
        else:
            print('Columna',col,'de tipo CATEGORICO y con valores NULOS de:',(df[col].isna().sum()),'que es', ((df[col].isna().sum())/quantity)*100,'%')

In [10]:
null_cols = df.columns[df.isna().any()].tolist()
quantity_values = len(df.index)
null_column(null_cols, quantity_values)

Columna variation de tipo CATEGORICO y con valores NULOS de: 874 que es 81.60597572362278 %
Columna type2 de tipo CATEGORICO y con valores NULOS de: 492 que es 45.938375350140056 %
Columna total de tipo CONTINUO y con valores NULOS de: 107 que es 9.990662931839402 %
Columna hp de tipo CONTINUO y con valores NULOS de: 107 que es 9.990662931839402 %
Columna attack de tipo CONTINUO y con valores NULOS de: 144 que es 13.445378151260504 %
Columna speed de tipo CONTINUO y con valores NULOS de: 131 que es 12.231559290382819 %
Columna attack_especial de tipo CONTINUO y con valores NULOS de: 754 que es 70.4014939309057 %


In [11]:
df.head(20)

Unnamed: 0,name,variation,type1,type2,total,hp,attack,defense,sp_atk,sp_def,speed,attack_especial
0,Bulbasaur,,Grass,Poison,318.0,45.0,49.0,49,65,65,45.0,1.0
1,Ivysaur,,Grass,Poison,405.0,60.0,62.0,63,80,80,60.0,0.0
2,Venusaur,,Grass,Poison,525.0,80.0,82.0,83,100,100,80.0,1.0
3,Venusaur,Mega,Grass,Poison,625.0,80.0,100.0,123,122,120,80.0,0.0
4,Charmander,,Fire,,309.0,39.0,52.0,43,60,50,65.0,1.0
5,Charmeleon,,Fire,,405.0,58.0,64.0,58,80,65,80.0,1.0
6,Charizard,,Fire,Flying,534.0,78.0,,78,109,85,100.0,1.0
7,Charizard,Mega X,Fire,Dragon,634.0,78.0,130.0,111,130,85,100.0,1.0
8,Charizard,Mega Y,Fire,Flying,634.0,78.0,104.0,78,159,115,100.0,0.0
9,Squirtle,,Water,,314.0,44.0,48.0,65,50,64,43.0,


### Analizar la columna **total,** **hp,** **attack,** **speed**

In [12]:
df.isna().sum()
#df.isnull().any()

name                 0
variation          874
type1                0
type2              492
total              107
hp                 107
attack             144
defense              0
sp_atk               0
sp_def               0
speed              131
attack_especial    754
dtype: int64

In [13]:
def fill_na_with_median(df, columns):
    for column in columns:
        median_value = df[column].median()
        df[column].fillna(median_value, inplace=True)
    return df

In [14]:
cols_to_fill = ['hp', 'attack', 'speed']
fill_na_with_median(df, cols_to_fill)

Unnamed: 0,name,variation,type1,type2,total,hp,attack,defense,sp_atk,sp_def,speed,attack_especial
0,Bulbasaur,,Grass,Poison,318.0,45.0,49.0,49,65,65,45.0,1.0
1,Ivysaur,,Grass,Poison,405.0,60.0,62.0,63,80,80,60.0,0.0
2,Venusaur,,Grass,Poison,525.0,80.0,82.0,83,100,100,80.0,1.0
3,Venusaur,Mega,Grass,Poison,625.0,80.0,100.0,123,122,120,80.0,0.0
4,Charmander,,Fire,,309.0,39.0,52.0,43,60,50,65.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...
1066,Basculin,White-Striped,Water,,460.0,70.0,92.0,65,80,55,98.0,
1067,Basculegion,Male,Water,Ghost,530.0,120.0,112.0,65,80,75,78.0,
1068,Basculegion,Female,Water,Ghost,530.0,120.0,92.0,65,100,75,78.0,
1069,Kleavor,,Bug,Rock,505.0,70.0,135.0,95,45,75,85.0,


In [15]:
df.isna().sum()

name                 0
variation          874
type1                0
type2              492
total              107
hp                   0
attack               0
defense              0
sp_atk               0
sp_def               0
speed                0
attack_especial    754
dtype: int64

In [16]:
def calculate_total(row):
    if pd.isna(row['total']):
        return row[['hp', 'attack', 'defense', 'sp_atk', 'sp_def', 'speed']].sum()
    else:
        return row['total']

In [17]:
df['total'] = df.apply(calculate_total, axis=1)

In [18]:
df.isna().sum()

name                 0
variation          874
type1                0
type2              492
total                0
hp                   0
attack               0
defense              0
sp_atk               0
sp_def               0
speed                0
attack_especial    754
dtype: int64

### Analizar la columna **attack_especial**

In [21]:
df['attack_especial'] = df['attack_especial'].fillna(0)

In [22]:
df.head(10)

Unnamed: 0,name,variation,type1,type2,total,hp,attack,defense,sp_atk,sp_def,speed,attack_especial
0,Bulbasaur,,Grass,Poison,318.0,45.0,49.0,49,65,65,45.0,1.0
1,Ivysaur,,Grass,Poison,405.0,60.0,62.0,63,80,80,60.0,0.0
2,Venusaur,,Grass,Poison,525.0,80.0,82.0,83,100,100,80.0,1.0
3,Venusaur,Mega,Grass,Poison,625.0,80.0,100.0,123,122,120,80.0,0.0
4,Charmander,,Fire,,309.0,39.0,52.0,43,60,50,65.0,1.0
5,Charmeleon,,Fire,,405.0,58.0,64.0,58,80,65,80.0,1.0
6,Charizard,,Fire,Flying,534.0,78.0,80.0,78,109,85,100.0,1.0
7,Charizard,Mega X,Fire,Dragon,634.0,78.0,130.0,111,130,85,100.0,1.0
8,Charizard,Mega Y,Fire,Flying,634.0,78.0,104.0,78,159,115,100.0,0.0
9,Squirtle,,Water,,314.0,44.0,48.0,65,50,64,43.0,0.0


In [23]:
df.isna().sum()

name                 0
variation          874
type1                0
type2              492
total                0
hp                   0
attack               0
defense              0
sp_atk               0
sp_def               0
speed                0
attack_especial      0
dtype: int64

### Analizar la columna **variation** y **type2**

In [24]:
df['variation'] = df['variation'].fillna('not Mega')

In [25]:
df['type2'] = df['type2'].fillna('not')

In [27]:
df.head(20)

Unnamed: 0,name,variation,type1,type2,total,hp,attack,defense,sp_atk,sp_def,speed,attack_especial
0,Bulbasaur,not Mega,Grass,Poison,318.0,45.0,49.0,49,65,65,45.0,1.0
1,Ivysaur,not Mega,Grass,Poison,405.0,60.0,62.0,63,80,80,60.0,0.0
2,Venusaur,not Mega,Grass,Poison,525.0,80.0,82.0,83,100,100,80.0,1.0
3,Venusaur,Mega,Grass,Poison,625.0,80.0,100.0,123,122,120,80.0,0.0
4,Charmander,not Mega,Fire,not,309.0,39.0,52.0,43,60,50,65.0,1.0
5,Charmeleon,not Mega,Fire,not,405.0,58.0,64.0,58,80,65,80.0,1.0
6,Charizard,not Mega,Fire,Flying,534.0,78.0,80.0,78,109,85,100.0,1.0
7,Charizard,Mega X,Fire,Dragon,634.0,78.0,130.0,111,130,85,100.0,1.0
8,Charizard,Mega Y,Fire,Flying,634.0,78.0,104.0,78,159,115,100.0,0.0
9,Squirtle,not Mega,Water,not,314.0,44.0,48.0,65,50,64,43.0,0.0


In [61]:
df.isna().sum()

name               0
variation          0
type1              0
type2              0
total              0
hp                 0
attack             0
defense            0
sp_atk             0
sp_def             0
speed              0
attack_especial    0
dtype: int64

In [28]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1071 entries, 0 to 1070
Data columns (total 12 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   name             1071 non-null   object 
 1   variation        1071 non-null   object 
 2   type1            1071 non-null   object 
 3   type2            1071 non-null   object 
 4   total            1071 non-null   float64
 5   hp               1071 non-null   float64
 6   attack           1071 non-null   float64
 7   defense          1071 non-null   int64  
 8   sp_atk           1071 non-null   int64  
 9   sp_def           1071 non-null   int64  
 10  speed            1071 non-null   float64
 11  attack_especial  1071 non-null   float64
dtypes: float64(5), int64(3), object(4)
memory usage: 100.5+ KB


## **3. Transformación de datos**

In [29]:
def get_categorical_columns(df):
    categorical_columns = df.select_dtypes(include=['object']).columns

    return list(categorical_columns)

In [30]:
categorical_columns = df.select_dtypes(include=['object', 'category']).columns
print(categorical_columns)

Index(['name', 'variation', 'type1', 'type2'], dtype='object')


In [31]:
df.head()

Unnamed: 0,name,variation,type1,type2,total,hp,attack,defense,sp_atk,sp_def,speed,attack_especial
0,Bulbasaur,not Mega,Grass,Poison,318.0,45.0,49.0,49,65,65,45.0,1.0
1,Ivysaur,not Mega,Grass,Poison,405.0,60.0,62.0,63,80,80,60.0,0.0
2,Venusaur,not Mega,Grass,Poison,525.0,80.0,82.0,83,100,100,80.0,1.0
3,Venusaur,Mega,Grass,Poison,625.0,80.0,100.0,123,122,120,80.0,0.0
4,Charmander,not Mega,Fire,not,309.0,39.0,52.0,43,60,50,65.0,1.0


In [32]:
df_1 = df.copy()
df_2 = df.copy()

In [33]:
df_1

Unnamed: 0,name,variation,type1,type2,total,hp,attack,defense,sp_atk,sp_def,speed,attack_especial
0,Bulbasaur,not Mega,Grass,Poison,318.0,45.0,49.0,49,65,65,45.0,1.0
1,Ivysaur,not Mega,Grass,Poison,405.0,60.0,62.0,63,80,80,60.0,0.0
2,Venusaur,not Mega,Grass,Poison,525.0,80.0,82.0,83,100,100,80.0,1.0
3,Venusaur,Mega,Grass,Poison,625.0,80.0,100.0,123,122,120,80.0,0.0
4,Charmander,not Mega,Fire,not,309.0,39.0,52.0,43,60,50,65.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...
1066,Basculin,White-Striped,Water,not,460.0,70.0,92.0,65,80,55,98.0,0.0
1067,Basculegion,Male,Water,Ghost,530.0,120.0,112.0,65,80,75,78.0,0.0
1068,Basculegion,Female,Water,Ghost,530.0,120.0,92.0,65,100,75,78.0,0.0
1069,Kleavor,not Mega,Bug,Rock,505.0,70.0,135.0,95,45,75,85.0,0.0


In [34]:
df_2

Unnamed: 0,name,variation,type1,type2,total,hp,attack,defense,sp_atk,sp_def,speed,attack_especial
0,Bulbasaur,not Mega,Grass,Poison,318.0,45.0,49.0,49,65,65,45.0,1.0
1,Ivysaur,not Mega,Grass,Poison,405.0,60.0,62.0,63,80,80,60.0,0.0
2,Venusaur,not Mega,Grass,Poison,525.0,80.0,82.0,83,100,100,80.0,1.0
3,Venusaur,Mega,Grass,Poison,625.0,80.0,100.0,123,122,120,80.0,0.0
4,Charmander,not Mega,Fire,not,309.0,39.0,52.0,43,60,50,65.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...
1066,Basculin,White-Striped,Water,not,460.0,70.0,92.0,65,80,55,98.0,0.0
1067,Basculegion,Male,Water,Ghost,530.0,120.0,112.0,65,80,75,78.0,0.0
1068,Basculegion,Female,Water,Ghost,530.0,120.0,92.0,65,100,75,78.0,0.0
1069,Kleavor,not Mega,Bug,Rock,505.0,70.0,135.0,95,45,75,85.0,0.0


In [35]:
def convert_categorical_to_continuous(df, column_name):

    for i in column_name :
        df[i] = df[i].astype('category').cat.codes

    return df

In [36]:
df_1 = convert_categorical_to_continuous(df_1, categorical_columns)
df_1.head()

Unnamed: 0,name,variation,type1,type2,total,hp,attack,defense,sp_atk,sp_def,speed,attack_especial
0,83,87,10,13,318.0,45.0,49.0,49,65,65,45.0,1.0
1,373,87,10,13,405.0,60.0,62.0,63,80,80,60.0,0.0
2,840,87,10,13,525.0,80.0,82.0,83,100,100,80.0,1.0
3,840,43,10,13,625.0,80.0,100.0,123,122,120,80.0,0.0
4,108,87,7,18,309.0,39.0,52.0,43,60,50,65.0,1.0


In [37]:
df_2.head()

Unnamed: 0,name,variation,type1,type2,total,hp,attack,defense,sp_atk,sp_def,speed,attack_especial
0,Bulbasaur,not Mega,Grass,Poison,318.0,45.0,49.0,49,65,65,45.0,1.0
1,Ivysaur,not Mega,Grass,Poison,405.0,60.0,62.0,63,80,80,60.0,0.0
2,Venusaur,not Mega,Grass,Poison,525.0,80.0,82.0,83,100,100,80.0,1.0
3,Venusaur,Mega,Grass,Poison,625.0,80.0,100.0,123,122,120,80.0,0.0
4,Charmander,not Mega,Fire,not,309.0,39.0,52.0,43,60,50,65.0,1.0


In [38]:
def convert_categorical_dummies(df, column_name):

    one_hot = pd.get_dummies(df[column_name], prefix=column_name)
    df = df.drop(column_name, axis=1)
    df = pd.concat([df, one_hot], axis=1)

    return df

In [39]:
convert_categorical_dummies(df_2, categorical_columns)

Unnamed: 0,total,hp,attack,defense,sp_atk,sp_def,speed,attack_especial,name_Abomasnow,name_Abra,...,type2_Grass,type2_Ground,type2_Ice,type2_Normal,type2_Poison,type2_Psychic,type2_Rock,type2_Steel,type2_Water,type2_not
0,318.0,45.0,49.0,49,65,65,45.0,1.0,0,0,...,0,0,0,0,1,0,0,0,0,0
1,405.0,60.0,62.0,63,80,80,60.0,0.0,0,0,...,0,0,0,0,1,0,0,0,0,0
2,525.0,80.0,82.0,83,100,100,80.0,1.0,0,0,...,0,0,0,0,1,0,0,0,0,0
3,625.0,80.0,100.0,123,122,120,80.0,0.0,0,0,...,0,0,0,0,1,0,0,0,0,0
4,309.0,39.0,52.0,43,60,50,65.0,1.0,0,0,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1066,460.0,70.0,92.0,65,80,55,98.0,0.0,0,0,...,0,0,0,0,0,0,0,0,0,1
1067,530.0,120.0,112.0,65,80,75,78.0,0.0,0,0,...,0,0,0,0,0,0,0,0,0,0
1068,530.0,120.0,92.0,65,100,75,78.0,0.0,0,0,...,0,0,0,0,0,0,0,0,0,0
1069,505.0,70.0,135.0,95,45,75,85.0,0.0,0,0,...,0,0,0,0,0,0,1,0,0,0


## **4. Integración de Datos**

## **5. Reducción de Datos**

## **6. Partición de Datos**

In [75]:
X = df.drop(['attack'], axis = 1)
y = df['attack']

In [76]:
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state= 1)

## **7. Exploración de Datos**