# Applying Advanced Transformations
- Michael Vincent
- 9/20

## Imports

In [1]:
# Imports
import numpy as np
import pandas as pd
import json

## Load the data

In [2]:
# Read the hero attributes and the hero powers from their CSV exports
info_path = 'superhero_info - superhero_info.csv'
powers_path = 'superhero_powers - superhero_powers.csv'
info = pd.read_csv(info_path)
powers = pd.read_csv(powers_path)

# Preview the first rows of both frames side by side
display(info.head(), powers.head())

Unnamed: 0,Hero|Publisher,Gender,Race,Alignment,Hair color,Eye color,Skin color,Measurements
0,A-Bomb|Marvel Comics,Male,Human,good,No Hair,yellow,Unknown,"{'Height': '203.0 cm', 'Weight': '441.0 kg'}"
1,Abe Sapien|Dark Horse Comics,Male,Icthyo Sapien,good,No Hair,blue,blue,"{'Height': '191.0 cm', 'Weight': '65.0 kg'}"
2,Abin Sur|DC Comics,Male,Ungaran,good,No Hair,blue,red,"{'Height': '185.0 cm', 'Weight': '90.0 kg'}"
3,Abomination|Marvel Comics,Male,Human / Radiation,bad,No Hair,green,Unknown,"{'Height': '203.0 cm', 'Weight': '441.0 kg'}"
4,Absorbing Man|Marvel Comics,Male,Human,bad,No Hair,blue,Unknown,"{'Height': '193.0 cm', 'Weight': '122.0 kg'}"


Unnamed: 0,hero_names,Powers
0,3-D Man,"Agility,Super Strength,Stamina,Super Speed"
1,A-Bomb,"Accelerated Healing,Durability,Longevity,Super..."
2,Abe Sapien,"Agility,Accelerated Healing,Cold Resistance,Du..."
3,Abin Sur,Lantern Power Ring
4,Abomination,"Accelerated Healing,Intelligence,Super Strengt..."


## Clean the data

In [3]:
# Hero|Publisher packs two fields into one string separated by '|';
# expand it into separate Hero and Publisher columns
hero_publisher = info['Hero|Publisher'].str.split('|', expand = True)
info[['Hero', 'Publisher']] = hero_publisher

# The combined column is redundant once it has been split
info = info.drop(columns = 'Hero|Publisher')
info.head()

Unnamed: 0,Gender,Race,Alignment,Hair color,Eye color,Skin color,Measurements,Hero,Publisher
0,Male,Human,good,No Hair,yellow,Unknown,"{'Height': '203.0 cm', 'Weight': '441.0 kg'}",A-Bomb,Marvel Comics
1,Male,Icthyo Sapien,good,No Hair,blue,blue,"{'Height': '191.0 cm', 'Weight': '65.0 kg'}",Abe Sapien,Dark Horse Comics
2,Male,Ungaran,good,No Hair,blue,red,"{'Height': '185.0 cm', 'Weight': '90.0 kg'}",Abin Sur,DC Comics
3,Male,Human / Radiation,bad,No Hair,green,Unknown,"{'Height': '203.0 cm', 'Weight': '441.0 kg'}",Abomination,Marvel Comics
4,Male,Human,bad,No Hair,blue,Unknown,"{'Height': '193.0 cm', 'Weight': '122.0 kg'}",Absorbing Man,Marvel Comics


In [4]:
# Parse the Measurements strings into dictionaries

# json.loads only accepts double-quoted strings, so swap every single quote
# for a double quote before parsing each value
measurements_json = info['Measurements'].str.replace("'", '"')
info['Measurements'] = measurements_json.apply(json.loads)

info.head()

Unnamed: 0,Gender,Race,Alignment,Hair color,Eye color,Skin color,Measurements,Hero,Publisher
0,Male,Human,good,No Hair,yellow,Unknown,"{'Height': '203.0 cm', 'Weight': '441.0 kg'}",A-Bomb,Marvel Comics
1,Male,Icthyo Sapien,good,No Hair,blue,blue,"{'Height': '191.0 cm', 'Weight': '65.0 kg'}",Abe Sapien,Dark Horse Comics
2,Male,Ungaran,good,No Hair,blue,red,"{'Height': '185.0 cm', 'Weight': '90.0 kg'}",Abin Sur,DC Comics
3,Male,Human / Radiation,bad,No Hair,green,Unknown,"{'Height': '203.0 cm', 'Weight': '441.0 kg'}",Abomination,Marvel Comics
4,Male,Human,bad,No Hair,blue,Unknown,"{'Height': '193.0 cm', 'Weight': '122.0 kg'}",Absorbing Man,Marvel Comics


In [5]:
# Break the Measurements column into a data frame with one column per
# dictionary key. Building the frame straight from the list of dictionaries
# avoids constructing a pd.Series for every row, which is what
# .apply(pd.Series) does; passing info.index keeps the rows aligned for the
# concat below.
height_weight = pd.DataFrame(info['Measurements'].tolist(), index = info.index)

# Concatenate the new columns back into the data frame and remove the
# now-redundant Measurements column
info = pd.concat([info, height_weight], axis = 1).drop(columns = 'Measurements')

info.head()

Unnamed: 0,Gender,Race,Alignment,Hair color,Eye color,Skin color,Hero,Publisher,Height,Weight
0,Male,Human,good,No Hair,yellow,Unknown,A-Bomb,Marvel Comics,203.0 cm,441.0 kg
1,Male,Icthyo Sapien,good,No Hair,blue,blue,Abe Sapien,Dark Horse Comics,191.0 cm,65.0 kg
2,Male,Ungaran,good,No Hair,blue,red,Abin Sur,DC Comics,185.0 cm,90.0 kg
3,Male,Human / Radiation,bad,No Hair,green,Unknown,Abomination,Marvel Comics,203.0 cm,441.0 kg
4,Male,Human,bad,No Hair,blue,Unknown,Absorbing Man,Marvel Comics,193.0 cm,122.0 kg


In [6]:
# Strip the unit suffix from each measurement column and store the values
# as floats
for column, unit in (('Height', ' cm'), ('Weight', ' kg')):
    info[column] = info[column].str.replace(unit, '').astype(float)

# Confirm the new dtypes and non-null counts
info.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 463 entries, 0 to 462
Data columns (total 10 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Gender      463 non-null    object 
 1   Race        463 non-null    object 
 2   Alignment   463 non-null    object 
 3   Hair color  463 non-null    object 
 4   Eye color   463 non-null    object 
 5   Skin color  463 non-null    object 
 6   Hero        463 non-null    object 
 7   Publisher   463 non-null    object 
 8   Height      463 non-null    float64
 9   Weight      463 non-null    float64
dtypes: float64(2), object(8)
memory usage: 36.3+ KB


In [7]:
# Collect every distinct power; each one becomes its own column later.
# Powers is a comma-separated string, so split it, put one power per row,
# and count the occurrences (the index is ordered most common first).
power_counts = powers['Powers'].str.split(',').explode().value_counts()
cols_to_make = power_counts.index
cols_to_make

Index(['Super Strength', 'Stamina', 'Durability', 'Super Speed', 'Agility',
       'Flight', 'Accelerated Healing', 'Reflexes', 'Intelligence',
       'Energy Blasts',
       ...
       'Omnitrix', 'Thirstokinesis', 'Anti-Gravity', 'Hyperkinesis',
       'Speed Force', 'Electrical Transport', 'Molecular Dissipation',
       'Banish', 'Biokinesis', 'Changing Armor'],
      dtype='object', length=167)

In [8]:
# One hot encode the powers
for col in cols_to_make:
    powers[col] = powers['Powers'].str.contains(col)
powers.head()

  powers[col] = powers['Powers'].str.contains(col)
  powers[col] = powers['Powers'].str.contains(col)
  powers[col] = powers['Powers'].str.contains(col)
  powers[col] = powers['Powers'].str.contains(col)
  powers[col] = powers['Powers'].str.contains(col)
  powers[col] = powers['Powers'].str.contains(col)
  powers[col] = powers['Powers'].str.contains(col)
  powers[col] = powers['Powers'].str.contains(col)
  powers[col] = powers['Powers'].str.contains(col)
  powers[col] = powers['Powers'].str.contains(col)
  powers[col] = powers['Powers'].str.contains(col)
  powers[col] = powers['Powers'].str.contains(col)
  powers[col] = powers['Powers'].str.contains(col)
  powers[col] = powers['Powers'].str.contains(col)
  powers[col] = powers['Powers'].str.contains(col)
  powers[col] = powers['Powers'].str.contains(col)
  powers[col] = powers['Powers'].str.contains(col)
  powers[col] = powers['Powers'].str.contains(col)
  powers[col] = powers['Powers'].str.contains(col)
  powers[col] = powers['Powers'

  powers[col] = powers['Powers'].str.contains(col)
  powers[col] = powers['Powers'].str.contains(col)
  powers[col] = powers['Powers'].str.contains(col)
  powers[col] = powers['Powers'].str.contains(col)
  powers[col] = powers['Powers'].str.contains(col)


Unnamed: 0,hero_names,Powers,Super Strength,Stamina,Durability,Super Speed,Agility,Flight,Accelerated Healing,Reflexes,...,Omnitrix,Thirstokinesis,Anti-Gravity,Hyperkinesis,Speed Force,Electrical Transport,Molecular Dissipation,Banish,Biokinesis,Changing Armor
0,3-D Man,"Agility,Super Strength,Stamina,Super Speed",True,True,False,True,True,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,A-Bomb,"Accelerated Healing,Durability,Longevity,Super...",True,True,True,False,False,False,True,False,...,False,False,False,False,False,False,False,False,False,False
2,Abe Sapien,"Agility,Accelerated Healing,Cold Resistance,Du...",True,True,True,False,True,False,True,True,...,False,False,False,False,False,False,False,False,False,False
3,Abin Sur,Lantern Power Ring,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,Abomination,"Accelerated Healing,Intelligence,Super Strengt...",True,True,False,True,False,False,True,False,...,False,False,False,False,False,False,False,False,False,False


## Merge the data frames

In [9]:
# Join each hero's attributes to its one hot encoded powers by hero name.
# This is an inner join, so only heroes present in both frames are kept.
df = info.merge(powers, how = 'inner', left_on = 'Hero', right_on = 'hero_names')
df.head()

Unnamed: 0,Gender,Race,Alignment,Hair color,Eye color,Skin color,Hero,Publisher,Height,Weight,...,Omnitrix,Thirstokinesis,Anti-Gravity,Hyperkinesis,Speed Force,Electrical Transport,Molecular Dissipation,Banish,Biokinesis,Changing Armor
0,Male,Human,good,No Hair,yellow,Unknown,A-Bomb,Marvel Comics,203.0,441.0,...,False,False,False,False,False,False,False,False,False,False
1,Male,Icthyo Sapien,good,No Hair,blue,blue,Abe Sapien,Dark Horse Comics,191.0,65.0,...,False,False,False,False,False,False,False,False,False,False
2,Male,Ungaran,good,No Hair,blue,red,Abin Sur,DC Comics,185.0,90.0,...,False,False,False,False,False,False,False,False,False,False
3,Male,Human / Radiation,bad,No Hair,green,Unknown,Abomination,Marvel Comics,203.0,441.0,...,False,False,False,False,False,False,False,False,False,False
4,Male,Human,bad,No Hair,blue,Unknown,Absorbing Man,Marvel Comics,193.0,122.0,...,False,False,False,False,False,False,False,False,False,False


## Questions

In [21]:
# Compare the weights of heroes with and without super speed
# The boolean mask is named has_super_speed rather than filter so that the
# built-in filter() is not shadowed for the rest of the notebook.
has_super_speed = df['Super Speed']

# Average weight of each group, rounded to two decimals for display
aw_w_ss = round(df.loc[has_super_speed, 'Weight'].mean(), 2)
aw_wo_ss = round(df.loc[~has_super_speed, 'Weight'].mean(), 2)

print(f'The heroes with super speed had an average weight of {aw_w_ss}.')
print(f'The heroes without super speed had an average weight of {aw_wo_ss}.')

The heroes with super speed had an average weight of 129.4.
The heroes without super speed had an average weight of 101.77.


In [26]:
# Rank the publishers by the mean height of their heroes, tallest first
mean_height_by_publisher = df.groupby('Publisher')['Height'].mean()
mean_height_by_publisher.round(2).sort_values(ascending = False)

Publisher
Image Comics         211.00
Marvel Comics        191.55
DC Comics            181.92
Star Trek            181.50
Team Epic TV         180.75
Unknown              178.00
Dark Horse Comics    176.91
Shueisha             171.50
George Lucas         159.60
Name: Height, dtype: float64