In [323]:
import pandas as pd
import numpy as np
import sklearn as sk
from sklearn.impute import KNNImputer
from sklearn.impute import SimpleImputer

Data set source: https://www.kaggle.com/datasets/jonathanbesomi/superheroes-nlp-dataset
Note that I did some of the data cleaning in Excel before importing the file into the Python project, so you need to load the Excel file provided, as the one downloaded directly from Kaggle won't be properly cleaned by the code below, as I wrote the code specifically for the already partially cleaned version of the data.

To be specific as to how I modified the Excel file prior to importing it:
In no particular order,
1. I filled in many of the missing values in the gender column, specifically the ones where the name made the gender
   obvious and/or I was familiar with the character
2. I filled in some of the missing values for race and alignment, going by either personal knowledge of particular
   characters, or by Googling the character
3. Similarly for the creator column
4. I replaced both the cells containing only "-" and the empty cells with cells containing "NaN".
5. I put both the weight and height data into common units, i.e. I converted any meter measurements to cm and any ton measurements to kg.
6. I deleted several of the rows that randomly contained character descriptions and no variable values, as well as empty
   rows
7. I deleted a few rows that contained very little information (i.e. most of the cells were blank)


Read in and clean data

In [None]:
dropRows = [55, 76, 78, 81, 410, 481, 486, 490, 645, 653, 657, 693,
            697, 840, 827, 830, 842, 844, 949, 950, 1059, 1063, 1077, 1079, 1081]
#original data set had nonsense rows

superheroData = \
    pd.read_excel("Superhero data.xlsx").\
    drop(dropRows).\
    drop(["full_name"], axis = 1).\
    dropna(how = "all").\
    reset_index()#.\
    #drop(["index"], axis = 1)

#Fix format of height and weight variables:
columnLength = 1408

for i in range(0, columnLength-1): #final row already has data in the correct form
        #height data is of the form "6'8 â€¢ 203 cm" which would be very difficult to work with
        #hence I'm extracting just the value in centimeters and converting to a float
        #similar for weight

        if not(str(superheroData["height"][i]) == "nan"):
            try:
                superheroData["height"][i] = float(str(superheroData["height"][i]).split()[2])
            except:
                superheroData["height"][i] = str(superheroData["height"][i]).replace(',', '')
                superheroData["height"][i] = float(superheroData["height"][i].split()[2])

        if not(str(superheroData["weight"][i]) == "nan"):
            try:
                superheroData["weight"][i] = float(str(superheroData["weight"][i]).split()[3])
            except:
                superheroData["weight"][i] = str(superheroData["weight"][i]).replace(',', '')
                superheroData["weight"][i] = float(superheroData["weight"][i].split()[3])

superheroData

Impute missing data values:

In [330]:
superheroData.isnull().sum()

index                   0
name                    0
real_name             137
intelligence_score     16
strength_score         16
                     ... 
has_super_speed        78
has_durability         78
has_stamina            78
has_agility            78
has_super_strength     78
Length: 68, dtype: int64

In [None]:
quantitativeVariables = superheroData.iloc[:, [3, 4, 5, 6, 7, 8, 14, 15]]

knnImputer = KNNImputer(n_neighbors = 10)

quantitativeVariablesFilled = \
    pd.DataFrame(knnImputer.fit_transform(quantitativeVariables)).\
    rename(columns = {0:"intelligence score", 1:"strength score",
                      2:"speed score", 3:"durability score",
                      4:"power score", 5: "combat score",
                      6:"height (cm)", 7:"weight (kg)" })

quantitativeVariablesFilled

In [327]:
temp = [i for i in range(16, 68)]

categoricalVariables = superheroData.iloc[:, [1, 2, 9, 10, 11, 12, 13] + temp]

simpleImputer = SimpleImputer(strategy = "most_frequent")

categoricalVariablesFilled = pd.DataFrame(simpleImputer.fit_transform(categoricalVariables))

categoricalVariablesFilled.columns = categoricalVariables.columns

categoricalVariablesFilled

Unnamed: 0,name,real_name,superpowers,creator,alignment,gender,type_race,eye_color,hair_color,has_electrokinesis,...,has_flight,has_accelerated_healing,has_weapons_master,has_intelligence,has_reflexes,has_super_speed,has_durability,has_stamina,has_agility,has_super_strength
0,3-D Man,"Delroy Garrett, Jr.","['Super Speed', 'Super Strength']",Marvel Comics,Good,Male,Human,Blue,Black,0,...,0,0,0,0,0,1,0,0,0,1
1,514A (Gotham),Bruce Wayne,"['Durability', 'Reflexes', 'Super Strength']",DC Comics,Good,Male,Human,Blue,Black,0,...,0,0,0,0,1,0,1,0,0,1
2,A-Bomb,Richard Milhouse Jones,"['Accelerated Healing', 'Agility', 'Berserk Mo...",Marvel Comics,Good,Male,Human,Yellow,No Hair,0,...,0,1,0,0,1,1,1,1,1,1
3,Aa,Aa,"['Energy Absorption', 'Energy Armor', 'Energy ...",DC Comics,Good,Male,Human,Blue,Black,0,...,0,0,0,0,0,0,0,0,0,0
4,Aaron Cash,Aaron Cash,"['weapon based Powers', 'Weapons Master']",DC Comics,Good,Male,Human,Blue,Black,0,...,0,0,1,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1403,Zane,Zane,"['Agility', 'Animal Control', 'Cold Resistance...",Marvel Comics,Good,Male,Android,Blue,White,0,...,1,0,1,1,1,0,1,1,1,1
1404,Zatanna,Zatanna Zatara,"['Cryokinesis', 'Fire Control', 'Magic', 'Prob...",DC Comics,Good,Female,Human,Blue,Black,0,...,0,0,0,0,0,0,0,0,0,0
1405,Zoom (New 52),Hunter Zolomon,"['Accelerated Healing', 'Agility', 'Durability...",DC Comics,Bad,Male,Metahuman,Red,Brown,1,...,0,1,0,1,1,1,1,1,1,1
1406,Zoom,Hunter Zolomon,"['Intangibility', 'Super Speed', 'Time Manipul...",DC Comics,Bad,Male,Metahuman,Red,Brown,0,...,0,0,0,0,0,1,0,0,0,0


In [329]:
superheroDataFilled = pd.concat([categoricalVariablesFilled, quantitativeVariables], axis = 1)
superheroDataFilled

Unnamed: 0,name,real_name,superpowers,creator,alignment,gender,type_race,eye_color,hair_color,has_electrokinesis,...,has_agility,has_super_strength,intelligence_score,strength_score,speed_score,durability_score,power_score,combat_score,height,weight
0,3-D Man,"Delroy Garrett, Jr.","['Super Speed', 'Super Strength']",Marvel Comics,Good,Male,Human,Blue,Black,0,...,0,1,85,30,60,60,40,70,,
1,514A (Gotham),Bruce Wayne,"['Durability', 'Reflexes', 'Super Strength']",DC Comics,Good,Male,Human,Blue,Black,0,...,0,1,100,20,30,50,35,100,,
2,A-Bomb,Richard Milhouse Jones,"['Accelerated Healing', 'Agility', 'Berserk Mo...",Marvel Comics,Good,Male,Human,Yellow,No Hair,0,...,1,1,80,100,80,100,100,80,203.0,441.0
3,Aa,Aa,"['Energy Absorption', 'Energy Armor', 'Energy ...",DC Comics,Good,Male,Human,Blue,Black,0,...,0,0,80,50,55,45,100,55,,
4,Aaron Cash,Aaron Cash,"['weapon based Powers', 'Weapons Master']",DC Comics,Good,Male,Human,Blue,Black,0,...,0,0,80,10,25,40,30,50,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1403,Zane,Zane,"['Agility', 'Animal Control', 'Cold Resistance...",Marvel Comics,Good,Male,Android,Blue,White,0,...,1,1,100,35,50,60,100,100,168.0,45.0
1404,Zatanna,Zatanna Zatara,"['Cryokinesis', 'Fire Control', 'Magic', 'Prob...",DC Comics,Good,Female,Human,Blue,Black,0,...,0,0,90,10,25,30,100,55,170.0,57.0
1405,Zoom (New 52),Hunter Zolomon,"['Accelerated Healing', 'Agility', 'Durability...",DC Comics,Bad,Male,Metahuman,Red,Brown,1,...,1,1,95,50,100,75,100,80,185.0,81.0
1406,Zoom,Hunter Zolomon,"['Intangibility', 'Super Speed', 'Time Manipul...",DC Comics,Bad,Male,Metahuman,Red,Brown,0,...,0,0,75,10,100,30,100,30,185.0,81.0


In [334]:
list(superheroDataFilled["superpowers"][1])

['[',
 "'",
 'D',
 'u',
 'r',
 'a',
 'b',
 'i',
 'l',
 'i',
 't',
 'y',
 "'",
 ',',
 ' ',
 "'",
 'R',
 'e',
 'f',
 'l',
 'e',
 'x',
 'e',
 's',
 "'",
 ',',
 ' ',
 "'",
 'S',
 'u',
 'p',
 'e',
 'r',
 ' ',
 'S',
 't',
 'r',
 'e',
 'n',
 'g',
 't',
 'h',
 "'",
 ']']