# Mutant Moneyball Dataset from Tidy Tuesday GitHub
- Purpose to practice my plotly, pandas, numpy, and sklearn skills (can also try statsmodels later for a more R like interface and statistical testing)

In [269]:
import numpy as np
import pandas as pd
import plotly.express as px
import sklearn as sk

In [270]:
# Read in data
mutants = pd.read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/main/data/2024/2024-03-19/mutant_moneyball.csv")

In [271]:
px.scatter(mutants, x='TotalIssues', y="TotalValue_heritage")

In [272]:
mutants.columns
mutants.isnull().sum()

Member                    0
TotalIssues               0
TotalIssues60s            0
TotalIssues70s            0
TotalIssues80s            0
TotalIssues90s            0
totalIssueCheck           0
TotalValue_heritage       0
TotalValue60s_heritage    0
TotalValue70s_heritage    0
TotalValue80s_heritage    0
TotalValue90s_heritage    0
TotalValue_ebay           0
TotalValue60s_ebay        0
TotalValue70s_ebay        0
TotalValue80s_ebay        0
TotalValue90s_ebay        0
60s_Appearance_Percent    0
70s_Appearance_Percent    0
80s_Appearance_Percent    0
90s_Appearance_Percent    0
PPI60s_heritage           0
PPI70s_heritage           0
PPI80s_heritage           0
PPI90s_heritage           0
PPI60s_ebay               0
PPI70s_ebay               0
PPI80s_ebay               0
PPI90s_ebay               0
TotalValue60s_wiz         0
TotalValue70s_wiz         0
TotalValue80s_wiz         0
TotalValue90s_wiz         0
TotalValue60s_oStreet     0
TotalValue70s_oStreet     0
TotalValue80s_oStree

In [273]:
mutants.drop('totalIssueCheck', axis=1, inplace=True)
member = mutants['Member']

In [274]:
mutants = mutants.replace({r'[%$]':''}, regex=True).apply(lambda x: pd.to_numeric(x) if type(x)=='object' else x)
mutants['Member'] = member
mutants.isna().sum()

Member                    0
TotalIssues               0
TotalIssues60s            0
TotalIssues70s            0
TotalIssues80s            0
TotalIssues90s            0
TotalValue_heritage       0
TotalValue60s_heritage    0
TotalValue70s_heritage    0
TotalValue80s_heritage    0
TotalValue90s_heritage    0
TotalValue_ebay           0
TotalValue60s_ebay        0
TotalValue70s_ebay        0
TotalValue80s_ebay        0
TotalValue90s_ebay        0
60s_Appearance_Percent    0
70s_Appearance_Percent    0
80s_Appearance_Percent    0
90s_Appearance_Percent    0
PPI60s_heritage           0
PPI70s_heritage           0
PPI80s_heritage           0
PPI90s_heritage           0
PPI60s_ebay               0
PPI70s_ebay               0
PPI80s_ebay               0
PPI90s_ebay               0
TotalValue60s_wiz         0
TotalValue70s_wiz         0
TotalValue80s_wiz         0
TotalValue90s_wiz         0
TotalValue60s_oStreet     0
TotalValue70s_oStreet     0
TotalValue80s_oStreet     0
TotalValue90s_oStree

In [275]:
subset = mutants.filter(regex='Total[^0-9]*$', axis = 1).copy()
subset['Member'] = mutants['Member']

In [276]:
subset = subset.apply(lambda x: pd.qcut(x, 4, labels=False) if np.issubdtype(x.dtype, np.number) else x).set_index("Member")
subset

Unnamed: 0_level_0,TotalIssues,TotalValue_heritage,TotalValue_ebay
Member,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
warrenWorthington,3,3,3
hankMcCoy,2,3,3
scottSummers,3,3,3
bobbyDrake,2,3,3
jeanGrey,3,3,3
alexSummers,1,1,1
lornaDane,1,2,2
ororoMunroe,3,2,2
kurtWagner,2,2,2
loganHowlett,3,2,2


In [277]:
px.bar(subset, x="TotalIssues", y="TotalValue_heritage", color="TotalValue_ebay")

Most of the cells above were doing some random stuff, but from now on I have one clear task. I want to perform unsupervised learning to learn the innate patterns of the data. I also want to use a hierarchical index to have each character and decade (or total) as another index with each stat then corresponding to that grouping.

In [278]:
# First task: forming hierarchical indexed data with each character and decade/total as a category
mutants.columns


Index(['Member', 'TotalIssues', 'TotalIssues60s', 'TotalIssues70s',
       'TotalIssues80s', 'TotalIssues90s', 'TotalValue_heritage',
       'TotalValue60s_heritage', 'TotalValue70s_heritage',
       'TotalValue80s_heritage', 'TotalValue90s_heritage', 'TotalValue_ebay',
       'TotalValue60s_ebay', 'TotalValue70s_ebay', 'TotalValue80s_ebay',
       'TotalValue90s_ebay', '60s_Appearance_Percent',
       '70s_Appearance_Percent', '80s_Appearance_Percent',
       '90s_Appearance_Percent', 'PPI60s_heritage', 'PPI70s_heritage',
       'PPI80s_heritage', 'PPI90s_heritage', 'PPI60s_ebay', 'PPI70s_ebay',
       'PPI80s_ebay', 'PPI90s_ebay', 'TotalValue60s_wiz', 'TotalValue70s_wiz',
       'TotalValue80s_wiz', 'TotalValue90s_wiz', 'TotalValue60s_oStreet',
       'TotalValue70s_oStreet', 'TotalValue80s_oStreet',
       'TotalValue90s_oStreet', 'PPI60s_wiz', 'PPI70s_wiz', 'PPI80s_wiz',
       'PPI90s_wiz', 'PPI60s_oStreet', 'PPI70s_oStreet', 'PPI80s_oStreet',
       'PPI90s_oStreet'],
      dtype

Distinct column vals: TotalIssues, TotalValue_heritage, TotalValue_ebay, Appearance_Percent, PPI_heritage, PPI_ebay, TotalValue_wiz, TotalValue_oStreet, PPI_wiz, PPI_oStreet

In [279]:
decades = pd.Series(["60s", "70s", "80s", "90s"])
h_index = pd.MultiIndex.from_product([mutants['Member'], decades])
h_index

newmutants = pd.DataFrame(
    {
        "issues": np.arange(104),
        "heritage_value": np.arange(104),
        "ebay_value": np.arange(104),
        "appearance_percent": np.arange(104),
        "ppi_heritage": np.arange(104),
        "ppi_ebay": np.arange(104),
        "wiz_value": np.arange(104),
        "ostreet_value": np.arange(104),
        "ppi_wiz": np.arange(104),
        "ppi_ostreet": np.arange(104),
    }, index=h_index
)

newmutants.index.names = ['Member', 'Decade']
for member in mutants['Member']:
    for decade in decades:
        decade_cols = [col for col in mutants.columns if decade in col]
        if decade_cols:
            newmutants.loc[(member, decade), 'issues'] = mutants.loc[mutants['Member'] == member, f'TotalIssues{decade}'].values[0]
            newmutants.loc[(member, decade), 'heritage_value'] = mutants.loc[mutants['Member'] == member, f'TotalValue{decade}_heritage'].values[0]
            newmutants.loc[(member, decade), 'ebay_value'] = mutants.loc[mutants['Member'] == member, f'TotalValue{decade}_ebay'].values[0]
            newmutants.loc[(member, decade), 'appearance_percent'] = mutants.loc[mutants['Member'] == member, f'{decade}_Appearance_Percent'].values[0]
            newmutants.loc[(member, decade), 'ppi_heritage'] = mutants.loc[mutants['Member'] == member, f'PPI{decade}_heritage'].values[0]
            newmutants.loc[(member, decade), 'ppi_ebay'] = mutants.loc[mutants['Member'] == member, f'PPI{decade}_ebay'].values[0]
            newmutants.loc[(member, decade), 'wiz_value'] = mutants.loc[mutants['Member'] == member, f'TotalValue{decade}_wiz'].values[0]
            newmutants.loc[(member, decade), 'ostreet_value'] = mutants.loc[mutants['Member'] == member, f'TotalValue{decade}_oStreet'].values[0]
            newmutants.loc[(member, decade), 'ppi_wiz'] = mutants.loc[mutants['Member'] == member, f'PPI{decade}_wiz'].values[0]
            newmutants.loc[(member, decade), 'ppi_ostreet'] = mutants.loc[mutants['Member'] == member, f'PPI{decade}_oStreet'].values[0]

newmutants.isnull().sum()


Setting an item of incompatible dtype is deprecated and will raise an error in a future version of pandas. Value '96.83' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.


Setting an item of incompatible dtype is deprecated and will raise an error in a future version of pandas. Value '15,230.43 ' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.


Setting an item of incompatible dtype is deprecated and will raise an error in a future version of pandas. Value '382.54 ' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.


Setting an item of incompatible dtype is deprecated and will raise an error in a future version of pandas. Value '7,913.00 ' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.


Setting an item of incompatible dtype is deprecated and will raise an error in a future version of pandas. Value '68,160.00 ' has dtype incompatible

issues                0
heritage_value        0
ebay_value            0
appearance_percent    0
ppi_heritage          0
ppi_ebay              0
wiz_value             0
ostreet_value         0
ppi_wiz               0
ppi_ostreet           0
dtype: int64

In [334]:
xmen = newmutants.reset_index().set_index('Member').loc[['charlesXavier', 'scottSummers','ororoMunroe', 'jeanGrey', 'loganHowlett']]
px.line(xmen.reset_index(), x="Decade", y="issues", color='Member')

In [348]:
px.funnel(xmen.reset_index(), x="Decade", y="issues", color="Member", title='Comic Issues Sold by Decade')