In [None]:
import pandas as pd
from sqlite3 import connect
pd.set_option('mode.chained_assignment', None) # gets rid of an unhelpful SettingWithCopyWarning later on down the line.

In [None]:
# This results.csv file to be cleaned and analyzed here was compiled by Mikhail Zhilkin and is hosted at https://data.world/cervus/sumo-japan
# Every column's dtype is spelled out so pandas does not have to infer it.
# NOTE(review): rikishi1_win is read as int64 but rikishi2_win is read as a string -- this looks
# unintentional; confirm before doing arithmetic or comparisons on rikishi2_win.
results_dtypes = {
    'basho': 'string',
    'day': 'string',
    'rikishi1_id': 'int64',
    'rikishi1_rank': 'string',
    'rikishi1_shikona': 'string',
    'rikishi1_result': 'string',
    'rikishi1_win': 'int64',
    'kimarite': 'string',
    'rikishi2_id': 'int64',
    'rikishi2_rank': 'string',
    'rikishi2_shikona': 'string',
    'rikishi2_result': 'string',
    'rikishi2_win': 'string',
}
results = pd.read_csv("data/results.csv", dtype=results_dtypes)
results

In [None]:
# Sumo wrestlers in these datasets can best be identified by their unique ID numbers. Although each wrestler has his own name (shikona), some wrestlers
# may reuse the same name, and other wrestlers may change names during their career. Since the ID number is the one identifier that remains stable,
# this is the identifier to which I anchor the analysis. Here I make a list of the unique ID numbers (in order of first appearance) for later use.

rikishi_id_list = list(results["rikishi1_id"].unique())

# Next, a dictionary that holds each wrestler's unique ID number as a key, with the value being a list of every name associated with that wrestler.
# The vast majority of ID numbers will only feature one name, but this makes sure any additional names are included as well.
#
# The first entry in every list is the name on the wrestler's LAST row in the file; the remaining entries are his other names in order of first
# appearance. Assuming the rows of results.csv are in chronological order, that first entry is the wrestler's *most recent* name.
# Easy to test on rikishi_id_dict[1111], which shows both names for this wrestler, "Ama" and "Harumafuji," with "Harumafuji" first.
# This is useful because when referring to a wrestler in the present day, one generally refers to them by their most current name.
#
# A single groupby pass replaces the earlier row-by-row loops (one scalar lookup per row plus a full iterrows() pass), which produced the same
# dictionary but was far slower on a file of this size.

rikishi_id_dict = {}

for rikishi_id, shikona_history in results.groupby("rikishi1_id", sort=False)["rikishi1_shikona"]:
    most_recent_shikona = shikona_history.iloc[-1]
    # unique() keeps order of first appearance; the most recent name is moved to the front.
    earlier_shikona = [name for name in shikona_history.unique() if name != most_recent_shikona]
    rikishi_id_dict[rikishi_id] = [most_recent_shikona] + earlier_shikona

rikishi_id_dict

In [None]:
# Now to create a dataframe from this dictionary, with one row per wrestler: the ID followed by every shikona (name) associated with that wrestler.
# As mentioned in the previous cell, the first name in each list is the most recent name the wrestler has used,
# which is why it becomes the default "shikona" column, while the other names land in the "alt_shikona" columns.
shikona_rows = [[rikishi_id, *names] for rikishi_id, names in rikishi_id_dict.items()]
shikona_columns = {0: "id", 1: "shikona", 2: "alt_shikona2", 3: "alt_shikona3"}
shikona_df = pd.DataFrame(shikona_rows).rename(columns=shikona_columns)
shikona_df.head()

In [None]:
# There are two lines for each match in the original csv file, one for the winner and one for the loser.
# So as not to show duplicates of every match, "sumo_only_wins" keeps only the rows where rikishi1 won.

is_win = results["rikishi1_win"] == 1
sumo_only_wins = results.loc[is_win]  # equivalent to results[results.rikishi1_win == 1]
sumo_only_wins

In [None]:
# Same as above, but "sumo_only_losses" keeps only the rows where rikishi1 lost.

is_loss = results["rikishi1_win"] == 0
sumo_only_losses = results.loc[is_loss]
sumo_only_losses

In [None]:
# For each wrestler's ID, this tallies up their total number of wins.
# value_counts() counts every ID in one pass over the winning rows; wrestlers in rikishi_id_list
# who never appear in sumo_only_wins get an explicit 0 so that every ID still has an entry.
wins_per_id = sumo_only_wins["rikishi1_id"].value_counts()
win_count = {r: int(wins_per_id.get(r, 0)) for r in rikishi_id_list}

# I now create a dataframe that sorts the win tallies from highest to lowest.
# sorted() is stable, so wrestlers with equal tallies keep their rikishi_id_list order.
sorted_win_count = sorted(win_count.items(), key=lambda x: x[1], reverse=True)

sorted_win_count_df = pd.DataFrame(sorted_win_count, columns=['id', 'wins'])
sorted_win_count_df.head()

In [None]:
# I now do the same thing but for each wrestler's total number of losses.
# value_counts() counts every ID in one pass over the losing rows; wrestlers in rikishi_id_list
# who never appear in sumo_only_losses get an explicit 0 so that every ID still has an entry.
losses_per_id = sumo_only_losses["rikishi1_id"].value_counts()
loss_count = {r: int(losses_per_id.get(r, 0)) for r in rikishi_id_list}

# Sorted from most losses to fewest; sorted() is stable, so ties keep their rikishi_id_list order.
sorted_loss_count = sorted(loss_count.items(), key=lambda x: x[1], reverse=True)

sorted_loss_count_df = pd.DataFrame(sorted_loss_count, columns=['id', 'losses'])
sorted_loss_count_df.head()

In [None]:
# I now combine the dataframe for wins with the dataframe for losses.
# "id" is the only column the two frames share, so it is the join key (inner join, the pandas default).
won_loss_df = sorted_win_count_df.merge(sorted_loss_count_df, on="id")
won_loss_df

In [None]:
# Now to add wins and losses to get the total number of matches for each wrestler.
# Each ID appears exactly once in won_loss_df, so the total is simply the row-wise sum of the two columns.
# assign() returns a new frame and overwrites any existing "matches" column, and the columns are ordered
# by name rather than by position, so re-running this cell gives the same result.

won_loss_df = won_loss_df.assign(matches=won_loss_df["wins"] + won_loss_df["losses"])
won_loss_df = won_loss_df[["id", "matches", "wins", "losses"]]
won_loss_df

In [None]:
# With the won-loss totals in a single dataframe, now I calculate each wrestler's winning percentage:
# wins / (wins + losses), rounded to two decimal places.
# .tolist() turns the columns into plain Python ints so that Python's built-in round() does the rounding
# (this keeps the values identical to the earlier per-row loop, without re-filtering the frame for every row).
pct_list = [
    round(wins / (wins + losses), 2)
    for wins, losses in zip(won_loss_df["wins"].tolist(), won_loss_df["losses"].tolist())
]

# assign() returns a new frame and overwrites any existing "pct" column, so the cell can be re-run safely.
won_loss_df = won_loss_df.assign(pct=pct_list)
won_loss_df

In [None]:
# This joins won_loss_df to shikona_df on the wrestler ID (the only column they share),
# then moves the current shikona next to the ID and leaves the alternate names at the end.

won_loss_column_order = ["id", "shikona", "matches", "wins", "losses", "pct", "alt_shikona2", "alt_shikona3"]
new_won_loss_df = won_loss_df.merge(shikona_df, on="id")[won_loss_column_order]
new_won_loss_df.head()

In [None]:
# The information in rikishi1_rank includes a designation of "e" or "w" ("east" or "west" on the banzuke), but as that is immaterial to ranking, we can strip it out.
# A simple excision of the final character doesn't work, so I can't do the following: results.rikishi1_rank.str[:-1]
# See, some rankings appear on the spreadsheet with an "HD" at the end, so I have to work with a list of substrings to remove in order to clean this out.

char_to_remove = ["e", "w", "HD"]

# Abbreviation -> full rank name, applied as literal (non-regex) substitutions IN THIS ORDER.
rank_replacements = [
    ("Ms", "makushita"),  # temporarily lowercase so as not to affect "M" in "Maegashira"
    ("J", "Juryo"),
    ("M", "Maegashira"),
    ("makushita", "Makushita"),  # now that "Maegashira" is done, change back to uppercase
    ("Y1YO", "Yokozuna"),  # usually "Yokozuna" is either "Y1" or "Y2," but this is one outlier in the data
    ("Y2YO", "Yokozuna"),
    ("Y1", "Yokozuna"),
    ("Y2", "Yokozuna"),
    ("O1", "Ozeki"),
    ("O2", "Ozeki"),
    ("O3", "Ozeki"),
    ("S1", "Sekiwake"),
    ("S2", "Sekiwake"),
    ("S3", "Sekiwake"),
    ("K1", "Komusubi"),
    ("K2", "Komusubi"),
    ("K3", "Komusubi"),
]
full_rank_names = ["Yokozuna", "Ozeki", "Sekiwake", "Komusubi", "Maegashira", "Juryo", "Makushita"]

# Because this cell overwrites results["rikishi1_rank"] (see below), a second run would see already-cleaned
# values and strip the "e"/"w" out of them ("Maegashira" -> "Magashira"). Values that already start with a
# full rank name are therefore left untouched, which makes the cell safe to re-run.
current_ranks = results["rikishi1_rank"]
already_cleaned = current_ranks.str.match("|".join(full_rank_names), na=False).astype(bool)

cleaned_rankings = current_ranks
for character in char_to_remove:
    cleaned_rankings = cleaned_rankings.str.replace(character, "", regex=False)
for abbreviation, full_name in rank_replacements:
    cleaned_rankings = cleaned_rankings.str.replace(abbreviation, full_name, regex=False)
cleaned_rankings = cleaned_rankings.where(~already_cleaned, current_ranks)

# NOTE: results_cleaned_rank is the SAME object as results (no copy is made), so this also rewrites
# results["rikishi1_rank"]. Later cells read the cleaned ranks through `results`, so this is kept as is.
results_cleaned_rank = results
results_cleaned_rank["rikishi1_rank"] = cleaned_rankings
results_cleaned_rank



In [None]:
# Here I want to create a dictionary that lists each wrestler by ID number as a key, and the value for that key is a list of each rank that wrestler
# had during the top-division matches covered by the data. Obviously, each wrestler had a career in the lower ranks before reaching the top division
# in the first place, and that is not reflected here. However, the main goal with this is to eventually determine the best rank for every top-division
# wrestler, so the focus will eventually be on the higher ranks anyway.
#
# This reads the cleaned rank strings from results_cleaned_rank (built in the rank-cleaning cell above).
# Repetitions of a rank won't be necessary, thus unique() is sufficient; it keeps the ranks in order of first appearance.
# One groupby pass replaces a full scan of the frame for every wrestler.

ranks_by_id = results_cleaned_rank.groupby("rikishi1_id", sort=False)["rikishi1_rank"].unique()
rank_dict = {r: list(ranks_by_id.loc[r]) for r in rikishi_id_list}

rank_dict

In [None]:
# This next section trims down each list of ranks that a wrestler has had, keeping only the highest rank. So the value of the dictionary key,
# which is a list of multiple ranks, is converted into a list with a single entry, that of the highest rank achieved.
# The value is deliberately kept as a list (not a bare string) and every check is an exact list-membership test:
# testing `"Maegashira1" in some_string` would also match "Maegashira10", "Maegashira11", etc.

# Every rank recognised here, from highest to lowest: the four named ranks, then Maegashira1-30, Juryo1-30 and Makushita1-30.
rank_order = ["Yokozuna", "Ozeki", "Sekiwake", "Komusubi"] + [
    f"{division}{number}"
    for division in ("Maegashira", "Juryo", "Makushita")
    for number in range(1, 31)
]

rank_dict_temp = {}

for key, value in rank_dict.items():
    # First (i.e. highest) rank in rank_order that this wrestler has held.
    best_rank = next((rank for rank in rank_order if rank in value), None)
    # If none of the wrestler's ranks is in rank_order, the list is passed through unchanged.
    rank_dict_temp[key] = [best_rank] if best_rank is not None else value

# If a plain {id: rank string} dictionary is ever needed instead of single-entry lists, this does it.
# It is not necessary for the dataframe below, so it is left here only for reference:
# top_rank_dict = {key: value[0] for key, value in rank_dict_temp.items()}

# This creates a dataframe with each wrestler's ID as well as the top rank achieved by that wrestler.
top_rank_df = pd.DataFrame.from_dict(rank_dict_temp, orient="index")
top_rank_df = top_rank_df.reset_index()
top_rank_df.columns = ["id", "top_rank"]
top_rank_df

In [None]:
# Now to merge the dataframe with the top rank into the main won-loss dataframe, joining on the wrestler ID.
# Any "top_rank" column left over from an earlier run of this cell is dropped first, and the columns are
# ordered by name rather than by position, so re-running the cell gives the same result.
ranked_column_order = ["id", "shikona", "matches", "wins", "losses", "pct", "top_rank", "alt_shikona2", "alt_shikona3"]
new_won_loss_df = new_won_loss_df.drop(columns="top_rank", errors="ignore").merge(top_rank_df, on="id")
new_won_loss_df = new_won_loss_df[ranked_column_order]
new_won_loss_df

In [None]:
# Dictionary with each wrestler's ID as key and, as value, the list of "kimarite" entries from every row in
# which that wrestler is rikishi1 (wins and losses alike, in file order, repeats included).
# Each wrestler gets his OWN list: previously one list was never reset between wrestlers, so every key
# ended up pointing at the same list holding the kimarite of all wrestlers combined.
kimarite_by_id = results.groupby("rikishi1_id", sort=False)["kimarite"].agg(list)
kimarite_dict = {r: kimarite_by_id.loc[r] for r in rikishi_id_list}

# Example: list every wrestler who has at least one "yorikiri" entry.
# for key, value in kimarite_dict.items():
#     if "yorikiri" in value:
#         print(key, value)

In [None]:
# Every wrestler with 500+ wins in the data, showing only the first six columns of new_won_loss_df,
# ordered from the highest winning percentage to the lowest.

has_500_wins = new_won_loss_df["wins"] >= 500
with_500_wins = (
    new_won_loss_df.loc[has_500_wins]
    .sort_values(by='wins', ascending=False)
    .iloc[:, :6]
    .sort_values(by='pct', ascending=False)
)
with_500_wins

In [None]:
# dff = new_won_loss_df.loc[(new_won_loss_df.wins + new_won_loss_df.losses)>=500].sort_values(by='pct', ascending=False)
# dff = dff.iloc[:, [0,1,2,3,4,5]]
# dff

# dff.to_csv("data/won_loss_df.csv", index=False)
# dff

<!-- TO DO: re-do all those win/loss things, but with ID instead of Shikona -->