In [13]:
import pandas as pd

# Load the bout-level results file with an explicit dtype per column, grouped here by target dtype.
# NOTE(review): "rikishi2_win" is read as a string while "rikishi1_win" is read as int64 — confirm this is intended.
string_columns = [
    'basho', 'day',
    'rikishi1_rank', 'rikishi1_shikona', 'rikishi1_result',
    'kimarite',
    'rikishi2_rank', 'rikishi2_shikona', 'rikishi2_result', 'rikishi2_win',
]
int_columns = ['rikishi1_id', 'rikishi1_win', 'rikishi2_id']

results = pd.read_csv(
    "data/results.csv",
    dtype={**dict.fromkeys(string_columns, 'string'), **dict.fromkeys(int_columns, 'int64')},
)

In [14]:
# Preview the loaded bouts; the rendered table is truncated to the first and last rows.
results

Unnamed: 0,basho,day,rikishi1_id,rikishi1_rank,rikishi1_shikona,rikishi1_result,rikishi1_win,kimarite,rikishi2_id,rikishi2_rank,rikishi2_shikona,rikishi2_result,rikishi2_win
0,1983.01,1,4140,J13w,Chikubayama,0-1 (7-8),0,yorikiri,4306,Ms1e,Ofuji,1-0 (6-1),1
1,1983.01,1,4306,Ms1e,Ofuji,1-0 (6-1),1,yorikiri,4140,J13w,Chikubayama,0-1 (7-8),0
2,1983.01,1,1337,J12w,Tochitsukasa,1-0 (9-6),1,oshidashi,4323,J13e,Shiraiwa,0-1 (3-12),0
3,1983.01,1,4323,J13e,Shiraiwa,0-1 (3-12),0,oshidashi,1337,J12w,Tochitsukasa,1-0 (9-6),1
4,1983.01,1,4097,J12e,Tamakiyama,0-1 (8-7),0,yorikiri,4319,J11w,Harunafuji,1-0 (5-10),1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
227767,2023.07,15,11985,S1w,Daieisho,9-6,0,hikiotoshi,11855,M9e,Takanosho,8-7,1
227768,2023.07,15,12451,S1e,Hoshoryu,12-3,1,uwatenage,12796,M17w,Hakuoho,11-4,0
227769,2023.07,15,12796,M17w,Hakuoho,11-4,0,uwatenage,12451,S1e,Hoshoryu,12-3,1
227770,2023.07,15,12094,K1w,Abi,6-9,1,yorikiri,12231,O1w,Kirishima,6-7-2,0


In [15]:
# Sumo wrestlers in these datasets are best identified by their unique ID numbers. A shikona (ring name) is not a
# reliable key: different wrestlers may reuse the same name, and one wrestler may change names during his career.
# The ID number is the one identifier that remains stable, so the rest of the analysis is anchored to it.

# Every distinct wrestler ID, in order of first appearance in the results, kept for later use.
rikishi_id_list = results["rikishi1_id"].unique().tolist()

# Map each wrestler's ID to the list of every shikona recorded for him.
# The vast majority of IDs carry a single name, but any additional names are included as well.
#
# The first entry in every list is the name on the wrestler's *last* row in the results, i.e. his most recent name
# (assuming the file is in chronological order, as the preview of `results` suggests). Any other names follow in
# order of first appearance. Easy to check on rikishi_id_dict[1111], which holds both "Ama" and "Harumafuji", with
# rikishi_id_dict[1111][0] being "Harumafuji". This matters because a wrestler is generally referred to by his
# most current name, so that is the name this dictionary is used to look up.
#
# A single grouped pass replaces two Python-level loops over every row of `results`; sort=False keeps the
# dictionary keys in order of first appearance.
rikishi_id_dict = {}

for rikishi_id, shikona in results.groupby("rikishi1_id", sort=False)["rikishi1_shikona"]:
    current_shikona = shikona.iloc[-1]
    earlier_shikona = [name for name in shikona.unique() if name != current_shikona]
    rikishi_id_dict[int(rikishi_id)] = [current_shikona, *earlier_shikona]

rikishi_id_dict

{4140: ['Chikubayama'],
 4306: ['Ofuji'],
 1337: ['Tochitsukasa'],
 4323: ['Shiraiwa'],
 4097: ['Tamakiyama'],
 4319: ['Harunafuji'],
 4109: ['Kotogatake'],
 4129: ['Hakuryuyama'],
 4111: ['Zaonishiki'],
 4130: ['Hachiya'],
 1302: ['Takamisugi'],
 4318: ['Tengoyama'],
 1361: ['Daitetsu'],
 4316: ['Kakureizan'],
 1385: ['Tochiakagi'],
 4125: ['Shinko'],
 1347: ['Itai'],
 4311: ['Hakuryu'],
 1332: ['Sakahoko'],
 4322: ['Sanofuji'],
 1380: ['Hidanohana'],
 4128: ['Kotochitose'],
 1350: ['Onokuni'],
 1359: ['Koboyama'],
 4087: ['Wakajishi'],
 4091: ['Washuyama'],
 1342: ['Asahifuji'],
 4103: ['Banryuyama'],
 4122: ['Kurosegawa'],
 4131: ['Tochitsurugi'],
 1348: ['Jingaku'],
 4124: ['Shishiho'],
 1334: ['Wakasegawa'],
 4118: ['Amanoyama'],
 1345: ['Tamaryu'],
 4100: ['Kaneshiro', 'Tochihikari'],
 1352: ['Tagaryu'],
 4132: ['Wakanofuji'],
 4050: ['Takamiyama'],
 4117: ['Misugiiso', 'Azumanada'],
 1378: ['Hoo'],
 4133: ['Takarakuni', 'Saisu'],
 1338: ['Ozutsu'],
 4077: ['Fujizakura'],
 1375: 

In [16]:
# Create a dataframe from the ID -> names dictionary: one row per wrestler, holding his ID followed by every
# shikona (name) associated with him. Wrestlers with fewer names than the longest list get missing values.
shikona_rows = [[rikishi_id, *names] for rikishi_id, names in rikishi_id_dict.items()]
shikona_df = pd.DataFrame(shikona_rows)

# Name the columns by position: the ID, the current shikona, then "alt_shikona2", "alt_shikona3", ... for however
# many extra names the widest row has, so no column is left with a bare integer label.
column_names = {0: "id", 1: "current_shikona"}
column_names.update({position: f"alt_shikona{position}" for position in range(2, shikona_df.shape[1])})
shikona_df = shikona_df.rename(columns=column_names)
shikona_df.head()

Unnamed: 0,id,current_shikona,alt_shikona2,alt_shikona3
0,4140,Chikubayama,,
1,4306,Ofuji,,
2,1337,Tochitsukasa,,
3,4323,Shiraiwa,,
4,4097,Tamakiyama,,


In [17]:
# The original csv file has two lines for each match: one from the winner's side and one from the loser's side.
# To avoid showing every match twice, "sumo_only_wins" keeps only the rows where rikishi1 is the winner.

sumo_only_wins = results.loc[results["rikishi1_win"] == 1]

sumo_only_wins

Unnamed: 0,basho,day,rikishi1_id,rikishi1_rank,rikishi1_shikona,rikishi1_result,rikishi1_win,kimarite,rikishi2_id,rikishi2_rank,rikishi2_shikona,rikishi2_result,rikishi2_win
1,1983.01,1,4306,Ms1e,Ofuji,1-0 (6-1),1,yorikiri,4140,J13w,Chikubayama,0-1 (7-8),0
2,1983.01,1,1337,J12w,Tochitsukasa,1-0 (9-6),1,oshidashi,4323,J13e,Shiraiwa,0-1 (3-12),0
5,1983.01,1,4319,J11w,Harunafuji,1-0 (5-10),1,yorikiri,4097,J12e,Tamakiyama,0-1 (8-7),0
7,1983.01,1,4129,J11e,Hakuryuyama,1-0 (3-12),1,tsukidashi,4109,J10w,Kotogatake,0-1 (7-8),0
9,1983.01,1,4130,J9w,Hachiya,1-0 (8-7),1,hatakikomi,4111,J10e,Zaonishiki,0-1 (4-11),0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
227763,2023.07,15,12270,K1e,Kotonowaka,11-4,1,yorikiri,6594,M15e,Ryuden,10-5,0
227765,2023.07,15,12291,M4e,Asanoyama,8-4-3,1,yorikiri,11980,S2w,Wakamotoharu,9-6,0
227766,2023.07,15,11855,M9e,Takanosho,8-7,1,hikiotoshi,11985,S1w,Daieisho,9-6,0
227768,2023.07,15,12451,S1e,Hoshoryu,12-3,1,uwatenage,12796,M17w,Hakuoho,11-4,0


In [18]:
# The mirror image of the winners' table: "sumo_only_losses" keeps only the rows where rikishi1 is the loser.
sumo_only_losses = results.loc[results["rikishi1_win"] == 0]

sumo_only_losses

Unnamed: 0,basho,day,rikishi1_id,rikishi1_rank,rikishi1_shikona,rikishi1_result,rikishi1_win,kimarite,rikishi2_id,rikishi2_rank,rikishi2_shikona,rikishi2_result,rikishi2_win
0,1983.01,1,4140,J13w,Chikubayama,0-1 (7-8),0,yorikiri,4306,Ms1e,Ofuji,1-0 (6-1),1
3,1983.01,1,4323,J13e,Shiraiwa,0-1 (3-12),0,oshidashi,1337,J12w,Tochitsukasa,1-0 (9-6),1
4,1983.01,1,4097,J12e,Tamakiyama,0-1 (8-7),0,yorikiri,4319,J11w,Harunafuji,1-0 (5-10),1
6,1983.01,1,4109,J10w,Kotogatake,0-1 (7-8),0,tsukidashi,4129,J11e,Hakuryuyama,1-0 (3-12),1
8,1983.01,1,4111,J10e,Zaonishiki,0-1 (4-11),0,hatakikomi,4130,J9w,Hachiya,1-0 (8-7),1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
227762,2023.07,15,6594,M15e,Ryuden,10-5,0,yorikiri,12270,K1e,Kotonowaka,11-4,1
227764,2023.07,15,11980,S2w,Wakamotoharu,9-6,0,yorikiri,12291,M4e,Asanoyama,8-4-3,1
227767,2023.07,15,11985,S1w,Daieisho,9-6,0,hikiotoshi,11855,M9e,Takanosho,8-7,1
227769,2023.07,15,12796,M17w,Hakuoho,11-4,0,uwatenage,12451,S1e,Hoshoryu,12-3,1


In [19]:
# For each wrestler's ID, tally his total number of wins.
# One value_counts() over the winners' rows replaces a full-frame boolean scan per wrestler; IDs with no
# winning row are filled in with 0 so every ID in rikishi_id_list gets an entry.
wins_per_id = sumo_only_wins["rikishi1_id"].value_counts()
win_count = {r: int(wins_per_id.get(r, 0)) for r in rikishi_id_list}

# Sort the tallies from highest to lowest (ties keep their rikishi_id_list order) and put them in a dataframe.
sorted_win_count = sorted(win_count.items(), key=lambda item: item[1], reverse=True)

sorted_win_count_df = pd.DataFrame(sorted_win_count, columns=['id', 'wins'])
sorted_win_count_df.head()


Unnamed: 0,id,wins
0,1123,1121
1,7,956
2,89,831
3,41,827
4,39,781


In [20]:
# For each wrestler's ID, tally his total number of losses.
# One value_counts() over the losers' rows replaces a full-frame boolean scan per wrestler; IDs with no
# losing row are filled in with 0 so every ID in rikishi_id_list gets an entry.
losses_per_id = sumo_only_losses["rikishi1_id"].value_counts()
loss_count = {r: int(losses_per_id.get(r, 0)) for r in rikishi_id_list}

# Sort the tallies from highest to lowest (ties keep their rikishi_id_list order) and put them in a dataframe.
sorted_loss_count = sorted(loss_count.items(), key=lambda item: item[1], reverse=True)

sorted_loss_count_df = pd.DataFrame(sorted_loss_count, columns=['id', 'losses'])
sorted_loss_count_df.head()

Unnamed: 0,id,losses
0,41,888
1,89,869
2,33,855
3,1284,744
4,13,731


In [21]:
# Combine the dataframe of wins with the dataframe of losses, matching rows on their shared "id" column.
won_loss_df = sorted_win_count_df.merge(sorted_loss_count_df, on="id")
won_loss_df

Unnamed: 0,id,wins,losses
0,1123,1121,212
1,7,956,642
2,89,831,869
3,41,827,888
4,39,781,692
...,...,...,...
592,11965,0,1
593,12141,0,1
594,12369,0,1
595,11949,0,2


In [22]:
# With the won-loss totals in a single dataframe, calculate each wrestler's winning percentage:
# wins / (wins + losses), rounded to two decimal places.
# Each row is read once, instead of re-filtering the whole dataframe by ID three times per row.
pct_list = []

for wins, losses in zip(won_loss_df["wins"].tolist(), won_loss_df["losses"].tolist()):
    bouts = wins + losses
    # A wrestler with no counted wins or losses has no meaningful percentage; record NaN rather than divide by zero.
    pct_list.append(round(wins / bouts, 2) if bouts else float("nan"))

won_loss_df["pct"] = pct_list
won_loss_df


Unnamed: 0,id,wins,losses,pct
0,1123,1121,212,0.84
1,7,956,642,0.60
2,89,831,869,0.49
3,41,827,888,0.48
4,39,781,692,0.53
...,...,...,...,...
592,11965,0,1,0.00
593,12141,0,1,0.00
594,12369,0,1,0.00
595,11949,0,2,0.00


In [26]:
# Merge the won-loss table with the shikona table on the wrestler ID, so each record carries the wrestler's names.
merged_won_loss = pd.merge(won_loss_df, shikona_df, on="id")

# Put the ID and current shikona first, then the record, then any alternate shikona columns.
# Columns are chosen by name, so the result does not depend on how many alternate-name columns shikona_df has.
leading_columns = ["id", "current_shikona", "wins", "losses", "pct"]
trailing_columns = [column for column in merged_won_loss.columns if column not in leading_columns]
new_won_loss_df = merged_won_loss[leading_columns + trailing_columns]
new_won_loss_df.head()


Unnamed: 0,id,current_shikona,wins,losses,pct,alt_shikona2,alt_shikona3
0,1123,Hakuho,1121,212,0.84,,
1,7,Kaio,956,642,0.6,Koga,
2,89,Aminishiki,831,869,0.49,,
3,41,Kyokutenho,827,888,0.48,,
4,39,Wakanosato,781,692,0.53,Kogawa,


In [27]:
# Every wrestler with 500+ wins in this dataset, most wins first.
# NOTE(review): the preview of `results` includes J- and Ms-ranked rows, so these totals may not be limited to
# top-division bouts — confirm before describing them as top-division wins.
#
# The row filter is built from new_won_loss_df itself (not from won_loss_df), so it cannot drift out of step
# with the rows it selects; the columns are picked by name rather than by position.
with_500_wins = (
    new_won_loss_df
    .loc[new_won_loss_df["wins"] >= 500, ["id", "current_shikona", "wins", "losses", "pct"]]
    .sort_values(by='wins', ascending=False)
)
with_500_wins

Unnamed: 0,id,current_shikona,wins,losses,pct
0,1123,Hakuho,1121,212,0.84
1,7,Kaio,956,642,0.6
2,89,Aminishiki,831,869,0.49
3,41,Kyokutenho,827,888,0.48
4,39,Wakanosato,781,692,0.53
5,1226,Kotoshogiku,760,639,0.54
6,1111,Harumafuji,752,402,0.65
7,2,Takanohana,751,253,0.75
8,1235,Kisenosato,742,472,0.61
9,33,Terao,740,855,0.46


<!-- TO DO: re-do all those win/loss things, but with ID instead of Shikona -->