In [1]:
import pandas as pd

## investigations:

#### problem:

In [2]:
# Event counts per (match_id, person_id): one column per event type, holding
# how many "time" entries were recorded; missing combinations become 0.
event_df = pd.pivot_table(
    pd.read_hdf("event_df.h5"),
    values="time",
    index=["match_id", "person_id"],
    columns=["event"],
    aggfunc="count",
).fillna(0)

In [8]:
# Rows where any single card type was counted more than once for the same
# player in the same match, ordered by red / second-yellow counts (descending).
event_df.loc[
    lambda counts: counts[["2nd-y-card", "r-card", "y-card"]].gt(1).any(axis=1)
].sort_values(by=["r-card", "2nd-y-card"], ascending=False).head(10)

Unnamed: 0_level_0,event,2nd-y-card,goal,own-goal,penalty-goal,penalty-miss,r-card,y-card
match_id,person_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
651518,12660,0.0,0.0,0.0,0.0,0.0,2.0,0.0
658559,49737,0.0,0.0,0.0,0.0,0.0,2.0,0.0
672051,3723,0.0,0.0,0.0,0.0,0.0,2.0,0.0
795242,12917,0.0,0.0,0.0,0.0,0.0,2.0,0.0
817867,61428,0.0,0.0,0.0,0.0,0.0,2.0,0.0
818056,18437,0.0,0.0,0.0,0.0,0.0,2.0,0.0
818084,18017,0.0,0.0,0.0,0.0,0.0,2.0,0.0
639041,12430,0.0,0.0,0.0,0.0,0.0,1.0,2.0
795242,12718,2.0,0.0,0.0,0.0,0.0,0.0,2.0
659046,12117,1.0,0.0,0.0,0.0,0.0,0.0,2.0


- small data error: https://int.soccerway.com/matches/2007/12/30/england/premier-league/derby-county-fc/blackburn-rovers-football-club/458690/
- or https://int.soccerway.com/matches/2008/04/26/england/premier-league/chelsea-football-club/manchester-united-fc/458849/
- bigger data error: 
  - https://int.soccerway.com/matches/2008/08/28/europe/uefa-cup/ac-omonia-nicosia/aek-athens-fc/672051/
  - https://int.soccerway.com/matches/2010/03/19/poland/ekstraklasa/ks-ruch-chorzow/gks-belchatow/795242/
- severely inconsistent record: https://int.soccerway.com/matches/2010/05/23/italy/serie-b/as-cittadella/fc-crotone/818084/

#### solution

In [11]:
# Same per-(match, player) event counts as before, but every card column is
# collapsed to a 0/1 flag so repeated card events no longer count twice.
event_df = (
    pd.read_hdf("event_df.h5")
    .pivot_table(
        values="time",
        index=["match_id", "person_id"],
        columns=["event"],
        aggfunc="count",
    )
    .fillna(0)
    .assign(
        **{
            # `card=card` binds the column name at definition time; without it
            # every lambda would see the last value of the loop variable.
            card: (lambda counts, card=card: counts[card].gt(0).astype(int))
            for card in ("y-card", "2nd-y-card", "r-card")
        }
    )
)

In [12]:
# Re-run the multi-card check on the flagged frame: rows with any card
# column above 1, ordered by red / second-yellow values (descending).
event_df.loc[
    lambda flags: flags[["2nd-y-card", "r-card", "y-card"]].gt(1).any(axis=1)
].sort_values(by=["r-card", "2nd-y-card"], ascending=False).head(10)

Unnamed: 0_level_0,event,2nd-y-card,goal,own-goal,penalty-goal,penalty-miss,r-card,y-card
match_id,person_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1


#### problem:

In [5]:
# Number of repeated key values in each table, in the order listed below.
tuple(
    pd.read_hdf(h5_file)[key_col].duplicated().sum()
    for h5_file, key_col in [
        ("match_df.h5", "match_id"),
        ("match_info_df.h5", "match_id"),
        ("competition_df.h5", "comp_id"),
        ("region_df.h5", "region_id"),
        ("season_df.h5", "season_id"),
        ("round_df.h5", "round_id"),
    ]
)

(0, 0, 0, 0, 16, 349)

In [6]:
# For every round_id that occurs more than once: the share of its rows where
# season_id equals round_id ("mean") and how many rows it has ("count").
# drop_duplicates() then collapses ids that share the same (mean, count) pair.
(
    pd.read_hdf("round_df.h5")
    .loc[lambda rounds: rounds.duplicated(subset="round_id", keep=False)]
    .sort_values(by="round_id")
    .assign(seas_eq=lambda rounds: rounds["season_id"].eq(rounds["round_id"]))
    .groupby("round_id")["seas_eq"]
    .agg(["mean", "count"])
    .drop_duplicates()
)

Unnamed: 0_level_0,mean,count
round_id,Unnamed: 1_level_1,Unnamed: 2_level_1
165,0.5,2


In [84]:
# Every season row whose season_id appears on more than one row.
pd.read_hdf("season_df.h5").loc[
    lambda seasons: seasons.duplicated(subset="season_id", keep=False)
]

Unnamed: 0,season_name,season_id,comp_id
14,2011 Clausura,5686,90
15,2011 Apertura,5686,90
17,2009 Clausura,3264,90
18,2009 Apertura,3264,90
18,2010/2011 Apertura,5353,525
19,2010/2011 Clausura,5353,525
1,2018,13604,415
2,2018,13604,415
20,2009 Clausura,3278,91
21,2009 Apertura,3278,91


#### solution:

In [20]:
# Among rows where season_id equals round_id, the fraction whose round_id is
# duplicated. When this is below 1, such rows cannot all be discarded as
# duplicates; only the duplicated ones should go.
(
    pd.read_hdf("round_df.h5")
    .assign(duped=lambda rounds: rounds.duplicated(subset="round_id", keep=False))
    .loc[lambda rounds: rounds["season_id"].eq(rounds["round_id"]), "duped"]
    .mean()
)

0.6622390891840607

In [23]:
# Rows that have a duplicated round_id AND season_id equal to round_id.
pd.read_hdf("round_df.h5").loc[
    lambda rounds: rounds.duplicated(subset="round_id", keep=False)
    & rounds["season_id"].eq(rounds["round_id"])
]

Unnamed: 0,round_id,round_name,season_id
0,17693,,17693
0,6925,,6925
0,5071,,5071
0,9341,,9341
0,1686,,1686
...,...,...,...
0,12403,,12403
0,13548,,13548
0,14405,,14405
0,16475,,16475


In [24]:
# Complement of "season_id == round_id AND round_id duplicated": keep a row
# when its season_id differs from its round_id, or its round_id is unique.
pd.read_hdf("round_df.h5").loc[
    lambda rounds: rounds["season_id"].ne(rounds["round_id"])
    | ~rounds.duplicated(subset="round_id", keep=False)
]

Unnamed: 0,round_id,round_name,season_id
0,18495,,18495
0,9144,,9144
0,14375,,14375
0,26875,Final,9501
1,26873,Semi-finals,9501
...,...,...,...
2,14502,1st Round,5927
0,30217,Final,10981
1,30216,Semi-finals,10981
2,30215,Group Stage,10981


In [16]:
# After keeping one row per (season_id, comp_id) pair, list any season_id
# that still occurs on more than one row.
(
    pd.read_hdf("season_df.h5")
    .drop_duplicates(subset=["season_id", "comp_id"])
    .loc[lambda seasons: seasons.duplicated(subset="season_id", keep=False)]
)

Unnamed: 0,season_name,season_id,comp_id


#### problem

In [15]:
# Number of coach rows that are exact copies of an earlier row.
(
    pd.read_hdf("coach_df.h5")
    .duplicated(keep="first")
    .sum()
)

128

- Additional examples where the source data itself looks plainly wrong:
  - https://int.soccerway.com/matches/2005/09/25/turkey/super-lig/diyarbakirspor/mke-ankaragucu/285687/
  - https://int.soccerway.com/matches/2010/05/05/slovakia/super-liga/mfk-dubnica/mk-ilina/800776/

#### problem

In [27]:
# Stack lineup rows and substitute rows (with sub_id renamed to person_id),
# then count rows that repeat an earlier (person_id, match_id, side) triple.
lineup_rows = pd.read_hdf("lineup_df.h5").drop_duplicates()
sub_rows = (
    pd.read_hdf("sub_df.h5")
    .drop_duplicates()
    .rename(columns={"sub_id": "person_id"})
)
(
    pd.concat([lineup_rows, sub_rows])
    .duplicated(subset=["person_id", "match_id", "side"])
    .sum()
)

211

https://int.soccerway.com/matches/2011/12/07/spain/primera-division/granada-club-de-futbol/real-club-deportivo-mallorca/1140756/

#### solution

In [None]:
# Stack lineup rows and substitute rows (with sub_id renamed to person_id) and
# keep only the first row per (person_id, match_id, side) triple. Lineup rows
# come first in the concat, so they win over a matching substitute row.
lineup_rows = pd.read_hdf("lineup_df.h5").drop_duplicates()
sub_rows = (
    pd.read_hdf("sub_df.h5")
    .drop_duplicates()
    .rename(columns={"sub_id": "person_id"})
)
pd.concat([lineup_rows, sub_rows]).drop_duplicates(
    subset=["person_id", "match_id", "side"], keep="first"
)

#### problem

In [None]:
# The ten matches with the fewest non-null person_id entries in the lineup table.
(
    pd.read_hdf("lineup_df.h5")
    .groupby("match_id")["person_id"]
    .count()
    .sort_values(ascending=True)
    .head(10)
)

##### not really a problem...
- https://int.soccerway.com/matches/2016/02/27/world/friendlies/egypt/burkina-faso/2207539/

#### problem

In [14]:
# After removing exact duplicate rows, list players who still appear more
# than once for the same match, grouped together by match and player.
(
    pd.read_hdf("sidelined_df.h5")
    .drop_duplicates()
    .loc[
        lambda sidelined: sidelined.duplicated(
            subset=["match_id", "player_id"], keep=False
        )
    ]
    .sort_values(by=["match_id", "player_id"])
)

Unnamed: 0,side,reason,match_id,player_id
67356,away,Hamstring,943459,193
67357,away,Virus,943459,193
12802,home,Ankle/Foot Injury,943513,15721
12803,home,Thigh Muscle Strain,943513,15721
4021,away,Ankle/Foot Injury,943523,15721
...,...,...,...,...
3515,home,Hamstring,2857322,335229
77430,away,Suspended,2857326,1458
77432,away,Hamstring,2857326,1458
12646,home,Groin Strain,2948674,265372


- https://int.soccerway.com/matches/2010/08/22/england/premier-league/fulham-football-club/manchester-united-fc/943459/