In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from fuzzywuzzy import fuzz

In [2]:
# Load the overall batting and bowling statistics tables (paths are relative
# to the notebook's working directory).
batsman = pd.read_csv(filepath_or_buffer="./data/batting_stats_overall.csv")
bowlers = pd.read_csv(filepath_or_buffer="./data/bowling_stats_overall.csv")

In [3]:
# Prefix the bowling strike rate and average so they can be told apart from the
# batting columns of the same name ("SR", "Ave") once the two tables are combined.
# Reassign instead of mutating in place: the result is identical and the cell no
# longer modifies an object that other cells may already hold a reference to.
bowlers = bowlers.rename(columns={"SR": "bowling_SR", "Ave": "bowling_Ave"})

In [4]:
# Auction player list; only the worksheet named "Sheet1" is read.
auction_players = pd.read_excel(
    io="./data/WT20 Auction Player List.xlsx",
    sheet_name="Sheet1",
)

In [5]:
# Combine each player's batting and bowling records into a single row, matching
# rows on the shared "Player" column.
# Only `on` is passed: `on` and `left_index`/`right_index` are mutually exclusive
# merge specifications, and current pandas raises MergeError when they are mixed.
# NOTE(review): assumes player names are unique within each CSV - confirm;
# a duplicated name would multiply that player's rows in the merged frame.
df = batsman.merge(bowlers, on="Player", how="inner", suffixes=("", "_y"))

# Columns present in both tables keep the batting-side copy (empty suffix);
# the bowling-side duplicates carry the "_y" suffix and are discarded here.
df = df.drop(columns=df.filter(regex="_y$").columns)

In [6]:
def calculate_num_seasons(span):
    """Count the tournament editions covered by a "start-end" year span.

    The span is split on "-" into a start year and an end year. When both
    halves are the same text the answer is 1; otherwise both years are looked
    up in the fixed list of edition years and the inclusive distance between
    their positions is returned.

    Raises:
        ValueError: if the span does not split into exactly two parts, or if
            differing start/end years are not in the edition list.
    """
    edition_years = [2007, 2009, 2010, 2012, 2014, 2016]
    first_year, last_year = span.split("-")
    if first_year != last_year:
        first_pos = edition_years.index(int(first_year))
        last_pos = edition_years.index(int(last_year))
        return last_pos - first_pos + 1
    # Identical start and end text: a single edition, no lookup performed.
    return 1

In [7]:
# Derive the number of editions each player took part in from the "Span" text.
df["num_years"] = df["Span"].apply(calculate_num_seasons)

In [8]:
# Keep only players whose span covers more than one edition, renumbering rows from 0.
more_than_one_season = df.loc[df["num_years"] > 1].reset_index(drop=True)

In [9]:
# Show the column labels available on the filtered frame.
more_than_one_season.columns

Index(['Player', 'Span', 'Mat', 'Inns', 'NO', 'Runs', 'HS', 'Ave', 'BF', 'SR',
       '100', '50', '0', '4s', '6s', 'country', 'Overs', 'Mdns', 'Wkts', 'BBI',
       'bowling_Ave', 'Econ', 'bowling_SR', '4', '5', 'Ct', 'St', 'num_years'],
      dtype='object')

In [10]:
def search_word(query, searchList, threshold):
    """Return the entry of ``searchList`` that best fuzzy-matches ``query``.

    Every candidate is scored against ``query`` with ``fuzz.partial_ratio`` and
    the first candidate holding the highest score is returned.

    Args:
        query: Text to look up.
        searchList: Iterable of candidate strings (e.g. a list or a pandas Series).
        threshold: Minimum score the best candidate must reach.

    Returns:
        The best-scoring candidate, or the string "NA" when ``searchList`` is
        empty or the best score is below ``threshold``.
    """
    # Materialise the candidates once so the final lookup is positional.
    # Indexing a pandas Series with an int is label-based, which would pick the
    # wrong row (or raise KeyError) on a Series without a default 0..n-1 index.
    candidates = list(searchList)
    if not candidates:
        # max() on an empty sequence raises ValueError; nothing to match against.
        return "NA"
    ratios = [fuzz.partial_ratio(player, query) for player in candidates]
    best_ratio = max(ratios)
    if best_ratio < threshold:
        return "NA"
    return candidates[ratios.index(best_ratio)]

In [11]:
# Fuzzy-match every player name against the auction list's "Name" column,
# using 70 as the threshold passed to search_word.
more_than_one_season["matched_name"] = more_than_one_season["Player"].apply(
    lambda player_name: search_word(player_name, auction_players["Name"], 70)
)

In [12]:
# Columns consumed by batsman_points, in the order they are scored.
batsman_stats = [
    "Runs",
    "SR",
    "100",
    "50",
    "4s",
    "6s",
]

# Columns consumed by bowling_points, in the order they are scored.
bowling_stats = [
    "Mdns",
    "Wkts",
    "4",
    "5",
]

In [13]:
def batsman_points(player):
    """Score one player's batting record.

    Scoring rules:
      * "Runs": 1 point per run.
      * "SR": above 200 -> +20, above 150 -> +10, above 100 -> +5, otherwise -2.
      * "100": 10 points each; "50": 5 points each.
      * "4s": 3 points each; "6s": 6 points each.

    ``player`` is indexed by each name in the module-level ``batsman_stats``
    list; a value equal to "-" contributes nothing to the total.
    """
    points_per_unit = {"Runs": 1, "100": 10, "50": 5, "4s": 3, "6s": 6}
    # (exclusive lower bound, bonus) pairs, checked from the highest tier down.
    strike_rate_tiers = ((200, 20), (150, 10), (100, 5))

    score = 0
    for stat_name in batsman_stats:
        raw = player[stat_name]
        if raw == "-":
            continue
        if stat_name == "SR":
            strike_rate = float(raw)
            for lower_bound, bonus in strike_rate_tiers:
                if strike_rate > lower_bound:
                    score += bonus
                    break
            else:
                # No tier matched: strike rate is 100 or below.
                score -= 2
        elif stat_name in points_per_unit:
            score += int(raw) * points_per_unit[stat_name]
    return score

In [14]:
def bowling_points(player):
    """Score one player's bowling record.

    Scoring rules:
      * "Mdns" (maidens): 10 points each.
      * "Wkts" (wickets): 5 points each.
      * "4" (four-wicket hauls): 15 points each.
      * "5" (five-wicket hauls): 25 points each.

    ``player`` is indexed by each name in the module-level ``bowling_stats``
    list; a value equal to "-" contributes nothing to the total.
    """
    points_per_unit = {"Mdns": 10, "Wkts": 5, "4": 15, "5": 25}

    score = 0
    for stat_name in bowling_stats:
        raw = player[stat_name]
        if raw != "-" and stat_name in points_per_unit:
            score += int(raw) * points_per_unit[stat_name]
    return score

In [15]:
# Score every row (axis=1 hands each player's row to the scoring function).
more_than_one_season["batsman_points"] = more_than_one_season.apply(batsman_points, axis=1)
more_than_one_season["bowler_points"] = more_than_one_season.apply(bowling_points, axis=1)

In [16]:
# Normalise the point totals by the number of editions played.
more_than_one_season["batsman_points_per_year"] = more_than_one_season["batsman_points"].div(
    more_than_one_season["num_years"]
)
more_than_one_season["bowler_points_per_year"] = more_than_one_season["bowler_points"].div(
    more_than_one_season["num_years"]
)

In [38]:
# Top 100 players by batting points per edition, reduced to the reporting columns.
batsman_stats_df = (
    more_than_one_season
    .sort_values(by=["batsman_points_per_year"], ascending=False)
    .head(100)
    .loc[:, [
        "Player",
        "num_years",
        "batsman_points",
        "batsman_points_per_year",
        "SR",
        "Ave",
        "Runs",
        "country",
    ]]
)

In [37]:
# Top 100 players by bowling points per edition, reduced to the reporting columns.
bowler_stats_df = (
    more_than_one_season
    .sort_values(by=["bowler_points_per_year"], ascending=False)
    .head(100)
    .loc[:, [
        "Player",
        "num_years",
        "bowler_points",
        "bowler_points_per_year",
        "Econ",
        "bowling_SR",
        "Wkts",
        "bowling_Ave",
        "country",
    ]]
)

In [19]:
# Players who appear in both the batting and the bowling top-100 tables.
common_players = set(batsman_stats_df["Player"]) & set(bowler_stats_df["Player"])

In [20]:
# Batting figures for the players present in both top-100 tables.
batsman_stats_df.loc[batsman_stats_df["Player"].isin(common_players)]

Unnamed: 0,Player,num_years,batsman_points,batsman_points_per_year,SR,Ave,Runs,country
187,JH Kallis,3,639,213.0,117.66,36.08,433,SouthAfrica
53,SR Watson,6,876,146.0,140.94,28.26,537,Australia
69,Shakib Al Hasan,6,854,142.333333,128.86,28.35,567,Bangladesh
160,Shahid Afridi,6,839,139.833333,154.23,18.82,546,Pakistan
203,AD Mathews,5,660,132.0,129.66,38.25,459,SriLanka
213,DJ Bravo,6,762,127.0,129.23,24.0,504,WestIndies


In [21]:
# Bowling figures for the players present in both top-100 tables.
bowler_stats_df.loc[bowler_stats_df["Player"].isin(common_players)]

Unnamed: 0,Player,num_years,bowler_points,bowler_points_per_year,Econ,bowling_SR,country
160,Shahid Afridi,6,235,39.166667,6.71,20.7,Pakistan
69,Shakib Al Hasan,6,180,30.0,6.64,17.6,Bangladesh
187,JH Kallis,3,85,28.333333,6.67,18.5,SouthAfrica
213,DJ Bravo,6,140,23.333333,8.87,17.4,WestIndies
203,AD Mathews,5,115,23.0,6.47,23.4,SriLanka
53,SR Watson,6,120,20.0,8.07,20.5,Australia


In [None]:
# Batsmen: 3-5
# Bowlers: 3-5
# All-rounders (AR): 1-3
# Wicket-keeper (Wk): 1

In [None]:
# Batsmen
# Bowlers
# Wicket-keepers list -> batting rating + stumpings/catches
# Associate nations list

In [26]:
# Wicket-keeper shortlist: players with at least one stumping, minus the
# names explicitly excluded below.
wks_not_needed = ["KD Karthik", "CJO Smith"]
wks = more_than_one_season.loc[
    (more_than_one_season["St"] > 0)
    & ~more_than_one_season["Player"].isin(wks_not_needed),
    [
        "Player",
        "St",
        "Ct",
        "num_years",
        "batsman_points",
        "batsman_points_per_year",
        "SR",
        "Ave",
        "Runs",
        "country",
    ],
]

In [27]:
# Persist the wicket-keeper shortlist without the row index.
wks.to_csv(path_or_buf="./data/wks_plus_chandimal.csv", index=False)

In [28]:
# Distinct country labels present in the filtered frame.
more_than_one_season["country"].unique()

array(['India', 'Afghanistan', 'Australia', 'Bangladesh', 'England',
       'HongKong', 'Ireland', 'Netherlands', 'NewZealand', 'Pakistan',
       'Scotland', 'SouthAfrica', 'SriLanka', 'WestIndies', 'Zimbabwe'],
      dtype=object)

In [30]:
# Countries treated as associate nations for this analysis.
associate_nations = [
    "HongKong",
    "Ireland",
    "Netherlands",
    "Scotland",
    "Afghanistan",
]
# Rows of players whose country is one of the associate nations.
associate_players = more_than_one_season.loc[
    more_than_one_season["country"].isin(associate_nations)
]

In [32]:
# Persist the associate-nation players without the row index.
associate_players.to_csv(path_or_buf="./data/associate_players.csv", index=False)

In [40]:
# Persist both top-100 tables without the row index.
batsman_stats_df.to_csv(path_or_buf="./data/top_100_batsman.csv", index=False)
bowler_stats_df.to_csv(path_or_buf="./data/top_100_bowlers.csv", index=False)

In [51]:
# Top-100 batsmen from the four selected countries, best points-per-edition first.
(
    batsman_stats_df
    .loc[batsman_stats_df["country"].isin(["SriLanka", "Pakistan", "WestIndies", "NewZealand"])]
    .sort_values(by=["batsman_points_per_year"], ascending=False)
)

Unnamed: 0,Player,num_years,batsman_points,batsman_points_per_year,SR,Ave,Runs,country
199,DPMD Jayawardene,5,1544,308.8,134.74,39.07,1016,SriLanka
218,CH Gayle,6,1565,260.833333,146.73,40.0,920,WestIndies
196,TM Dilshan,6,1355,225.833333,124.06,30.93,897,SriLanka
166,Younis Khan,2,425,212.5,124.06,29.9,299,Pakistan
133,BB McCullum,5,977,195.4,128.42,28.95,637,NewZealand
208,KC Sangakkara,5,941,188.2,112.22,25.42,661,SriLanka
198,ST Jayasuriya,3,558,186.0,121.4,21.62,346,SriLanka
163,Umar Akmal,4,716,179.0,132.42,34.71,486,Pakistan
153,Misbah-ul-Haq,3,536,178.666667,124.32,30.66,368,Pakistan
159,Salman Butt,3,508,169.333333,109.23,26.38,343,Pakistan


In [61]:
# Look up the full record of a single player by name.
more_than_one_season.loc[more_than_one_season["Player"].isin(["SB Styris"])]

Unnamed: 0,Player,Span,Mat,Inns,NO,Runs,HS,Ave,BF,SR,...,4,5,Ct,St,num_years,matched_name,batsman_points,bowler_points,batsman_points_per_year,bowler_points_per_year
143,SB Styris,2007-2010,16,14,2,233,42,19.41,196,118.87,...,0,0,3,0,3,Scott Styris,340,50,113.333333,16.666667
