In [39]:
import sys,requests,csv,io
import pandas as pd
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from sklearn.preprocessing import PolynomialFeatures
from sklearn import linear_model
import numpy as np
from sklearn.pipeline import make_pipeline
import matplotlib.pyplot as plt


In [40]:
### Constants for scraping
rank_class_name = "table-body__cell table-body__cell--position u-text-right"
rating_class_name = "table-body__cell rating"
name_class_name = "table-body__cell rankings-table__name name"

banner_class_name = "rankings-block__banner"
banner_player_class_name = "rankings-block__banner--name-large"
banner_ranking_class_name = "rankings-block__position" # Always 1
banner_rating_class_name = "rankings-block__banner--rating"

batting_url = "https://www.icc-cricket.com/rankings/mens/player-rankings/test/batting"
bowling_url = "https://www.icc-cricket.com/rankings/mens/player-rankings/test/bowling"
all_rounder_url = "https://www.icc-cricket.com/rankings/mens/player-rankings/test/all-rounder"

In [41]:
### Constants for naming dataframe
player = "player"
batting_ranking = "batting_ranking"
batting_rating = "batting_rating"
bowling_ranking = "bowling_ranking"
bowling_rating = "bowling_rating"
all_rounder_ranking = "all_rounder_ranking"
all_rounder_rating = "all_rounder_rating"

In [78]:
def get_outer_data_from_tag(tag, data_tag_name, data_class_name):
  return tag.find(name=data_tag_name, attrs={'class' : data_class_name}).find(recursive=False, text=True).strip();

def get_inner_data_from_tag(tag, data_tag_name, data_class_name):
  return tag.find(data_tag_name, {'class' : data_class_name}).text.strip();

def get_dictionary_from_banner(banner_div, player_column_name, ranking_column_name, rating_column_name):
  row_dict = {}
  row_dict[player_column_name] = get_inner_data_from_tag(banner_div, "div", banner_player_class_name)
  row_dict[ranking_column_name] = get_outer_data_from_tag(banner_div, "td", banner_ranking_class_name)
  row_dict[rating_column_name] = get_outer_data_from_tag(banner_div, "div", banner_rating_class_name) 
  return row_dict

def get_dataframe_from_ranking(ranking_div, player_column_name, ranking_column_name, rating_column_name):
  batting_rows_list = []

  # Handle single top-rated player (banner player)
  banner_div = ranking_div.find('tr', {'class' : banner_class_name})
  batting_rows_list.append(get_dictionary_from_banner(banner_div, player_column_name, ranking_column_name, rating_column_name))

  # Handle all other players
  batting_trs = ranking_div.find_all("tr", {'class' : "table-body"})
  for index, tr in enumerate(batting_trs):
    row_dict = {}
    row_dict[player_column_name] = get_inner_data_from_tag(tr, "td", name_class_name)
    row_dict[ranking_column_name] = get_outer_data_from_tag(tr, "td", rank_class_name)
    row_dict[rating_column_name] = get_outer_data_from_tag(tr, "td", rating_class_name) 

    # Handle ranking ties
    if (row_dict[ranking_column_name] == ""):
      row_dict[ranking_column_name] = index + 1
    
    batting_rows_list.append(row_dict)
  
  return pd.DataFrame(batting_rows_list, columns=[player_column_name, ranking_column_name, rating_column_name])

In [79]:
batting_request = requests.get(batting_url)
bowling_request = requests.get(bowling_url)
all_rounder_request = requests.get(all_rounder_url)

batting_soup = BeautifulSoup(batting_request.content, "lxml")
bowling_soup = BeautifulSoup(bowling_request.content, "lxml")
all_rounder_soup = BeautifulSoup(all_rounder_request.content, "lxml")

In [80]:
batting_outer_div = batting_soup.find('div', {'class' : 'rankings-block__container full '})
bowling_outer_div = bowling_soup.find('div', {'class' : 'rankings-block__container full '})
all_rounder_outer_div = all_rounder_soup.find('div', {'class' : 'rankings-block__container full '})

In [81]:
batting_df = get_dataframe_from_ranking(batting_outer_div, player, batting_ranking, batting_rating)
batting_df

Unnamed: 0,player,batting_ranking,batting_rating
0,Steve Smith,1,911
1,Virat Kohli,2,886
2,Marnus Labuschagne,3,827
3,Kane Williamson,4,812
4,Babar Azam,5,798
...,...,...,...
95,Lahiru Thirimanne,96,329
96,Javed Ahmadi,97,326
97,Marcus Harris,98,323
98,Theunis de Bruyn,99,317


In [82]:
bowling_df = get_dataframe_from_ranking(bowling_outer_div, player, bowling_ranking, bowling_rating)
bowling_df

Unnamed: 0,player,bowling_ranking,bowling_rating
0,Pat Cummins,1,904
1,Stuart Broad,2,846
2,Neil Wagner,3,843
3,Tim Southee,4,812
4,Jason Holder,5,810
...,...,...,...
95,Azhar Ali,96,63
96,Shan Masood,97,37
97,Andy McBrine,98,35
98,Mominul Haque,99,33


In [83]:
all_rounder_df = get_dataframe_from_ranking(all_rounder_outer_div, player, all_rounder_ranking, all_rounder_rating)
all_rounder_df

Unnamed: 0,player,all_rounder_ranking,all_rounder_rating
0,Ben Stokes,1,454
1,Jason Holder,2,447
2,Ravindra Jadeja,3,397
3,Mitchell Starc,4,298
4,Ravichandran Ashwin,5,281
5,Colin de Grandhomme,6,280
6,Pat Cummins,7,266
7,Chris Woakes,7,266
8,Roston Chase,9,256
9,Stuart Broad,10,223


In [84]:
complete_df = all_rounder_df.merge(bowling_df, on=player).merge(batting_df, on=player)
complete_df

Unnamed: 0,player,all_rounder_ranking,all_rounder_rating,bowling_ranking,bowling_rating,batting_ranking,batting_rating
0,Ben Stokes,1,454,24,593,7,767
1,Jason Holder,2,447,5,810,42,553
2,Ravindra Jadeja,3,397,18,722,43,551
3,Mitchell Starc,4,298,7,797,86,375
4,Ravichandran Ashwin,5,281,12,756,88,372
5,Colin de Grandhomme,6,280,32,476,31,589
6,Chris Woakes,7,266,19,674,79,396
7,Roston Chase,9,256,31,505,53,508
8,Moeen Ali,12,188,28,521,92,362
9,Sam Curran,13,180,39,407,71,444


In [85]:
### Finding the function to get calculate an all-rounder's rating
X = complete_df[[bowling_rating, batting_rating]].values.astype(int)
Y = complete_df[all_rounder_rating].to_numpy().astype(int)

poly = PolynomialFeatures(interaction_only=True)

# Transform [x1, x2] to [1, x1, x2, x1 * x2]
X_ = poly.fit_transform(X)


clf = linear_model.LinearRegression()
clf.fit(X_, Y)

print("The coefficients a, b, c, d of <a * 1 + b * x1 + c * x2 + d * x1 * x2>: %s", clf.coef_)
clf.predict(X_)

The coefficients a, b, c, d of <a * 1 + b * x1 + c * x2 + d * x1 * x2>: %s [0.         0.0034911  0.00516932 0.00099088]


array([454.10454579, 446.91783358, 396.94938205, 298.25704588,
       280.61634885, 279.90050138, 266.25684766, 255.9763598 ,
       187.95925254, 180.16316336, 156.89871911])