# Scraping/Collecting All Necessary Data

In [None]:
# Google Drive folder that the datasets produced by this notebook are written to
# and read back from. The path lives under /content/drive, so Drive must be
# mounted before any cell that uses it runs.
final_dataset_path = '/content/drive/MyDrive/DataMining/CP3_Datasets/'

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive so the Drive paths used in this notebook resolve.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os
# Point KAGGLE_CONFIG_DIR at a folder on Drive.
# NOTE(review): presumably this is where the Kaggle CLI used below looks for its
# credentials file — confirm the folder contains it.
os.environ['KAGGLE_CONFIG_DIR'] = '/content/drive/MyDrive/Kaggle/'

## 1. Kaggle Dataset: 2023 ICC Men's Cricket World Cup


The 2023 ICC Men's Cricket World Cup is the 13th edition of the Cricket World Cup, a quadrennial One Day International (ODI) cricket tournament contested by men's national teams and organized by the International Cricket Council (ICC). The tournament is currently being hosted by India, commencing on 5 October and scheduled to conclude on 19 November 2023.

For more information, refer to the [Wikipedia page](https://en.wikipedia.org/wiki/2023_Cricket_World_Cup).

### Data Files

- **deliveries.csv**: Contains ball-by-ball data for all matches.
- **matches.csv**: Details for each match played.
- **points_table.csv**: Current tournament standings.

Feel free to explore the datasets to delve deeper into the exciting moments and statistics of the ongoing 2023 ICC Men's Cricket World Cup.


In [None]:
# Download the World Cup 2023 dataset archive with the Kaggle CLI (shell command).
# The recorded output shows the zip being saved under /content.
!kaggle datasets download -d pardeep19singh/icc-mens-world-cup-2023 --force

Downloading icc-mens-world-cup-2023.zip to /content
  0% 0.00/102k [00:00<?, ?B/s]
100% 102k/102k [00:00<00:00, 90.9MB/s]


In [None]:
import zipfile

# Unpack the downloaded Kaggle archive into the shared dataset folder.
zip_file_path = '/content/icc-mens-world-cup-2023.zip'
with zipfile.ZipFile(zip_file_path, mode='r') as kaggle_archive:
    kaggle_archive.extractall(path=final_dataset_path)

## 2. ICC Men's Cricket Team Rankings

The latest rankings of international cricket teams based on points, ratings, and positions. The data is scraped from [ICC Cricket Rankings](https://www.icc-cricket.com/rankings/mens/team-rankings/odi).


### Dataset Information

The dataset, named **icc_rankings.csv**, includes the following key metrics for each team:

- **Team**: The name of the cricket team.
- **Points**: The total points earned by the team.
- **Ratings**: The team's current rating.
- **Position**: The team's position in the ICC rankings.


In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

In [None]:
# Fetch the ICC men's ODI team-rankings page and parse its HTML.
response = requests.get('https://www.icc-cricket.com/rankings/mens/team-rankings/odi')
soup = BeautifulSoup(response.text, 'html.parser')

In [None]:
# Find the rankings table and collect the stripped text of every cell, row by row.
table = soup.find('table', {'class': 'table'})
rows = table.find('tbody').find_all('tr')
data = [
    [cell.text.strip() for cell in row.find_all(['td', 'th'])]
    for row in rows
]

icc_rankings = pd.DataFrame(data, columns=['Pos', 'Team', 'Matches', 'Points', 'Rating'])
# Keep only the text before the first newline in each team cell.
icc_rankings['Team'] = icc_rankings['Team'].str.replace(r'\n.*', '', regex=True)

# Restrict the rankings to the teams listed in the Kaggle points table.
points_table = pd.read_csv(final_dataset_path + 'points_table.csv')
Teams = points_table['Team'].unique().tolist()
icc_rankings = icc_rankings.loc[icc_rankings['Team'].isin(Teams)]

icc_rankings.to_csv(final_dataset_path + 'icc_rankings.csv', index=False)


## 3. Historical ODI Matches Data

The historical ODI matches dataset captures a range of information about cricket matches spanning from January 8, 2015, to May 14, 2023. The data includes key details for each match, offering insights into the dynamic world of One Day International cricket. Here's a brief overview of the data columns:

- **Date:** The date when the match took place.
- **Team_1 and Team_2:** The participating cricket teams in the match.
- **Winner:** The team that emerged victorious in the match.
- **Margin:** The margin of victory, providing details such as wickets taken, runs scored, or other relevant metrics.
- **Ground:** The venue where the match was held.

In [None]:
import zipfile
import os

# Archive of ODI match JSON files stored on Drive, and the local folder it is unpacked into.
zip_file_path = '/content/drive/MyDrive/DataMining/odis_json.zip'
extracted_dir = '/content/odis_json/'

# exist_ok=True keeps a re-run from failing when the folder is already there.
os.makedirs(extracted_dir, exist_ok=True)

with zipfile.ZipFile(zip_file_path, mode='r') as odi_archive:
    odi_archive.extractall(path=extracted_dir)


In [None]:
import os
import json
import pandas as pd

# Folder holding the extracted match files (one JSON document per match).
json_dir = '/content/odis_json'

# One dict per men's match; turned into a DataFrame after the loop.
match_records = []

for filename in os.listdir(json_dir):
    # Anything that is not a .json file is ignored.
    if not filename.endswith('.json'):
        continue

    with open(os.path.join(json_dir, filename), 'r') as file:
        data = json.load(file)
    info = data['info']

    # Only the first listed date of a match is kept.
    date = info['dates'][0]
    teams = info['teams']

    # Start from "N/A" and overwrite only when the file names a winner.
    winner = "N/A"
    margin = "N/A"
    if 'outcome' in info:
        outcome = info['outcome']
        if 'winner' in outcome:
            winner = outcome['winner']
            margin_info = outcome.get('by', {})
            # A wickets margin takes precedence over a runs margin.
            if 'wickets' in margin_info:
                margin = f"{margin_info['wickets']} wickets"
            elif 'runs' in margin_info:
                margin = f"{margin_info['runs']} runs"

    ground = info['venue']
    city = info.get('city', 'N/A')

    # Women's matches are read but not recorded.
    if info['gender'] == 'male':
        match_records.append({
            'Date': date,
            'Team_1': teams[0],
            'Team_2': teams[1],
            'Winner': winner,
            'Margin': margin,
            'Ground': ground,
            'City': city,
        })

# Passing columns= keeps the column layout even when no record was collected.
historical_matches = pd.DataFrame(
    match_records,
    columns=['Date', 'Team_1', 'Team_2', 'Winner', 'Margin', 'Ground', 'City'],
)


In [None]:
# Keep matches played from 1-Jan-2015 through 30-Sep-2023 (both bounds inclusive).

historical_matches['Date'] = pd.to_datetime(historical_matches['Date'])

# Bounds of the date window
start_date = '2015-01-01'
end_date = '2023-09-30'

mask = historical_matches['Date'].between(start_date, end_date)

# Filter, order chronologically, renumber the rows, then persist to the dataset folder.
historical_matches = (
    historical_matches[mask]
    .sort_values(by='Date')
    .reset_index(drop=True)
)
historical_matches.to_csv(final_dataset_path+'historical_matches.csv', index=False)

## 4. Cricket World Cup Fixtures Dataset

The scheduled ODI matches dataset captures information about the matches scheduled for the ICC Men's Cricket World Cup 2023. Here's a brief overview of the data columns:

- **Date:** The date when the match took place.
- **Team_1 and Team_2:** The participating cricket teams in the match.


In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

In [None]:
# Fetch the World Cup fixtures page and collect every 'match-block' element
# inside the match-list wrapper.
response = requests.get('https://www.cricketworldcup.com/fixtures')
soup = BeautifulSoup(response.text, 'html.parser')
all_matches = soup.find('div',class_='match-list__wrapper js-matchlist')
matches_list = all_matches.find_all('div', class_='match-block')
# Number of match blocks found (the recorded output shows 48).
len(matches_list)

48

In [None]:
# Column accumulators, one entry per match block
Team_1s = []
Team_2s = []
Date = []
Ground = []  # never filled in this cell
Venues = []
Cities = []

for match_block in matches_list:
  # The two team names, in the order the page lists them
  name_tags = match_block.select('.match-block__team-name')
  first_team = name_tags[0].text.strip()
  second_team = name_tags[1].text.strip()
  Team_1s.append(first_team)
  Team_2s.append(second_team)

  # Start timestamp is read from an attribute; None when the element is absent
  date_tag = match_block.select_one('.match-block__date-user')
  Date.append(date_tag['data-startdate'] if date_tag else None)

  # Venue text, and the city taken as its last comma-separated part
  venue_text = match_block.select('.match-block__venue-name')[0].text
  Venues.append(venue_text)
  Cities.append(venue_text.split(',')[-1].strip())

# Assemble the columns into a DataFrame
data = dict(
    Team_1=Team_1s,
    Team_2=Team_2s,
    Date=Date,
    Venue=Venues,
    City=Cities,
)

fixtures = pd.DataFrame(data)
# for index, row in fixtures.iterrows():
#   if row['Team_2']=='Sri Lanka':
#     row['Team_1'], row['Team_2'] = row['Team_2'], row['Team_1']
#     break

# for index, row in fixtures.iterrows():
#   if row['Team_1']=='India':
#     row['Team_1'], row['Team_2'] = row['Team_2'], row['Team_1']
#     break
fixtures.to_csv(final_dataset_path+'fixtures.csv', index=False)
fixtures

Unnamed: 0,Team_1,Team_2,Date,Venue,City
0,England,New Zealand,2023-10-05T14:00:00+0530,"Narendra Modi Stadium, Ahmedabad",Ahmedabad
1,Pakistan,Netherlands,2023-10-06T14:00:00+0530,"Rajiv Gandhi International Stadium, Hyderabad",Hyderabad
2,Bangladesh,Afghanistan,2023-10-07T10:30:00+0530,"HPCA Stadium, Dharamsala",Dharamsala
3,Sri Lanka,South Africa,2023-10-07T14:00:00+0530,"Arun Jaitley Stadium, Delhi",Delhi
4,Australia,India,2023-10-08T14:00:00+0530,"MA Chidambaram Stadium, Chennai",Chennai
5,New Zealand,Netherlands,2023-10-09T14:00:00+0530,"Rajiv Gandhi International Stadium, Hyderabad",Hyderabad
6,England,Bangladesh,2023-10-10T10:30:00+0530,"HPCA Stadium, Dharamsala",Dharamsala
7,Pakistan,Sri Lanka,2023-10-10T14:00:00+0530,"Rajiv Gandhi International Stadium, Hyderabad",Hyderabad
8,India,Afghanistan,2023-10-11T14:00:00+0530,"Arun Jaitley Stadium, Delhi",Delhi
9,Australia,South Africa,2023-10-12T14:00:00+0530,"BRSABVE Cricket Stadium, Lucknow",Lucknow


## 5. Cricket World Cup 2023 Dataset

The scheduled ODI matches dataset captures information about the matches scheduled for the ICC Men's Cricket World Cup 2023, along with their results. Here's a brief overview of the data columns:

- **Date:** The date when the match took place.
- **Team_1 and Team_2:** The participating cricket teams in the match.
- **Winner:** The winner of the match.


In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

In [None]:
# Fetch the World Cup fixtures page again and collect every 'match-block'
# element inside the match-list wrapper.
response = requests.get('https://www.cricketworldcup.com/fixtures')
soup = BeautifulSoup(response.text, 'html.parser')
all_matches = soup.find('div',class_='match-list__wrapper js-matchlist')
matches_list = all_matches.find_all('div', class_='match-block')
# Number of match blocks found (the recorded output shows 48).
len(matches_list)

48

In [None]:
# Column accumulators, one entry per match block
Team_1s = []
Team_2s = []
Date = []
Ground = []  # never filled in this cell
Venues = []
Cities = []
Winner = []

for match_block in matches_list:
  # The two team names, in the order the page lists them
  name_tags = match_block.select('.match-block__team-name')
  first_team = name_tags[0].text.strip()
  second_team = name_tags[1].text.strip()
  Team_1s.append(first_team)
  Team_2s.append(second_team)

  # Start timestamp is read from an attribute; None when the element is absent
  date_tag = match_block.select_one('.match-block__date-user')
  Date.append(date_tag['data-startdate'] if date_tag else None)

  # Venue text, and the city taken as its last comma-separated part
  venue_text = match_block.select('.match-block__venue-name')[0].text
  Venues.append(venue_text)
  Cities.append(venue_text.split(',')[-1].strip())

  # Winner is the first bold span of the block header; the string 'NaN' when there is none
  bold_spans = match_block.select('.match-block__header')[0].select('.match-block__header-bold')
  Winner.append(bold_spans[0].text.strip() if len(bold_spans) > 0 else 'NaN')

# Assemble the columns into a DataFrame
data = dict(
    Team_1=Team_1s,
    Team_2=Team_2s,
    Date=Date,
    Venue=Venues,
    City=Cities,
    Winner=Winner,
)

result23 = pd.DataFrame(data)
# NOTE(review): the two loops below iterate `fixtures`, not `result23`, and assign
# to the row object yielded by iterrows(); `result23` is never touched by them.
# Whether the assignment even reaches `fixtures` depends on iterrows() returning a
# view or a copy. Confirm whether a team swap on `result23` was intended.
for index, row in fixtures.iterrows():
  if row['Team_2']=='Sri Lanka':
    row['Team_1'], row['Team_2'] = row['Team_2'], row['Team_1']
    break

for index, row in fixtures.iterrows():
  if row['Team_1']=='India':
    row['Team_1'], row['Team_2'] = row['Team_2'], row['Team_1']
    break
result23.to_csv(final_dataset_path+'result23.csv', index=False)
result23

Unnamed: 0,Team_1,Team_2,Date,Venue,City,Winner
0,England,New Zealand,2023-10-05T14:00:00+0530,"Narendra Modi Stadium, Ahmedabad",Ahmedabad,New Zealand
1,Pakistan,Netherlands,2023-10-06T14:00:00+0530,"Rajiv Gandhi International Stadium, Hyderabad",Hyderabad,Pakistan
2,Bangladesh,Afghanistan,2023-10-07T10:30:00+0530,"HPCA Stadium, Dharamsala",Dharamsala,Bangladesh
3,South Africa,Sri Lanka,2023-10-07T14:00:00+0530,"Arun Jaitley Stadium, Delhi",Delhi,South Africa
4,India,Australia,2023-10-08T14:00:00+0530,"MA Chidambaram Stadium, Chennai",Chennai,India
5,New Zealand,Netherlands,2023-10-09T14:00:00+0530,"Rajiv Gandhi International Stadium, Hyderabad",Hyderabad,New Zealand
6,England,Bangladesh,2023-10-10T10:30:00+0530,"HPCA Stadium, Dharamsala",Dharamsala,England
7,Pakistan,Sri Lanka,2023-10-10T14:00:00+0530,"Rajiv Gandhi International Stadium, Hyderabad",Hyderabad,Pakistan
8,India,Afghanistan,2023-10-11T14:00:00+0530,"Arun Jaitley Stadium, Delhi",Delhi,India
9,Australia,South Africa,2023-10-12T14:00:00+0530,"BRSABVE Cricket Stadium, Lucknow",Lucknow,South Africa


In [None]:
# Show the first three rows of the team/city/winner columns (taken from the first 44 rows).
result23[['Team_1', 'Team_2', 'City', 'Winner']][:44].head(3)

Unnamed: 0,Team_1,Team_2,City,Winner
0,England,New Zealand,Ahmedabad,New Zealand
1,Pakistan,Netherlands,Hyderabad,Pakistan
2,Bangladesh,Afghanistan,Dharamsala,Bangladesh


## 6. Cricket World Cup 2023 Squad Collection

Gathering team-specific squads to formulate an 11-player forecast for the Cricket World Cup 2023 finalists. This involves a comprehensive analysis of each team's selected players, considering their individual strengths, playing styles, and recent performances, to predict the optimal lineup for the potential finalists in the upcoming tournament.


In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

In [None]:
# Fetch the Wikipedia squads page and parse its HTML.
url = 'https://en.wikipedia.org/wiki/2023_Cricket_World_Cup_squads'
response = requests.get(url)
soup = BeautifulSoup(response.text, 'html.parser')
# Every table carrying the 'sortable' class (the recorded output shows 10).
all_teams = soup.find_all('table',class_='sortable')
# Headline spans 1..10 are taken as the team names.
# NOTE(review): the [1:11] slice depends on the page's heading order — re-check if the page changes.
all_teams_names = soup.find_all('span',class_='mw-headline')[1:11]
all_teams_names = [i.text for i in all_teams_names]
# teams_list = all_teams.find_all('a', class_='team-index__link')
print(all_teams_names)
print(len(all_teams))
# all_teams[0]

['Afghanistan', 'Australia', 'Bangladesh', 'England', 'India', 'Netherlands', 'New Zealand', 'Pakistan', 'South Africa', 'Sri Lanka']
10


In [None]:
# from io import StringIO

# for span in all_teams[0].find_all('span'):
#   span.unwrap()

# html_io = StringIO(str(all_teams[0]))

# df_list = pd.read_html(html_io)

# # Assume that the table of interest is the first one in the list
# df = df_list[0]

# df['Team'] = all_teams_names[0]
# # Display the DataFrame
# df.head()

Unnamed: 0,S/N,Player,Date of birth (age),ODIs,Role,Batting,Bowling style,List A or domestic team,Team
0,50,Hashmatullah Shahidi (c),(1994-11-04)4 November 1994 (aged 28),64,Batsman,Left,Right-arm off break,Band-e-Amir Region,Afghanistan
1,15,Noor Ahmad,(2005-01-03)3 January 2005 (aged 18),3,Bowler,Right,Left-arm unorthodox spin,Mis Ainak Knights,Afghanistan
2,46,Ikram Alikhil,(2000-11-28)28 November 2000 (aged 22),14,Wicket-keeper,Right,–,Speenghar Tigers,Afghanistan
3,5,Fazalhaq Farooqi,(2000-09-22)22 September 2000 (aged 23),21,Bowler,Right,Left-arm fast-medium,Kabul Eagles,Afghanistan
4,21,Rahmanullah Gurbaz (wk),(2001-11-28)28 November 2001 (aged 21),26,Wicket-keeper,Left,–,Kabul Eagles,Afghanistan
5,76,Riaz Hassan,(2002-11-07)7 November 2002 (aged 20),5,Batsman,Right,–,Speenghar Tigers,Afghanistan
6,19,Rashid Khan,(1998-09-20)20 September 1998 (aged 25),94,All-rounder,Right,Right-arm leg spin,Speenghar Tigers,Afghanistan
7,7,Mohammad Nabi,(1985-01-01)1 January 1985 (aged 38),147,All-rounder,Right,Right-arm off spin,Band-e-Amir Dragons,Afghanistan
8,9,Azmatullah Omarzai,(2000-03-24)24 March 2000 (aged 23),13,All-rounder,Right,Right-arm medium-fast,Mis Ainak Knights,Afghanistan
9,27,Abdul Rahman,(2001-11-22)22 November 2001 (aged 21),3,Bowler,Right,Right-arm medium-fast,Band-e-Amir Dragons,Afghanistan


In [None]:
from io import StringIO

# Parse every squad table into its own DataFrame, tag it with the team name at
# the same position in all_teams_names, and collect the frames in a list.
squad_frames = []

for index, team_table in enumerate(all_teams):

  # Replace every <span> with its contents (modifies the parsed table in place).
  for span in team_table.find_all('span'):
    span.unwrap()

  html_io = StringIO(str(team_table))
  # read_html returns a list of frames; the first one is used.
  curr_df = pd.read_html(html_io)[0]

  curr_df['Team'] = all_teams_names[index]

  squad_frames.append(curr_df)

# Concatenate once after the loop instead of re-copying a growing frame on every
# iteration. pd.concat rejects an empty list, so fall back to an empty frame.
Squads = pd.concat(squad_frames, ignore_index=True) if squad_frames else pd.DataFrame()


In [None]:
# Print the (rows, columns) shape, then display the combined squads frame.
print(Squads.shape)
Squads

(162, 9)


Unnamed: 0,S/N,Player,Date of birth (age),ODIs,Role,Batting,Bowling style,List A or domestic team,Team
0,50.0,Hashmatullah Shahidi (c),(1994-11-04)4 November 1994 (aged 28),64,Batsman,Left,Right-arm off break,Band-e-Amir Region,Afghanistan
1,15.0,Noor Ahmad,(2005-01-03)3 January 2005 (aged 18),3,Bowler,Right,Left-arm unorthodox spin,Mis Ainak Knights,Afghanistan
2,46.0,Ikram Alikhil,(2000-11-28)28 November 2000 (aged 22),14,Wicket-keeper,Right,–,Speenghar Tigers,Afghanistan
3,5.0,Fazalhaq Farooqi,(2000-09-22)22 September 2000 (aged 23),21,Bowler,Right,Left-arm fast-medium,Kabul Eagles,Afghanistan
4,21.0,Rahmanullah Gurbaz (wk),(2001-11-28)28 November 2001 (aged 21),26,Wicket-keeper,Left,–,Kabul Eagles,Afghanistan
...,...,...,...,...,...,...,...,...,...
157,61.0,Maheesh Theekshana,(2000-08-01)1 August 2000 (aged 23),27,Bowler,Right,Right-arm off spin,Colts,Sri Lanka
158,1.0,Dunith Wellalage,(2003-01-09)9 January 2003 (aged 20),15,All-rounder,Left,Slow left-arm orthodox,Colts,Sri Lanka
159,7.0,Dasun Shanaka (c),(1991-09-09)9 September 1991 (aged 32),67,All-rounder,Right,Right-arm medium,Sinhalese,Sri Lanka
160,81.0,Matheesha Pathirana,(2002-12-18)18 December 2002 (aged 20),10,Bowler,Right,Right-arm fast,Nondescripts,Sri Lanka


In [None]:
# Save the combined squads to the dataset folder, without the index column.
Squads.to_csv(final_dataset_path+'Squads.csv', index=False)

In [None]:
# Number of squad rows per team, as a two-column frame (Team, Count).
team_counts = Squads.groupby('Team').size().reset_index(name='Count')

team_counts

Unnamed: 0,Team,Count
0,Afghanistan,15
1,Australia,16
2,Bangladesh,16
3,England,16
4,India,17
5,Netherlands,16
6,New Zealand,16
7,Pakistan,15
8,South Africa,17
9,Sri Lanka,18


In [None]:
# Display the squad rows whose Team is South Africa.
Squads[Squads['Team']=='South Africa']

Unnamed: 0,S/N,Player,Date of birth (age),ODIs,Role,Batting,Bowling style,List A or domestic team,Team
127,11.0,Temba Bavuma (c),(1990-05-17)17 May 1990 (aged 33),28,Batsman,Right,Right-arm medium,Lions,South Africa
128,62.0,Gerald Coetzee,(2000-10-02)2 October 2000 (aged 23),6,Bowler,Right,Right-arm fast,Knights,South Africa
129,12.0,Quinton de Kock (wk),(1992-12-17)17 December 1992 (aged 30),145,Wicket-keeper,Left,–,Titans,South Africa
130,17.0,Reeza Hendricks,(1989-08-14)14 August 1989 (aged 34),29,Batsman,Right,Right-arm off break,Lions,South Africa
131,70.0,Marco Jansen,(2000-05-01)1 May 2000 (aged 23),14,All-rounder,Right,Left-arm fast-medium,Warriors,South Africa
132,45.0,Heinrich Klaasen (wk),(1991-07-30)30 July 1991 (aged 32),41,Wicket-keeper,Right,Right-arm off spin,Titans,South Africa
133,16.0,Keshav Maharaj,(1990-02-07)7 February 1990 (aged 33),31,Bowler,Right,Slow left-arm orthodox,Dolphins,South Africa
134,4.0,Aiden Markram,(1994-10-04)4 October 1994 (aged 29),55,Batsman,Right,Right-arm off spin,Titans,South Africa
135,10.0,David Miller,(1989-06-10)10 June 1989 (aged 34),160,Batsman,Left,Right-arm off spin,Dolphins,South Africa
136,22.0,Lungi Ngidi,(1996-03-29)29 March 1996 (aged 27),48,Bowler,Right,Right-arm fast-medium,Titans,South Africa


# Points Table

The points table typically consists of columns displaying:

1.   Team Names
2.   Matches Played
3.   Matches Won
4.   Matches Lost
5.   Matches Tied or No Result (if applicable)
6.   Points Earned
7.   Net Run Rate (NRR)

## Points System:

*   **Win**: Teams receive 2 points for a win.
*   **Tie/No Result**: In case of a tie or a match abandoned due to weather, teams might share points or get one point each.




In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from io import StringIO

In [None]:
# Fetch the Cricbuzz points-table page and locate the table with class 'table cb-srs-pnts'.
response = requests.get('https://www.cricbuzz.com/cricket-series/6732/icc-cricket-world-cup-2023/points-table')
soup = BeautifulSoup(response.text, 'html.parser')
table = soup.find('table', class_='table cb-srs-pnts')

In [None]:
# First row of the table header.
head = table.find('thead').find('tr')
# Text of the <td> cells in that header row.
cols = head.find_all('td')
cols = [tag.text for tag in cols]
cols  # no-op here: only the cell's last expression is displayed
# The <th> cells of the same header row; this is what the cell displays.
pts = head.find_all('th')
pts


[<th class="cb-srs-pnts-th text-left" style="padding-left: 6px;width:200px;">Teams</th>,
 <th class="cb-srs-pnts-th" title="Match Points">Pts</th>,
 <th></th>]

In [None]:
body = table.find('tbody')
# NOTE(review): class_="" presumably selects <tr> elements without a class — confirm.
rows = body.find_all('tr', class_="")
# Keep every 13th row. NOTE(review): presumably one row per team; the recorded
# outputs show rows[6] is an Opponent/Description/Date/Result header row, so the
# stride does not always land on a team row — verify.
rows = rows[::13]
len(rows)

10

In [None]:
# Inspect the seventh selected row.
rows[6]

<tr><th class="text-left">Opponent</th><th class="text-left">Description</th><th class="text-left">Date</th><th class="text-left">Result</th></tr>

In [None]:
point_tble = []

# Exploratory dump: print the <td> cells of each selected row.
# Nothing is stored in point_tble or instance yet.
for selected_row in rows:
  instance = {}
  features = selected_row.find_all('td')
  print(features)

# features =
# instance['Team'] = 'India'
# for f in range(len(cols)):
#   row[0].find_all


[<td class="cb-srs-pnts-name" style="width:200px;"><a class="cb-text-link" href="/cricket-team/india/2"><div class="cb-col cb-col-100"><div class="cb-col cb-col-16"><img height="18" src="/a/img/v1/24x18/i1/c172115/india.jpg" style="padding:0px;" title="India" width="24"/></div><div class="cb-col cb-col-84">India<span class="cb-text-gray cb-font-10">  (Q)</span></div></div></a></td>, <td class="cb-srs-pnts-td">9</td>, <td class="cb-srs-pnts-td">9</td>, <td class="cb-srs-pnts-td">0</td>, <td class="cb-srs-pnts-td">0</td>, <td class="cb-srs-pnts-td">0</td>, <td class="cb-srs-pnts-td text-bold">18</td>, <td class="cb-srs-pnts-td">+2.570</td>, <td ng-init="direction_2_teams='up'"><a class="cb-srs-pnts-a cb-cursor" ng-click="(direction_2_teams=='up')?direction_2_teams = 'down' : direction_2_teams = 'up'" ng-cloak=""><span ng-class="(direction_2_teams=='up') ? 'cb-caret-down' : 'cb-caret-up'"></span></a></td>]
[]
[<td class="cb-srs-pnts-name" style="width:200px;"><a class="cb-text-link" href=

In [None]:
# Load the final points table from an Excel workbook in the dataset folder.
# NOTE(review): no visible cell writes Final_Point_table.xlsx — presumably it was prepared by hand; confirm.
point = pd.read_excel(final_dataset_path+'Final_Point_table.xlsx')
point

Unnamed: 0,Team,Mat,Won,Lost,Tied,NR,Pts,NRR
0,India,9,9,0,0,0,18,2.57
1,South Africa,9,7,2,0,0,14,1.261
2,Australia,9,7,2,0,0,14,0.841
3,New Zealand,9,5,4,0,0,10,0.743
4,Pakistan,9,4,5,0,0,8,-0.199
5,Afganistan,9,4,5,0,0,8,-0.336
6,England,9,3,6,0,0,6,-0.572
7,Bangladesh,9,2,7,0,0,4,-1.087
8,Sri Lanka,9,2,7,0,0,4,-1.419
9,Netherlands,9,2,7,0,0,4,-1.825


In [None]:
# Save the points table as CSV in the dataset folder, without the index column.
point.to_csv(final_dataset_path+'FinalPointTable.csv',index=False)

## 7. Play and Win Score on Ground

In [None]:
# Reload the historical matches from the dataset folder. historical_matches.csv is
# written to final_dataset_path by an earlier cell; the name previously used here
# (datasets_path) is not defined in any visible cell.
historical_matches = pd.read_csv(final_dataset_path+'historical_matches.csv')
historical_matches.head()

Unnamed: 0,Date,Team_1,Team_2,Winner,Margin,Ground,City
0,2015-01-08,Afghanistan,Scotland,Afghanistan,8 wickets,ICC Academy,Dubai
1,2015-01-11,New Zealand,Sri Lanka,New Zealand,3 wickets,Hagley Oval,Christchurch
2,2015-01-12,Ireland,Scotland,Ireland,3 wickets,Dubai International Cricket Stadium,
3,2015-01-14,Afghanistan,Scotland,Scotland,150 runs,Sheikh Zayed Stadium,Abu Dhabi
4,2015-01-15,New Zealand,Sri Lanka,Sri Lanka,6 wickets,Seddon Park,Hamilton


In [None]:
# (rows, columns) of the reloaded historical matches.
historical_matches.shape

(1058, 7)

In [None]:
# Restrict the historical matches to the World Cup host cities.
# NOTE(review): worldcup_cities is not defined in any cell visible here — confirm where it is set.
mask = historical_matches['City'].isin(worldcup_cities)
req_city_data = historical_matches.loc[mask]

# Matches per city, as a two-column frame (City, Match_Count)
city_match_count = (
    req_city_data
    .groupby('City')
    .size()
    .reset_index(name='Match_Count')
)
# Total number of matches across those cities, then the per-city breakdown
print(city_match_count['Match_Count'].sum())
print(city_match_count)

33
         City  Match_Count
0   Ahmedabad            3
1   Bengaluru            2
2     Chennai            4
3       Delhi            3
4  Dharamsala            1
5   Hyderabad            2
6     Kolkata            3
7     Lucknow            4
8      Mumbai            5
9        Pune            6


In [None]:
# Team/city/winner columns of the historical matches played in the host cities.
req_data = req_city_data[['Team_1', 'Team_2', 'City', 'Winner']].reset_index(drop=True)

# Reload the 2023 results from the dataset folder and keep the first 45 rows.
# result23.csv is written to final_dataset_path by an earlier cell; the name
# previously used here (datasets_path) is not defined in any visible cell.
result23 = pd.read_csv(final_dataset_path+'result23.csv')
result23 = result23[['Team_1', 'Team_2', 'City', 'Winner']].reset_index(drop=True)[:45]

# Stack historical and 2023 matches; each part keeps its own index labels.
final_ground_stats = pd.concat([req_data, result23])


final_ground_stats.head()

Unnamed: 0,Team_1,Team_2,City,Winner
0,India,South Africa,Chennai,India
1,India,South Africa,Mumbai,South Africa
2,India,New Zealand,Delhi,New Zealand
3,India,England,Pune,India
4,India,England,Kolkata,England


In [None]:
# Number of matches in which each team appears in the Team_1 column, and
# separately in the Team_2 column.
req_team1_data = final_ground_stats.groupby('Team_1').size().reset_index(name='Match_Count')
req_team2_data = final_ground_stats.groupby('Team_2').size().reset_index(name='Match_Count')

In [None]:
# Print both per-team count tables one after the other.
print(req_team1_data)
print(req_team2_data)

          Team_1  Match_Count
0    Afghanistan            4
1      Australia           10
2     Bangladesh            2
3        England            8
4          India           30
5    Netherlands            3
6    New Zealand            7
7       Pakistan            5
8   South Africa            6
9      Sri Lanka            1
10   West Indies            2
          Team_2  Match_Count
0    Afghanistan            8
1      Australia            8
2     Bangladesh            7
3        England            6
4          India            9
5    Netherlands            6
6    New Zealand            6
7       Pakistan            4
8   South Africa            7
9      Sri Lanka           10
10   West Indies            7


In [None]:
# Distinct cities among the filtered historical matches, and distinct teams from
# the points table.
cities = list(req_city_data['City'].unique())
teams = list(points_table['Team'].unique())

In [None]:
# One dict per city: the city under 'Ground', then for each team the number of
# matches at that city in which the team appears as Team_1 or Team_2.
GroundPlayScore = []
for city in cities:
  at_city = final_ground_stats['City'] == city
  instance = {'Ground': city}
  for team in teams:
    involved = (final_ground_stats['Team_1'] == team) | (final_ground_stats['Team_2'] == team)
    instance[team] = (involved & at_city).sum()
  GroundPlayScore.append(instance)

In [None]:
# Display the per-city play counts as a table (one row per city, one column per team).
pd.DataFrame(GroundPlayScore)

Unnamed: 0,Ground,South Africa,India,Australia,New Zealand,Pakistan,Afghanistan,Sri Lanka,Netherlands,Bangladesh,England
0,Chennai,2,5,3,2,2,2,0,0,1,0
1,Mumbai,3,6,3,1,0,1,1,0,1,1
2,Delhi,2,4,2,1,0,2,2,1,1,1
3,Pune,1,7,1,2,0,1,1,1,2,5
4,Kolkata,1,4,1,0,2,0,1,1,2,2
5,Bengaluru,0,3,3,2,2,0,2,1,0,1
6,Dharamsala,1,2,1,2,0,1,1,1,2,1
7,Hyderabad,0,2,1,2,2,0,1,2,0,0
8,Lucknow,2,2,2,0,0,4,2,2,0,1
9,Ahmedabad,1,4,1,1,1,1,0,0,0,2


In [None]:
# One dict per city: the city under 'Ground', then for each team its share of
# wins among the matches it played at that city, formatted as a two-decimal string.
GroundWinScore = []
for city in cities:
  at_city = final_ground_stats['City'] == city
  instance = {'Ground': city}
  for team in teams:
    involved = (final_ground_stats['Team_1'] == team) | (final_ground_stats['Team_2'] == team)
    played = (involved & at_city).sum()
    won = ((final_ground_stats['Winner'] == team) & at_city).sum()
    # A team with no match at this city gets the fixed fallback value 0.33.
    win_score = won / played if played > 0 else 0.33
    instance[team] = '%.2f' % (win_score)
  GroundWinScore.append(instance)

In [None]:
# Turn the per-city win scores into a DataFrame and save it next to the other
# datasets. final_dataset_path is the folder every other dataset is written to;
# the name previously used here (datasets_path) is not defined in any visible cell.
GroundWinScore = pd.DataFrame(GroundWinScore)
GroundWinScore.to_csv(final_dataset_path+'GroundWinScore.csv', index=False)

In [None]:
# Display the per-city win-score table.
GroundWinScore

Unnamed: 0,Ground,South Africa,India,Australia,New Zealand,Pakistan,Afghanistan,Sri Lanka,Netherlands,Bangladesh,England
0,Chennai,0.5,0.6,0.33,1.0,0.0,0.5,0.33,0.33,0.0,0.33
1,Mumbai,1.0,0.5,0.67,1.0,0.33,0.0,0.0,0.33,0.0,0.0
2,Delhi,0.5,0.5,1.0,1.0,0.33,0.5,0.0,0.0,1.0,0.0
3,Pune,1.0,0.71,1.0,0.0,0.33,1.0,0.0,0.0,0.0,0.4
4,Kolkata,0.0,0.75,0.0,0.33,0.5,0.33,0.0,1.0,0.0,1.0
5,Bengaluru,0.33,0.67,0.67,0.5,0.5,0.33,0.5,0.0,0.33,0.0
6,Dharamsala,0.0,0.5,1.0,0.0,0.33,0.0,1.0,1.0,0.5,1.0
7,Hyderabad,0.33,1.0,0.0,0.5,1.0,0.33,0.0,0.0,0.33,0.33
8,Lucknow,1.0,0.5,0.5,0.33,0.33,0.25,0.5,0.0,0.33,0.0
9,Ahmedabad,1.0,1.0,1.0,1.0,0.0,0.0,0.33,0.33,0.33,0.0
