<a href="https://colab.research.google.com/github/mifta10/web_scrapping/blob/main/Web_Scrapping.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [15]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

# URL of the ICC Men's T20I Team Rankings page
url = 'https://en.m.wikipedia.org/wiki/ICC_Men%27s_T20I_Team_Rankings'

# Fetch the page
response = requests.get(url)

# Parse the HTML content
soup = BeautifulSoup(response.text, 'html.parser')

# Find the rankings table by searching for the wikitable class
# This selector might need adjustment if the page structure changes
table = soup.find('table', {'class': 'wikitable'})

# Initialize an empty list to hold all row data
data = []



In [18]:
# Extract data from each row of the table
for row in table.find_all('tr'):
    cols = row.find_all(['th', 'td'])  # This gets both headers and data cells
    cols = [ele.text.strip() for ele in cols]
    data.append(cols)

In [22]:
print(data)

[["ICC Men's T20I Team Rankings vte"], ['Rank', 'Team', 'Matches', 'Points', 'Rating'], ['1', 'India', '71', '18,867', '266'], ['2', 'New Zealand', '60', '15,387', '256'], ['3', 'England', '48', '12,305', '256'], ['4', 'Australia', '41', '10,349', '252'], ['5', 'Pakistan', '58', '14,454', '249'], ['6', 'South Africa', '37', '9,210', '249'], ['7', 'West Indies', '46', '11,201', '244'], ['8', 'Sri Lanka', '41', '9,574', '234'], ['9', 'Bangladesh', '46', '10,452', '227'], ['10', 'Afghanistan', '37', '7,443', '219'], ['11', 'Namibia', '28', '5,539', '198'], ['12', 'Ireland', '47', '9,117', '194'], ['13', 'Zimbabwe', '53', '10,222', '193'], ['14', 'Scotland', '18', '3,412', '190'], ['15', 'Netherlands', '19', '3,445', '181'], ['16', 'Nepal', '27', '4,796', '178'], ['17', 'United Arab Emirates', '38', '6,623', '174'], ['18', 'Oman', '24', '3,678', '153'], ['19', 'Papua New Guinea', '22', '3,173', '144'], ['20', 'Canada', '18', '2,528', '140'], ['21', 'Hong Kong', '28', '3,740', '134'], ['22'

In [25]:
 # Correct DataFrame to match the provided structure
# Ensuring the DataFrame matches expected columns: Rank, Team, Matches, Points, Rating
expected_columns = ["Rank", "Team", "Matches", "Points", "Rating"]
#Convert the list of data into a pandas DataFrame
df = pd.DataFrame(data, columns= expected_columns)  # The first row is the header

In [26]:
df

Unnamed: 0,Rank,Team,Matches,Points,Rating
0,ICC Men's T20I Team Rankings vte,,,,
1,Rank,Team,Matches,Points,Rating
2,1,India,71,18867,266
3,2,New Zealand,60,15387,256
4,3,England,48,12305,256
...,...,...,...,...,...
280,89,Mali,8,0,0
281,90,Croatia,10,0,0
282,91,Greece,6,0,0
283,"References: ICC T20I rankings, As of 12 Februa...",,,,


In [27]:
if not all(col in df.columns for col in expected_columns):
    print("DataFrame columns do not match expected structure. Please check the extracted data.")

# Save the DataFrame to a CSV file
csv_file_path = 'icc_mens_t20i_team_rankings.csv'
df.to_csv(csv_file_path, index=False)

print(f"Data extracted and saved to {csv_file_path} successfully.")

Data extracted and saved to icc_mens_t20i_team_rankings.csv successfully.


In [28]:
df_1 = pd.read_csv('/content/icc_mens_t20i_team_rankings.csv')

In [29]:
df

Unnamed: 0,Rank,Team,Matches,Points,Rating
0,ICC Men's T20I Team Rankings vte,,,,
1,Rank,Team,Matches,Points,Rating
2,1,India,71,18867,266
3,2,New Zealand,60,15387,256
4,3,England,48,12305,256
...,...,...,...,...,...
280,89,Mali,8,0,0
281,90,Croatia,10,0,0
282,91,Greece,6,0,0
283,"References: ICC T20I rankings, As of 12 Februa...",,,,


In [30]:
# Keep only rows where all values are strings
df = df[df.applymap(lambda x: isinstance(x, str)).all(axis=1)]

# Display the modified DataFrame
print(df)

     Rank         Team  Matches  Points  Rating
1    Rank         Team  Matches  Points  Rating
2       1        India       71  18,867     266
3       2  New Zealand       60  15,387     256
4       3      England       48  12,305     256
5       4    Australia       41  10,349     252
..    ...          ...      ...     ...     ...
278    87   Seychelles        7       0       0
279    88        Samoa        9       0       0
280    89         Mali        8       0       0
281    90      Croatia       10       0       0
282    91       Greece        6       0       0

[276 rows x 5 columns]


In [31]:
# Save the DataFrame to a CSV file
csv_file_path = 'icc_mens_t20i_team_rankings.csv'
df.to_csv(csv_file_path, index=False)

print(f"Data extracted and saved to {csv_file_path} successfully.")

Data extracted and saved to icc_mens_t20i_team_rankings.csv successfully.


In [32]:
# Remove duplicate rows
df = df.drop_duplicates()

# Display the modified DataFrame
print(df)

    Rank         Team  Matches  Points  Rating
1   Rank         Team  Matches  Points  Rating
2      1        India       71  18,867     266
3      2  New Zealand       60  15,387     256
4      3      England       48  12,305     256
5      4    Australia       41  10,349     252
..   ...          ...      ...     ...     ...
88    87   Seychelles        7       0       0
89    88        Samoa        9       0       0
90    89         Mali        8       0       0
91    90      Croatia       10       0       0
92    91       Greece        6       0       0

[92 rows x 5 columns]


In [33]:
# Save the DataFrame to a CSV file
csv_file_path = 'icc_mens_t20i_team_rankings_updated.csv'
df.to_csv(csv_file_path, index=False)

print(f"Data extracted and saved to {csv_file_path} successfully.")

Data extracted and saved to icc_mens_t20i_team_rankings_updated.csv successfully.
