In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
from datetime import datetime, timedelta

# Function to get game results for a specific date
def get_game_results(year, month, day):
    url = f"https://www.ncaa.com/scoreboard/soccer-men/d1/{year}/{month:02d}/{day:02d}/all-conf"
    print(f"Fetching data from: {url}")

    response = requests.get(url)
    if response.status_code != 200:
        print(f"Failed to retrieve data for {year}-{month:02d}-{day:02d}: {response.status_code}")
        return None

    soup = BeautifulSoup(response.text, 'html.parser')
    games = []

    # Find game containers
    for game in soup.find_all("div", class_="gamePod"):
        try:
            date = f"{year}-{month:02d}-{day:02d}"
            status = game.find("div", class_="gamePod-status").text.strip()  # e.g., "FINAL"
            round_info = game.find("span", class_="game-round")
            round_name = round_info.text.strip() if round_info else "Unknown Round"

            teams = game.find_all("span", class_="gamePod-game-team-name")
            scores = game.find_all("span", class_="gamePod-game-team-score")
            ranks = game.find_all("span", class_="gamePod-game-team-rank")

            team1 = teams[0].text.strip() if len(teams) > 0 else "Unknown"
            team2 = teams[1].text.strip() if len(teams) > 1 else "Unknown"
            score1 = scores[0].text.strip() if len(scores) > 0 else "N/A"
            score2 = scores[1].text.strip() if len(scores) > 1 else "N/A"
            rank1 = ranks[0].text.strip() if len(ranks) > 0 else "N/A"
            rank2 = ranks[1].text.strip() if len(ranks) > 1 else "N/A"

            winner_class = game.find_all("li", class_="winner")
            winner = winner_class[0].find("span", class_="gamePod-game-team-name").text.strip() if winner_class else "N/A"

            games.append([date, round_name, status, team1, rank1, score1, team2, rank2, score2, winner])

        except Exception as e:
            print(f"Error parsing game data for {date}: {e}")

    return games

# Set the range of years
START_YEAR = 2014
END_YEAR = 2024

# Loop through each year (only August–December)
for year in range(START_YEAR, END_YEAR + 1):
    all_games = []
    start_date = datetime(year, 8, 1)  # August 1st
    end_date = datetime(year, 12, 31)  # December 31st
    current_date = start_date

    while current_date <= end_date:
        month, day = current_date.month, current_date.day
        games_data = get_game_results(year, month, day)

        if games_data:
            all_games.extend(games_data)

        # Respectful scraping: delay between requests
        time.sleep(1.5)

        # Move to the next day
        current_date += timedelta(days=1)

    # Save data for the year (if any games were found)
    if all_games:
        df = pd.DataFrame(all_games, columns=["Date", "Round", "Status", "Team 1", "Rank 1", "Score 1", "Team 2", "Rank 2", "Score 2", "Winner"])
        filename = f"ncaa_womens_soccer_results_{year}.csv"
        df.to_csv(filename, index=False)
        print(f"Data for {year} (Aug–Dec) saved to {filename}")
    else:
        print(f"No games found for {year} (Aug–Dec)")

print("Scraping completed for the last 10 seasons (August–December).")

Fetching data from: https://www.ncaa.com/scoreboard/soccer-men/d1/2014/08/01/all-conf


KeyboardInterrupt: 

In [1]:
import math

class EloRatingSystem:
    def __init__(self, k=32, initial_rating=1500):
        self.k = k  # Standard Elo adjustment factor
        self.ratings = {}  # Dictionary to store team ratings
        self.initial_rating = initial_rating  # Default initial Elo rating


    def expected_score(self, rating1, rating2):
        return 1 / (1 + 10 ** ((rating2 - rating1) / 400))

    def update_ratings(self, team1, team2, score1, score2):
        if team1 not in self.ratings:
            self.ratings[team1] = self.initial_rating
        if team2 not in self.ratings:
            self.ratings[team2] = self.initial_rating

        rating1, rating2 = self.ratings[team1], self.ratings[team2]
        exp_score1 = self.expected_score(rating1, rating2)
        exp_score2 = self.expected_score(rating2, rating1)

        actual_score1 = 1 if score1 > score2 else 0.5 if score1 == score2 else 0
        actual_score2 = 1 - actual_score1

        # Margin of Victory Multiplier
        mov = abs(score1 - score2)
        mov_multiplier = math.log(mov + 1) * (2.2 / (1 + 0.001 * abs(rating1 - rating2)))

        # Update ratings
        self.ratings[team1] += self.k * mov_multiplier * (actual_score1 - exp_score1)
        self.ratings[team2] += self.k * mov_multiplier * (actual_score2 - exp_score2)

    def normalize_end_of_season(self):
        """Applies regression to the mean to avoid long-term inflation/deflation."""
        for team in self.ratings:
            if self.ratings[team] > self.initial_rating:
                self.ratings[team] -= (self.ratings[team] - self.initial_rating) * 0.25
            else:
                self.ratings[team] += (self.initial_rating - self.ratings[team]) * 0.25

    def get_ratings(self):
        return self.ratings







In [2]:
from google.colab import files

# This will prompt you to upload files
uploaded = files.upload()

# Displaying the names of the uploaded files
for filename in uploaded.keys():
    print('User uploaded file "{name}" with length {length} bytes'.format(
        name=filename, length=len(uploaded[filename])))




MessageError: RangeError: Maximum call stack size exceeded.

In [None]:
elo_system = EloRatingSystem()


In [None]:
import pandas as pd
import io

# Assuming 'uploaded' is the dictionary returned by files.upload() containing the content of the files
all_data = []
for filename, content in uploaded.items():
    df = pd.read_csv(io.BytesIO(content))  # Read each file from memory
    all_data.append(df)

# Concatenate all the DataFrames into one DataFrame
combined_data = pd.concat(all_data, ignore_index=True)
combined_data['Date'] = pd.to_datetime(combined_data['Date'])  # Convert 'Date' to datetime
combined_data.sort_values('Date', inplace=True)  # Sort the data by date

# Show the first few rows to confirm
print(combined_data.head())


            Date          Round Status            Team 1  Rank 1  Score 1  \
17770 2014-08-29  Unknown Round  Final       Georgia St.     NaN      0.0   
17802 2014-08-29  Unknown Round  Final    Incarnate Word     NaN      0.0   
17801 2014-08-29  Unknown Round  Final            DePaul     NaN      1.0   
17800 2014-08-29  Unknown Round  Final          Campbell     NaN      1.0   
17799 2014-08-29  Unknown Round  Final  St. Francis (NY)     NaN      0.0   

               Team 2  Rank 2  Score 2          Winner  
17770          Mercer     NaN      2.0          Mercer  
17802    Missouri St.     NaN      1.0    Missouri St.  
17801       Milwaukee     NaN      2.0       Milwaukee  
17800  UNC Greensboro     NaN      0.0        Campbell  
17799  Saint Joseph's     NaN      1.0  Saint Joseph's  


In [None]:
import pandas as pd
import difflib

# Standardized names mapping
name_corrections = {
    "Mich. St.", "Michigan St.", "S.C. Upstate", "USC Upstate" "UC Santa Barbara", "UC Santa Barb.",
    "San Diego St.", "San Diego State", "NC State", "North Carolina St.",
    "St. Mary's (Cal.)", "Saint Mary's (CA)", "LIU Brooklyn", "LIU", "Long Island",
    "South Fla.", "South Florida", "FDU", "Fairleigh Dickinson",
    "St. Francis (NY)", "St. Francis (PA)", "St. Francis (Pa.)", "Saint Francis (PA)"

}


# Apply corrections directly to the existing columns
combined_data["Team 1"] = combined_data["Team 1"].replace(name_corrections)
combined_data["Team 2"] = combined_data["Team 2"].replace(name_corrections)

# Display the corrected dataset
print(combined_data)

            Date          Round      Status            Team 1  Rank 1  \
17770 2014-08-29  Unknown Round       Final       Georgia St.     NaN   
17802 2014-08-29  Unknown Round       Final    Incarnate Word     NaN   
17801 2014-08-29  Unknown Round       Final            DePaul     NaN   
17800 2014-08-29  Unknown Round       Final          Campbell     NaN   
17799 2014-08-29  Unknown Round       Final  St. Francis (NY)     NaN   
...          ...            ...         ...               ...     ...   
1933  2024-12-07  Quarterfinals       FINAL     Massachusetts     NaN   
1934  2024-12-08  Quarterfinals       FINAL          Marshall    13.0   
1935  2024-12-13     Semifinals       FINAL          Marshall    13.0   
1936  2024-12-13     Semifinals    FINAL/PK           Vermont     NaN   
1937  2024-12-16   Championship  FINAL (OT)           Vermont     NaN   

       Score 1          Team 2  Rank 2  Score 2          Winner  
17770      0.0          Mercer     NaN      2.0          

  combined_data["Team 1"] = combined_data["Team 1"].replace(name_corrections)
  combined_data["Team 2"] = combined_data["Team 2"].replace(name_corrections)


In [None]:
# Update ratings based on the game results
for index, row in combined_data.iterrows():
    if pd.notna(row['Score 1']) and pd.notna(row['Score 2']):
        score1, score2 = int(row['Score 1']), int(row['Score 2'])
        elo_system.update_ratings(row['Team 1'], row['Team 2'], score1, score2)

# Extract and display the final ratings
final_ratings = pd.DataFrame(list(elo_system.get_ratings().items()), columns=['Team', 'Rating']).sort_values(by='Rating', ascending=False)
print(final_ratings)



                  Team       Rating
165            Clemson  2131.869570
35            Ohio St.  2046.015726
67            Marshall  2018.390291
134            Vermont  1989.962296
94            Stanford  1984.592343
..                 ...          ...
252   Mount St. Mary's  1040.935959
32              Howard  1037.539624
164            Radford  1023.065880
154           Hartford  1022.248198
131  Central Conn. St.   945.242588

[479 rows x 2 columns]


In [None]:
def display_elo_ratings(elo_system):
    # Ensure the ratings are sorted by Elo rating value. This sorts from highest to lowest by default.
    sorted_ratings = sorted(elo_system.ratings.items(), key=lambda x: x[1], reverse=True)

    # Print the sorted ratings
    print("Team Names and Their Elo Ratings:")
    for team, rating in sorted_ratings:
        print(f"{team}: {rating:.2f}")


display_elo_ratings(elo_system)


Team Names and Their Elo Ratings:
Clemson: 2131.87
Ohio St.: 2046.02
Marshall: 2018.39
Vermont: 1989.96
Stanford: 1984.59
Indiana: 1976.07
Denver: 1944.81
Western Mich.: 1943.57
Duke: 1942.19
SMU: 1939.27
Wake Forest: 1931.31
North Carolina: 1930.73
New Hampshire: 1926.52
Georgetown: 1921.91
Dayton: 1916.59
Notre Dame: 1913.15
West Virginia: 1902.09
Akron: 1901.19
Portland: 1899.76
Pittsburgh: 1899.43
San Diego: 1888.80
Virginia: 1888.58
Oregon St.: 1886.76
St. Mary's (Cal.): 1857.82
Washington: 1855.33
Hofstra: 1846.01
UCLA: 1841.40
Missouri St.: 1838.56
Seattle U: 1834.57
Providence: 1828.44
Charlotte: 1825.46
Cornell: 1814.63
Syracuse: 1814.44
Penn: 1810.70
NC State: 1808.53
Kentucky: 1796.26
Kansas City: 1790.35
FIU: 1786.17
James Madison: 1785.56
California: 1780.11
Creighton: 1779.41
Louisville: 1776.81
Princeton: 1763.49
Michigan: 1757.03
St. John's (NY): 1754.35
Saint Louis: 1753.72
UCF: 1750.88
Mich. St.: 1747.72
Albany (NY): 1742.73
Seattle: 1731.64
Northwestern: 1729.30
Mass

In [None]:
def calculate_win_probability(elo_system, team1, team2):
    if team1 in elo_system.ratings and team2 in elo_system.ratings:
        rating1 = elo_system.ratings[team1]
        rating2 = elo_system.ratings[team2]
        expected_score_team1 = 1 / (1 + 10 ** ((rating2 - rating1) / 400))
        expected_score_team2 = 1 - expected_score_team1
        return expected_score_team1, expected_score_team2
    else:
        return None  # Handle case where one or both teams are not in the ratings




In [None]:
team1 = input("Enter the name of Team 1: ")
team2 = input("Enter the name of Team 2: ")

probabilities = calculate_win_probability(elo_system, team1, team2)
if probabilities:
    print(f"Probability of {team1} winning: {probabilities[0] * 100:.2f}%")
    print(f"Probability of {team2} winning: {probabilities[1] * 100:.2f}%")
else:
    print("One or both of the teams are not found in the current Elo ratings.")


Enter the name of Team 1: Dartmouth
Enter the name of Team 2: Yale
Probability of Dartmouth winning: 38.84%
Probability of Yale winning: 61.16%
