In [None]:
# IMPORTANT: SOME KAGGLE DATA SOURCES ARE PRIVATE
# RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES.
import kagglehub
# Log in to Kaggle first; per the note above, some of the data sources used by
# this notebook are private and need an authenticated session.
kagglehub.login()


In [None]:
# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

# Fetch the competition data and keep the value returned by kagglehub.
# NOTE(review): presumably this is the local directory holding the CSV files;
# the loading cells further down use a hard-coded /kaggle/input path instead,
# so confirm which location is valid in the environment you run in.
march_machine_learning_mania_2025_path = kagglehub.competition_download('march-machine-learning-mania-2025')

print('Data source import complete.')


<div style="font-family: Arial, sans-serif; text-align: center; padding: 40px 20px; background-color: #f0f8ff; border-radius: 10px; box-shadow: 0 4px 8px rgba(0, 0, 0, 0.1); margin: 20px auto; max-width: 600px;">
  <h1 style="color: #3366cc; font-size: 2.5em; margin-bottom: 10px; text-shadow: 1px 1px 2px rgba(0, 0, 0, 0.1);">March Machine Learning Mania 2025</h1>
  <h2 style="color: #555; font-size: 1.5em; font-weight: 300; margin-bottom: 20px;">Forecast the 2025 NCAA Basketball Tournaments</h2>
  <p style="color: #666; line-height: 1.6;"><center>Get ready to predict the outcomes of the 2025 NCAA Men's and Women's Basketball Tournaments using machine learning.</center></p>
</div>

<h2 style="font-size: 2em; color: #007bff; border-bottom: 2px solid #ddd; padding-bottom: 10px; margin-bottom: 20px; font-family: sans-serif;">IMPORTING LIBRARIES</h2>

In [None]:
import pandas as pd
import numpy as np
import os

import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

import warnings
warnings.simplefilter('ignore')

<h2 style="font-size: 2em; color: #007bff; border-bottom: 2px solid #ddd; padding-bottom: 10px; margin-bottom: 20px; font-family: sans-serif;">READING AND CREATING A DATAFRAME</h2>

In [None]:
def read_csv_files_in_folder(folder_path):
    """Load every ``*.csv`` file in ``folder_path`` into a module-level DataFrame.

    Each file ``<name>.csv`` is read with ``pd.read_csv`` and bound to the
    global variable ``df_<name>`` (for example ``MTeams.csv`` -> ``df_MTeams``).
    Files with any other extension are ignored and sub-folders are not scanned.

    Parameters
    ----------
    folder_path : str
        Directory that contains the CSV files.

    Returns
    -------
    None
        Results are published through ``globals()``; progress and errors are
        printed. A file that cannot be parsed is reported and skipped so that
        the remaining files are still loaded.
    """
    # isdir (rather than exists) so that a path pointing at a regular file is
    # reported like a missing folder instead of crashing inside os.listdir.
    if not os.path.isdir(folder_path):
        print(f"Error: Folder '{folder_path}' does not exist.")
        return

    # os.listdir returns entries in arbitrary order; sort for a reproducible log.
    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith(".csv"):
            continue

        file_path = os.path.join(folder_path, filename)
        df_name = "df_" + os.path.splitext(filename)[0]

        try:
            df = pd.read_csv(file_path)
        except Exception as e:
            # Best effort: one unreadable file must not stop the other loads.
            print(f"Error reading {filename}: {e}")
        else:
            globals()[df_name] = df
            print(f"Read and created DataFrame: {df_name} from {filename}")

# Default to the folder Kaggle mounts for this competition. Outside Kaggle that
# folder does not exist, so fall back to the location stored by the
# kagglehub.competition_download cell (when that cell has been run).
folder_path = '/kaggle/input/march-machine-learning-mania-2025'
if not os.path.isdir(folder_path):
    folder_path = globals().get('march_machine_learning_mania_2025_path', folder_path)
read_csv_files_in_folder(folder_path)

<h2 style="font-size: 2em; color: #007bff; border-bottom: 2px solid #ddd; padding-bottom: 10px; margin-bottom: 20px; font-family: sans-serif;">CHECKING FOR MISSING VALUES</h2>

In [None]:
def load_all_csv_to_dict(folder_path):
    """Read every ``*.csv`` file in ``folder_path`` into a dict of DataFrames.

    Parameters
    ----------
    folder_path : str
        Directory that contains the CSV files (sub-folders are not scanned).

    Returns
    -------
    dict
        Maps each file name without its extension (``"MTeams"`` for
        ``MTeams.csv``) to the DataFrame read from it. The dict is empty when
        the folder is missing or holds no readable CSV file. Files that fail
        to parse are reported and skipped.
    """
    dataframes = {}
    # isdir (rather than exists) so that a path pointing at a regular file is
    # reported like a missing folder instead of crashing inside os.listdir.
    if not os.path.isdir(folder_path):
        print(f"Error: Folder '{folder_path}' does not exist.")
        return dataframes

    # os.listdir returns entries in arbitrary order; sort so the dict order
    # (and therefore every report built from it) is reproducible.
    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith(".csv"):
            continue

        file_path = os.path.join(folder_path, filename)
        df_name = os.path.splitext(filename)[0]

        try:
            df = pd.read_csv(file_path)
        except Exception as e:
            # Best effort: one unreadable file must not stop the other loads.
            print(f"Error reading {filename}: {e}")
        else:
            dataframes[df_name] = df
            print(f"Loaded {filename}")

    return dataframes

def check_missing_values(dataframes):
    """Print, for each table, the columns that contain missing values.

    Parameters
    ----------
    dataframes : dict
        Maps a table name to its DataFrame.

    Returns
    -------
    None
        The report is written to stdout: a header per table followed either by
        the per-column null counts (only columns with at least one null) or by
        a "no missing values" line.
    """
    for df_name, df in dataframes.items():
        print(f"\nDataFrame: {df_name}")
        null_counts = df.isnull().sum()

        # Guard clause: nothing to list when the table has no nulls at all.
        if not null_counts.sum() > 0:
            print("No missing values found.")
            continue

        print("Missing Values:")
        print(null_counts[null_counts > 0])

# Default to the folder Kaggle mounts for this competition. Outside Kaggle that
# folder does not exist, so fall back to the location stored by the
# kagglehub.competition_download cell (when that cell has been run).
folder_path = '/kaggle/input/march-machine-learning-mania-2025'
if not os.path.isdir(folder_path):
    folder_path = globals().get('march_machine_learning_mania_2025_path', folder_path)
dataframes = load_all_csv_to_dict(folder_path)

# An empty dict means nothing could be loaded, so there is nothing to report.
if dataframes:
    check_missing_values(dataframes)

In [None]:
# Name -> DataFrame lookup for every competition table loaded above.
# Keys are the CSV file names without the extension.
dfs = dict(
    # Shared reference tables
    Cities=df_Cities,
    Conferences=df_Conferences,
    # Men's tables
    MConferenceTourneyGames=df_MConferenceTourneyGames,
    MGameCities=df_MGameCities,
    MMasseyOrdinals=df_MMasseyOrdinals,
    MNCAATourneyCompactResults=df_MNCAATourneyCompactResults,
    MNCAATourneyDetailedResults=df_MNCAATourneyDetailedResults,
    MNCAATourneySeedRoundSlots=df_MNCAATourneySeedRoundSlots,
    MNCAATourneySeeds=df_MNCAATourneySeeds,
    MNCAATourneySlots=df_MNCAATourneySlots,
    MRegularSeasonCompactResults=df_MRegularSeasonCompactResults,
    MRegularSeasonDetailedResults=df_MRegularSeasonDetailedResults,
    MSeasons=df_MSeasons,
    MSecondaryTourneyCompactResults=df_MSecondaryTourneyCompactResults,
    MSecondaryTourneyTeams=df_MSecondaryTourneyTeams,
    MTeamCoaches=df_MTeamCoaches,
    MTeamConferences=df_MTeamConferences,
    MTeamSpellings=df_MTeamSpellings,
    MTeams=df_MTeams,
    # Submission templates and benchmark
    SampleSubmissionStage1=df_SampleSubmissionStage1,
    SampleSubmissionStage2=df_SampleSubmissionStage2,
    SeedBenchmarkStage1=df_SeedBenchmarkStage1,
    # Women's tables
    WConferenceTourneyGames=df_WConferenceTourneyGames,
    WGameCities=df_WGameCities,
    WNCAATourneyCompactResults=df_WNCAATourneyCompactResults,
    WNCAATourneyDetailedResults=df_WNCAATourneyDetailedResults,
    WNCAATourneySeeds=df_WNCAATourneySeeds,
    WNCAATourneySlots=df_WNCAATourneySlots,
    WRegularSeasonCompactResults=df_WRegularSeasonCompactResults,
    WRegularSeasonDetailedResults=df_WRegularSeasonDetailedResults,
    WSeasons=df_WSeasons,
    WSecondaryTourneyCompactResults=df_WSecondaryTourneyCompactResults,
    WSecondaryTourneyTeams=df_WSecondaryTourneyTeams,
    WTeamConferences=df_WTeamConferences,
    WTeamSpellings=df_WTeamSpellings,
    WTeams=df_WTeams,
)

<h2 style="font-size: 2em; color: #007bff; border-bottom: 2px solid #ddd; padding-bottom: 10px; margin-bottom: 20px; font-family: sans-serif;">SHAPE OF DATA</h2>

In [None]:
# Report the (rows, columns) shape of every table in the lookup.
for name in dfs:
    df = dfs[name]
    print(f"\n{name}: {df.shape} (Rows, Columns)")

<h2 style="font-size: 2em; color: #ff0000; border-bottom: 2px solid #000000; padding-bottom: 10px; margin-bottom: 20px; font-family: sans-serif;">EDA BEGINS</h2>

In [None]:
# Work on copies of the regular-season compact results so that columns added
# to df_men / df_women do not appear on the originally loaded frames.
df_men = df_MRegularSeasonCompactResults.copy()
df_women = df_WRegularSeasonCompactResults.copy()

## STATS

In [None]:
# Margin of victory = winner's score minus loser's score, per game.
df_men["WinMargin"] = df_men["WScore"].sub(df_men["LScore"])
df_women["WinMargin"] = df_women["WScore"].sub(df_women["LScore"])

# Summary statistics of every numeric column, men first and then women.
print("Men's Regular Season Stats:")
display(df_men.describe())

print("\nWomen's Regular Season Stats:")
display(df_women.describe())

<style>
.data-summary {
  font-family: sans-serif;
  line-height: 1.6;
  border: 1px solid #ddd;
  padding: 20px;
  border-radius: 5px;
  background-color: #f9f9f9;
}

.data-summary h2 {
  margin-top: 0;
  color: #333;
}

.data-summary ol {
  padding-left: 20px;
}

.data-summary li {
  margin-bottom: 15px;
}

.data-summary strong {
  font-weight: bold;
}

.data-summary p {
  margin-top: 5px;
  color: #555;
}
</style>

<div class="data-summary">
  <h2>Data Summary</h2>
  <ol>
    <li>
      <strong>Season Coverage</strong>
      <p>Men's data spans 1985 to 2025 (41 seasons).</p>
      <p>Women's data spans 1998 to 2025 (28 seasons).</p>
    </li>
    <li>
      <strong>Scoring Differences</strong>
      <p>Men's average winning score: 76.86</p>
      <p>Women's average winning score: 71.92</p>
      <p>Men's teams tend to score slightly higher on average.</p>
    </li>
    <li>
      <strong>Win Margins</strong>
      <p>Men’s average win margin: 12.08 points</p>
      <p>Women’s average win margin: 14.39 points</p>
      <p>Women’s games are decided by a slightly larger margin on average.</p>
    </li>
    <li>
      <strong>Maximum Scores</strong>
      <p>Men's highest game score: 186 points</p>
      <p>Women's highest game score: 140 points</p>
      <p>Women's highest win margin: 108 points (vs. 94 for men)</p>
      <p>→ Some very dominant performances in women's games.</p>
    </li>
    <li>
      <strong>Overtime Games</strong>
      <p>Men’s games: ~5% had overtime.</p>
      <p>Women’s games: ~4% had overtime.</p>
      <p>→ Similar distribution of close matches.</p>
    </li>
  </ol>
</div>

# Distribution of Win Margins

In [None]:
# Side-by-side histograms of win margin; the shared y axis keeps the two
# frequency scales directly comparable.
fig, axes = plt.subplots(nrows=1, ncols=2, sharey=True, figsize=(14, 5))

# Left panel: men's regular season.
axes[0].hist(df_men["WinMargin"], color="blue", bins=30, alpha=0.7)
axes[0].set(
    title="Men's Win Margin Distribution",
    xlabel="Win Margin",
    ylabel="Frequency",
)

# Right panel: women's regular season (y label is carried by the left panel).
axes[1].hist(df_women["WinMargin"], color="red", bins=30, alpha=0.7)
axes[1].set(
    title="Women's Win Margin Distribution",
    xlabel="Win Margin",
)

plt.tight_layout()
plt.show()

# Average Win Margins over Season

In [None]:
# Margin of victory per game, stored on the loaded regular-season frames.
df_MRegularSeasonCompactResults["WinMargin"] = df_MRegularSeasonCompactResults["WScore"] - df_MRegularSeasonCompactResults["LScore"]
df_WRegularSeasonCompactResults["WinMargin"] = df_WRegularSeasonCompactResults["WScore"] - df_WRegularSeasonCompactResults["LScore"]

# Mean margin of victory for each season (Series indexed by Season).
men_seasonal_win_margin = df_MRegularSeasonCompactResults.groupby("Season")["WinMargin"].mean()
women_seasonal_win_margin = df_WRegularSeasonCompactResults.groupby("Season")["WinMargin"].mean()

# Static matplotlib version of the chart.
plt.figure(figsize=(12, 6))
plt.plot(men_seasonal_win_margin.index, men_seasonal_win_margin, label="Men's Win Margin", marker='o')
plt.plot(women_seasonal_win_margin.index, women_seasonal_win_margin, label="Women's Win Margin", marker='s')
plt.title("Average Win Margin Over Seasons", fontsize=14)
plt.xlabel("Season", fontsize=12)
plt.ylabel("Average Win Margin", fontsize=12)
plt.legend()
plt.grid(True)
plt.show()

# Interactive Plotly version of the same chart.
fig = px.line(x=men_seasonal_win_margin.index, y=men_seasonal_win_margin, labels={'x': 'Season', 'y': 'Win Margin'},
              title="Average Win Margin Over Seasons (Men vs Women)", line_shape='linear')
# px.line creates a single unnamed trace that is hidden from the legend; name
# it so the legend identifies both lines instead of only the women's one.
fig.update_traces(name="Men's Win Margin", showlegend=True)
fig.add_scatter(x=women_seasonal_win_margin.index, y=women_seasonal_win_margin, mode='lines', name="Women's Win Margin")
fig.show()

# Average Winning Score by Season

In [None]:
# One panel per competition; seaborn aggregates WScore per Season for the line.
fig, ax = plt.subplots(nrows=1, ncols=2, sharey=True, figsize=(14, 5))

# Left panel: men.
sns.lineplot(data=df_men, x="Season", y="WScore", color="blue", label="Men", ax=ax[0])
ax[0].set(
    title="Men's Average Winning Score per Season",
    xlabel="Season",
    ylabel="Winning Score",
)

# Right panel: women (y label left blank because the y axis is shared).
sns.lineplot(data=df_women, x="Season", y="WScore", color="red", label="Women", ax=ax[1])
ax[1].set(
    title="Women's Average Winning Score per Season",
    xlabel="Season",
    ylabel="",
)

plt.tight_layout()
plt.show()

# Home Win Percentage for Men

In [None]:
# Count men's regular-season wins by the winner's location flag (WLoc).
home_wins_men = int(df_MRegularSeasonCompactResults["WLoc"].eq("H").sum())
away_wins_men = int(df_MRegularSeasonCompactResults["WLoc"].eq("A").sum())
neutral_games_men = int(df_MRegularSeasonCompactResults["WLoc"].eq("N").sum())

# Share of home wins among games won at home or away; games flagged "N" are
# left out of the denominator.
home_win_rate_men = home_wins_men / (home_wins_men + away_wins_men) * 100

print(f"🏀 Men's Home Win Rate: {home_win_rate_men:.2f}%")

# Home Win Percentage for Women

In [None]:
# Count women's regular-season wins by the winner's location flag (WLoc).
home_wins_women = int(df_WRegularSeasonCompactResults["WLoc"].eq("H").sum())
away_wins_women = int(df_WRegularSeasonCompactResults["WLoc"].eq("A").sum())
neutral_games_women = int(df_WRegularSeasonCompactResults["WLoc"].eq("N").sum())

# Share of home wins among games won at home or away; games flagged "N" are
# left out of the denominator.
home_win_rate_women = home_wins_women / (home_wins_women + away_wins_women) * 100

print(f"🏀 Women's Home Win Rate: {home_win_rate_women:.2f}%")

# Home vs. Away Win Margins

In [None]:
# Margin of victory per game for both regular-season tables.
df_MRegularSeasonCompactResults["WinMargin"] = df_MRegularSeasonCompactResults["WScore"].sub(df_MRegularSeasonCompactResults["LScore"])
df_WRegularSeasonCompactResults["WinMargin"] = df_WRegularSeasonCompactResults["WScore"].sub(df_WRegularSeasonCompactResults["LScore"])

# Split each table by the winner's location flag: "H" rows and "A" rows.
df_men_home = df_MRegularSeasonCompactResults[df_MRegularSeasonCompactResults["WLoc"].eq("H")]
df_men_away = df_MRegularSeasonCompactResults[df_MRegularSeasonCompactResults["WLoc"].eq("A")]

df_women_home = df_WRegularSeasonCompactResults[df_WRegularSeasonCompactResults["WLoc"].eq("H")]
df_women_away = df_WRegularSeasonCompactResults[df_WRegularSeasonCompactResults["WLoc"].eq("A")]

# One row per (competition, location) holding its mean margin of victory.
df_home_away = pd.DataFrame(
    [
        ("Men - Home", df_men_home["WinMargin"].mean()),
        ("Men - Away", df_men_away["WinMargin"].mean()),
        ("Women - Home", df_women_home["WinMargin"].mean()),
        ("Women - Away", df_women_away["WinMargin"].mean()),
    ],
    columns=["Category", "Win Margin"],
)

plt.figure(figsize=(8, 5))
sns.barplot(data=df_home_away, x="Category", y="Win Margin", palette=["blue", "blue", "red", "red"])
plt.title("Home vs. Away Win Margins")
plt.xlabel("")
plt.ylabel("Average Win Margin")
plt.xticks(rotation=15)
plt.show()

# Win Margins in NCAA Tournaments

In [None]:
# Work on copies so the loaded tournament tables stay untouched in this cell.
df_men_tourney = df_MNCAATourneyCompactResults.copy()
df_women_tourney = df_WNCAATourneyCompactResults.copy()

# Margin of victory for every men's and women's tournament game.
df_men_tourney["WinMargin"] = df_men_tourney["WScore"].sub(df_men_tourney["LScore"])
df_women_tourney["WinMargin"] = df_women_tourney["WScore"].sub(df_women_tourney["LScore"])

# Mean margin of victory per season.
men_win_margin_season = df_men_tourney.groupby("Season")["WinMargin"].mean()
women_win_margin_season = df_women_tourney.groupby("Season")["WinMargin"].mean()

plt.figure(figsize=(12, 6))
sns.lineplot(
    x=men_win_margin_season.index,
    y=men_win_margin_season.values,
    marker="o",
    label="Men's Tournament",
)
sns.lineplot(
    x=women_win_margin_season.index,
    y=women_win_margin_season.values,
    marker="o",
    linestyle="dashed",
    label="Women's Tournament",
)

plt.title("Trend of Win Margins in NCAA Tournaments", fontsize=14)
plt.xlabel("Season", fontsize=12)
plt.ylabel("Average Win Margin", fontsize=12)
plt.legend()
plt.grid(True)
plt.show()

# Seed Computing

In [None]:
# Pull the run of digits out of each seed code and store it as an integer.
# The pattern is a raw string: "\d" inside a normal string literal is an
# invalid escape sequence and triggers a warning on current Python versions.

# Men
df_MNCAATourneySeeds["SeedValue"] = df_MNCAATourneySeeds["Seed"].str.extract(r'(\d+)').astype(int)

# Women
df_WNCAATourneySeeds["SeedValue"] = df_WNCAATourneySeeds["Seed"].str.extract(r'(\d+)').astype(int)

In [None]:
def _attach_seed(results, seeds, team_col, seed_col):
    """Left-join the numeric seed of the team in ``team_col`` onto ``results``.

    The seed is looked up by (Season, team id) in ``seeds`` and stored in a
    new column named ``seed_col``; the helper ``TeamID`` join column is dropped.
    """
    joined = results.merge(
        seeds[["Season", "TeamID", "SeedValue"]],
        how="left",
        left_on=["Season", team_col],
        right_on=["Season", "TeamID"],
    )
    return joined.rename(columns={"SeedValue": seed_col}).drop(columns=["TeamID"])


# Men's tournament results with the winner's and the loser's seed attached.
df_men_tourney = _attach_seed(df_MNCAATourneyCompactResults, df_MNCAATourneySeeds, "WTeamID", "WSeed")
df_men_tourney = _attach_seed(df_men_tourney, df_MNCAATourneySeeds, "LTeamID", "LSeed")
df_men_tourney["WinMargin"] = df_men_tourney["WScore"] - df_men_tourney["LScore"]

# Women's tournament results with the winner's and the loser's seed attached.
df_women_tourney = _attach_seed(df_WNCAATourneyCompactResults, df_WNCAATourneySeeds, "WTeamID", "WSeed")
df_women_tourney = _attach_seed(df_women_tourney, df_WNCAATourneySeeds, "LTeamID", "LSeed")
df_women_tourney["WinMargin"] = df_women_tourney["WScore"] - df_women_tourney["LScore"]

display(df_men_tourney.head(), df_women_tourney.head(3))

In [None]:
# WSeed / LSeed come from the numeric SeedValue column, so a plain cast to
# float is enough; the previous round trip through str and a digit regex gave
# the same numbers and relied on an invalid "\d" escape in a non-raw string.
# Float is used so rows whose seed lookup found no match can hold NaN.
df_men_tourney["WSeed"] = df_men_tourney["WSeed"].astype(float)
df_men_tourney["LSeed"] = df_men_tourney["LSeed"].astype(float)
# Loser's seed minus winner's seed: positive when the better-seeded team won.
df_men_tourney["Seed_Diff"] = df_men_tourney["LSeed"] - df_men_tourney["WSeed"]

df_women_tourney["WSeed"] = df_women_tourney["WSeed"].astype(float)
df_women_tourney["LSeed"] = df_women_tourney["LSeed"].astype(float)
df_women_tourney["Seed_Diff"] = df_women_tourney["LSeed"] - df_women_tourney["WSeed"]

## Men's and Women's Seed Difference vs. Win Margin

In [None]:
def _plot_seed_diff_vs_margin(panel_index, tourney_df, panel_title):
    """Draw one panel: WinMargin against Seed_Diff with a fitted trendline."""
    plt.subplot(1, 2, panel_index)
    sns.scatterplot(x=tourney_df["Seed_Diff"], y=tourney_df["WinMargin"], alpha=0.5)
    # scatter=False: only the regression line is added on top of the points.
    sns.regplot(x=tourney_df["Seed_Diff"], y=tourney_df["WinMargin"], scatter=False, color="red")
    plt.axhline(0, color="black", linestyle="dashed", alpha=0.7)
    plt.xlabel("Seed Difference (LSeed - WSeed)")
    plt.ylabel("Win Margin")
    plt.title(panel_title)
    plt.grid(True)


plt.figure(figsize=(14, 6))

_plot_seed_diff_vs_margin(1, df_men_tourney, "Men's Tournament: Seed Difference vs. Win Margin")
_plot_seed_diff_vs_margin(2, df_women_tourney, "Women's Tournament: Seed Difference vs. Win Margin")

plt.tight_layout()
plt.show()

### Insights: Seed Difference vs. Win Margin in NCAA Tournaments with Trendlines:👆

**Strong Positive Correlation Confirmed:**

* **Trendline Visualization:** The addition of trendlines clearly illustrates the strong positive correlation between seed difference and win margin in both men's and women's tournaments.
* **Higher Seed Advantage:** As the seed difference increases (higher seeds playing lower seeds), the trendlines show a consistent increase in win margin, reinforcing the advantage of higher-seeded teams.

**Observations Across Tournaments:**

* **Similar Trendline Slopes:** The trendlines for both men's and women's tournaments have similar slopes, suggesting a consistent relationship between seed difference and win margin across genders.
* **Spread Around Trendlines:**
    * **Men's:** The men's data points are more tightly clustered around the trendline, indicating a slightly stronger correlation compared to the women's tournament.
    * **Women's:** The women's data points show a wider spread, suggesting that other factors might have a more significant impact on win margins in the women's tournament.
* **Negative Seed Differences:**
    * **Close Upsets:** For negative seed differences (lower seeds winning), the trendlines are close to the zero win margin line, confirming that upsets tend to be close games.
* **Positive Seed Differences:**
    * **Varied Margins:** For positive seed differences (higher seeds winning), the trendlines show increasing win margins, but the spread of data points indicates that the actual margins can vary widely.

# Seed Difference VS Win Margin in NCAA Tournaments

In [None]:
# Men's and women's tournament games overlaid on a single scatter plot.
plt.figure(figsize=(12, 6))

# Men's Tournament
sns.scatterplot(x=df_men_tourney["Seed_Diff"], y=df_men_tourney["WinMargin"], alpha=0.6, label="Men's Tournament", color="blue")

# Women's Tournament
sns.scatterplot(x=df_women_tourney["Seed_Diff"], y=df_women_tourney["WinMargin"], alpha=0.6, label="Women's Tournament", color="red")

plt.axvline(x=0, color="gray", linestyle="--", linewidth=1)  # Reference line at Seed Difference = 0
# Seed_Diff is computed as LSeed - WSeed (loser minus winner), which is why it
# can be negative; label the axis with that definition.
plt.xlabel("Seed Difference (Loser Seed - Winner Seed)", fontsize=12)
plt.ylabel("Win Margin", fontsize=12)
plt.title("Seed Difference vs. Win Margin in NCAA Tournaments", fontsize=14)
plt.legend()
plt.grid(True)
plt.show()

### Insights: Seed Difference vs. Win Margin in NCAA Tournaments👆

**General Trend: Higher Seed Advantage**

* **Positive Correlation:** Both men's and women's tournaments show a clear positive correlation between seed difference (the loser's seed minus the winner's seed) and win margin. This means that the larger the seeding gap in favour of the winner (a better-seeded team beating a worse-seeded one), the larger the win margin tends to be.
* **Dominance of Higher Seeds:** Higher-seeded teams generally win by larger margins, reinforcing the idea that seeding is a good indicator of team strength.

**Observations Across Tournaments**

* **Similar Patterns:** The overall patterns are quite similar for both men's and women's tournaments. This suggests that the relationship between seed difference and win margin is consistent across genders.
* **Spread of Data:**
    * For negative seed differences (lower seeds beating higher seeds), the win margins are generally smaller and more tightly clustered near the bottom of the graph, showing that upsets tend to be close games.
    * For positive seed differences (higher seeds beating lower seeds), the win margins are more widely spread, indicating that higher seeds can win by a wide variety of margins.

# Pearson correlation coefficient

In [None]:
# Pearson correlation between seed difference and margin of victory.
corr_men = df_men_tourney["Seed_Diff"].corr(df_men_tourney["WinMargin"], method="pearson")
corr_women = df_women_tourney["Seed_Diff"].corr(df_women_tourney["WinMargin"], method="pearson")

print(f"📌 Pearson Correlation (Men's Tournament): {corr_men:.4f}")
print(f"📌 Pearson Correlation (Women's Tournament): {corr_women:.4f}")

# Most Successful Teams in NCAA Tournament

In [None]:
# Tournament wins per team: count how often each id appears as the winner,
# then attach the team metadata.
men_team_wins = (
    df_MNCAATourneyCompactResults["WTeamID"]
    .value_counts()
    .rename_axis("TeamID")
    .reset_index(name="Total Wins")
    .merge(df_MTeams, on="TeamID", how="left")
)

women_team_wins = (
    df_WNCAATourneyCompactResults["WTeamID"]
    .value_counts()
    .rename_axis("TeamID")
    .reset_index(name="Total Wins")
    .merge(df_WTeams, on="TeamID", how="left")
)

# Ten teams with the most wins in each competition.
top_men_teams = men_team_wins.sort_values(by="Total Wins", ascending=False).head(10)
top_women_teams = women_team_wins.sort_values(by="Total Wins", ascending=False).head(10)

display(top_men_teams)
display(top_women_teams)

## Men's vs. Women's

In [None]:
# Horizontal bar charts of the ten winningest teams, men left and women right.
plt.figure(figsize=(14, 6))

# Left panel: men.
plt.subplot(1, 2, 1)
sns.barplot(data=top_men_teams, x="Total Wins", y="TeamName", palette="Blues_r")
plt.title("Top 10 Most Successful Men's Teams")
plt.xlabel("Total NCAA Tournament Wins")
plt.ylabel("Team")

# Right panel: women.
plt.subplot(1, 2, 2)
sns.barplot(data=top_women_teams, x="Total Wins", y="TeamName", palette="Purples_r")
plt.title("Top 10 Most Successful Women's Teams")
plt.xlabel("Total NCAA Tournament Wins")
plt.ylabel("Team")

plt.tight_layout()
plt.show()

# Dominant Teams per Decade

In [None]:
# Bucket a season year into its decade label.
def get_decade(year):
    """Return the decade label for ``year``, e.g. 1985 -> "1980s"."""
    decade_start = year - (year % 10)
    return f"{decade_start}s"

# Tag every tournament game with the decade of its season.
df_MNCAATourneyCompactResults["Decade"] = df_MNCAATourneyCompactResults["Season"].map(get_decade)
df_WNCAATourneyCompactResults["Decade"] = df_WNCAATourneyCompactResults["Season"].map(get_decade)

# Number of tournament wins per (decade, winning team).
men_decade_wins = (
    df_MNCAATourneyCompactResults
    .groupby(["Decade", "WTeamID"])
    .size()
    .reset_index(name="Total Wins")
)
women_decade_wins = (
    df_WNCAATourneyCompactResults
    .groupby(["Decade", "WTeamID"])
    .size()
    .reset_index(name="Total Wins")
)

# Attach team metadata for display and plotting.
men_decade_wins = pd.merge(men_decade_wins, df_MTeams, how="left", left_on="WTeamID", right_on="TeamID")
women_decade_wins = pd.merge(women_decade_wins, df_WTeams, how="left", left_on="WTeamID", right_on="TeamID")

In [None]:
# Order by decade (oldest first) and by wins (most first), then keep the
# first five rows of each decade.
top_men_teams_decade = (
    men_decade_wins
    .sort_values(["Decade", "Total Wins"], ascending=[True, False])
    .groupby("Decade")
    .head(n=5)
)
top_women_teams_decade = (
    women_decade_wins
    .sort_values(["Decade", "Total Wins"], ascending=[True, False])
    .groupby("Decade")
    .head(n=5)
)

display(top_men_teams_decade)
display(top_women_teams_decade)

## Men's vs. Women's

In [None]:
def _plot_top_teams_per_decade(panel_index, decade_table, bar_palette, panel_title):
    """Draw one grouped bar panel: wins per decade, one bar colour per team."""
    plt.subplot(1, 2, panel_index)
    sns.barplot(data=decade_table, x="Decade", y="Total Wins", hue="TeamName", palette=bar_palette)
    plt.xlabel("Decade")
    plt.ylabel("Total Wins")
    plt.title(panel_title)
    plt.xticks(rotation=45)
    plt.legend(title="Team", bbox_to_anchor=(1, 1))


plt.figure(figsize=(16, 8))

_plot_top_teams_per_decade(1, top_men_teams_decade, "Blues_r", "Top 5 Most Dominant Men's Teams per Decade")
_plot_top_teams_per_decade(2, top_women_teams_decade, "Purples_r", "Top 5 Most Dominant Women's Teams per Decade")

plt.tight_layout()
plt.show()

### Insights from Top 5 Most Dominant Teams per Decade Analysis 👆

**Men's Tournament: Shifting Dominance and Varied Success:**

* **Decade-Specific Leaders:** The dominant teams vary significantly across decades, indicating a dynamic landscape in men's college basketball.
* **Consistency vs. Emergence:** Some teams (e.g., Duke, Kansas) show consistent dominance across multiple decades, while others (e.g., Gonzaga, Houston) have emerged as top contenders more recently.
* **Win Variation:** The total wins achieved by the top teams fluctuate, suggesting varying levels of overall strength and competition within each decade.
* **Recent Trends:** The 2010s and 2020s show a wider spread of dominant teams, possibly indicating increased parity.

**Women's Tournament: Long-Term Dominance and Emerging Powers:**

* **Long-Standing Powerhouses:** Teams like Louisiana Tech, Tennessee, and Connecticut have established a legacy of dominance, particularly in the earlier decades.
* **Era of Connecticut:** Connecticut's sustained high win totals across multiple decades highlight their exceptional program strength.
* **Rise of New Contenders:** While established teams remain strong, newer contenders like Baylor, Louisville, and South Carolina have emerged as top programs in recent decades.
* **High Win Totals:** The women's chart generally displays higher total wins compared to the men's, suggesting potentially greater consistency and dominance within the top programs.
* **Iowa's Recent Emergence:** Iowa's appearance in the 2020s demonstrates the ever-changing landscape of women's basketball.

# Cinderella Runs, Who is the Underdog??👀

In [None]:
# Attach the winning team's seed record to every tournament game.
df_men_cinderella = df_MNCAATourneyCompactResults.merge(df_MNCAATourneySeeds, left_on=["Season", "WTeamID"], right_on=["Season", "TeamID"], how="left")
df_women_cinderella = df_WNCAATourneyCompactResults.merge(df_WNCAATourneySeeds, left_on=["Season", "WTeamID"], right_on=["Season", "TeamID"], how="left")

# Replace the seed code with the number it contains. Raw string: "\d" in a
# normal string literal is an invalid escape sequence and triggers a warning
# on current Python versions.
df_men_cinderella["Seed"] = df_men_cinderella["Seed"].str.extract(r"(\d+)").astype(float)
df_women_cinderella["Seed"] = df_women_cinderella["Seed"].str.extract(r"(\d+)").astype(float)

# Tournament wins per (season, team, seed).
men_wins = df_men_cinderella.groupby(["Season", "WTeamID", "Seed"]).size().reset_index(name="Wins")
women_wins = df_women_cinderella.groupby(["Season", "WTeamID", "Seed"]).size().reset_index(name="Wins")

# "Cinderella" here means a team seeded 10 or worse that won at least 3 games.
cinderella_men = men_wins[(men_wins["Seed"] >= 10) & (men_wins["Wins"] >= 3)]
cinderella_women = women_wins[(women_wins["Seed"] >= 10) & (women_wins["Wins"] >= 3)]

# Attach team metadata for display and plotting.
cinderella_men = cinderella_men.merge(df_MTeams, left_on="WTeamID", right_on="TeamID", how="left")
cinderella_women = cinderella_women.merge(df_WTeams, left_on="WTeamID", right_on="TeamID", how="left")

display(cinderella_men)
display(cinderella_women)

In [None]:
# Two grouped bar charts of Cinderella runs: x = season, bar height = number
# of tournament wins, one hue per team. Men on the left, women on the right.
plt.figure(figsize=(16, 8))

# Men's Cinderella Plot
plt.subplot(1, 2, 1)
# NOTE(review): width=8 is far above the value the women's panel gets by
# default, so bars may be drawn away from their own season tick — confirm the
# rendered chart is what was intended.
sns.barplot(data=cinderella_men, x="Season", y="Wins", hue="TeamName", palette="Greens_r", dodge=True, width=8)  # Increased bar width
plt.xlabel("Season")
plt.ylabel("Tournament Wins")
plt.title("Men's Cinderella Teams (Seed ≥10, Wins ≥3)")
plt.xticks(rotation=45)
plt.legend(title="Team", bbox_to_anchor=(1.02, 1), loc='upper left')  # Adjusted legend position for clarity

# Women's Cinderella Plot
plt.subplot(1, 2, 2)
sns.barplot(data=cinderella_women, x="Season", y="Wins", hue="TeamName", palette="Oranges_r")
plt.xlabel("Season")
plt.ylabel("Tournament Wins")
plt.title("Women's Cinderella Teams (Seed ≥10, Wins ≥3)")
plt.xticks(rotation=45)
plt.legend(title="Team", bbox_to_anchor=(1, 1))

plt.tight_layout()
plt.show()

### Insights from Cinderella Teams Analysis (Seed ≥10, Wins ≥3)👆

**Men's Tournament Shows More Cinderella Stories:**

* The men's tournament has seen a significantly larger number of Cinderella teams (seed ≥10, wins ≥3) compared to the women's tournament.
* This suggests that deep runs by double-digit seeds are more frequent in the men's competition. (Three wins normally means reaching the Elite Eight, or the Sweet 16 for a team that also won a First Four play-in game.)

**Women's Cinderella Teams Are Less Frequent but Still Notable:**

* While less frequent, the women's tournament has also witnessed Cinderella runs. The chart highlights Gonzaga, Oregon, and Creighton's achievements.
* These instances demonstrate that upsets and deep tournament runs are possible for lower-seeded teams in the women's bracket as well.

**Temporal Distribution of Cinderella Runs:**

* **Men's:** Cinderella runs are spread across various years, with some periods seeing multiple upsets (e.g., late 2000s, mid-2010s). This indicates that there's no single era where upsets are concentrated.
* **Women's:** The women's Cinderella runs shown are clustered in the 2010s and early 2020s. Further analysis may reveal if there are any trends or factors contributing to this clustering.

**Performance Variation Among Cinderella Teams:**

* **Men's:** The number of wins for Cinderella teams in the men's tournament varies, with some going as far as the Final Four (4-5 wins) while others finish with 3 wins (typically an Elite Eight exit, or a Sweet 16 exit for teams that also won a First Four play-in game).
* **Women's:** The three highlighted women's teams have all achieved 3 wins, i.e. each won its Sweet 16 game and reached the Elite Eight.

# Clutch Factor – Close Game Performance

In [None]:
# A game counts as "close" when it was decided by at most this many points.
CLOSE_GAME_MARGIN = 5

# Men: regular season, then tournament.
df_men_clutch_regular = df_MRegularSeasonCompactResults[
    df_MRegularSeasonCompactResults["WScore"].sub(df_MRegularSeasonCompactResults["LScore"]).le(CLOSE_GAME_MARGIN)
]
df_men_clutch_tourney = df_MNCAATourneyCompactResults[
    df_MNCAATourneyCompactResults["WScore"].sub(df_MNCAATourneyCompactResults["LScore"]).le(CLOSE_GAME_MARGIN)
]

# Women: regular season, then tournament.
df_women_clutch_regular = df_WRegularSeasonCompactResults[
    df_WRegularSeasonCompactResults["WScore"].sub(df_WRegularSeasonCompactResults["LScore"]).le(CLOSE_GAME_MARGIN)
]
df_women_clutch_tourney = df_WNCAATourneyCompactResults[
    df_WNCAATourneyCompactResults["WScore"].sub(df_WNCAATourneyCompactResults["LScore"]).le(CLOSE_GAME_MARGIN)
]

# Number of close games found in each table.
print("Men's Regular Season Close Games:", len(df_men_clutch_regular))
print("Men's Tournament Close Games:", len(df_men_clutch_tourney))
print("Women's Regular Season Close Games:", len(df_women_clutch_regular))
print("Women's Tournament Close Games:", len(df_women_clutch_tourney))

# Close Game Win Percentages

In [None]:
# Margin of victory per game on all four result tables; the close-game
# computation below reads this column.
df_MRegularSeasonCompactResults["WinMargin"] = df_MRegularSeasonCompactResults["WScore"].sub(df_MRegularSeasonCompactResults["LScore"])
df_MNCAATourneyCompactResults["WinMargin"] = df_MNCAATourneyCompactResults["WScore"].sub(df_MNCAATourneyCompactResults["LScore"])

df_WRegularSeasonCompactResults["WinMargin"] = df_WRegularSeasonCompactResults["WScore"].sub(df_WRegularSeasonCompactResults["LScore"])
df_WNCAATourneyCompactResults["WinMargin"] = df_WNCAATourneyCompactResults["WScore"].sub(df_WNCAATourneyCompactResults["LScore"])


def compute_close_game_win_pct(df, season_type, close_margin=5):
    """Compute, per team, the share of its wins that were close games.

    Only the winner's side of each game is used: ``CloseWinPct`` is
    ``CloseWins / TotalWins``, i.e. the fraction of a team's wins decided by
    ``close_margin`` points or fewer. Losses are not part of the ratio, and
    teams without any win in ``df`` do not appear in the result.

    Parameters
    ----------
    df : pandas.DataFrame
        Game results with at least the columns ``WTeamID`` and ``WinMargin``.
    season_type : str
        Label copied into the ``SeasonType`` column of every output row.
    close_margin : int or float, default 5
        Largest margin of victory (inclusive) still counted as a close game.

    Returns
    -------
    pandas.DataFrame
        One row per winning team with the columns ``WTeamID``, ``TotalWins``,
        ``CloseWins``, ``CloseWinPct`` and ``SeasonType``.
    """
    # Wins decided by at most `close_margin` points, per team.
    close_wins = df[df["WinMargin"] <= close_margin].groupby("WTeamID").size().reset_index(name="CloseWins")

    # All wins per team (each row of df is one win for WTeamID).
    total_games = df.groupby("WTeamID").size().reset_index(name="TotalWins")

    # Left join keeps teams with no close win; their missing count becomes 0.
    close_win_pct = total_games.merge(close_wins, on="WTeamID", how="left").fillna(0)
    close_win_pct["CloseWinPct"] = close_win_pct["CloseWins"] / close_win_pct["TotalWins"]

    # Adding season type for reference
    close_win_pct["SeasonType"] = season_type
    return close_win_pct

# Per-team close-win share for each competition and season type.
men_close_win_pct_reg = compute_close_game_win_pct(
    df=df_MRegularSeasonCompactResults, season_type="Regular Season"
)
men_close_win_pct_tourney = compute_close_game_win_pct(
    df=df_MNCAATourneyCompactResults, season_type="Tournament"
)

women_close_win_pct_reg = compute_close_game_win_pct(
    df=df_WRegularSeasonCompactResults, season_type="Regular Season"
)
women_close_win_pct_tourney = compute_close_game_win_pct(
    df=df_WNCAATourneyCompactResults, season_type="Tournament"
)

# Preview the first rows of each table.
display(men_close_win_pct_reg.head(), men_close_win_pct_tourney.head())
display(women_close_win_pct_reg.head(), women_close_win_pct_tourney.head())

In [None]:
# Pair each team's regular-season row with its tournament row. The default
# inner join keeps only teams present in both tables; overlapping columns get
# the _Reg / _Tourney suffixes.

# Men
men_close_win_compare = pd.merge(
    men_close_win_pct_reg,
    men_close_win_pct_tourney,
    on="WTeamID",
    suffixes=("_Reg", "_Tourney"),
)

# Women
women_close_win_compare = pd.merge(
    women_close_win_pct_reg,
    women_close_win_pct_tourney,
    on="WTeamID",
    suffixes=("_Reg", "_Tourney"),
)

display(men_close_win_compare.head(), women_close_win_compare.head())

## Close Game Win% for Regular Season VS Tournament

In [None]:
def _plot_close_win_panel(panel_index, compare_df, panel_title):
    """Draw one panel: tournament vs. regular-season close-win share per team."""
    plt.subplot(1, 2, panel_index)
    sns.scatterplot(
        x=compare_df["CloseWinPct_Reg"],
        y=compare_df["CloseWinPct_Tourney"],
        alpha=0.7,
    )
    # Diagonal reference: points on it have the same share in both settings.
    plt.plot([0, 1], [0, 1], 'r--', label="1:1 Line")
    plt.xlabel("Regular Season Close Win %")
    plt.ylabel("Tournament Close Win %")
    plt.title(panel_title)
    plt.legend()


plt.figure(figsize=(14, 6))

_plot_close_win_panel(1, men_close_win_compare, "Men's Close Game Win %: Regular Season vs. Tournament")
_plot_close_win_panel(2, women_close_win_compare, "Women's Close Game Win %: Regular Season vs. Tournament")

# Show plots
plt.tight_layout()
plt.show()

### Insights from Close Game Win Percentage Analysis👆

**Men's and Women's Teams Show Similar Trends:**

* Both men's and women's teams exhibit a wide range of close game win percentages in both the regular season and tournaments.
* There's a noticeable cluster of teams with lower close game win percentages in both the regular season and the tournament, particularly for the women's data.

**Tournament Performance Doesn't Perfectly Mirror Regular Season:**

* While there's a general trend of teams with higher regular season close game win percentages also performing better in tournaments, it's not a perfect correlation.
* Many teams deviate from the 1:1 line, indicating that close game performance in the regular season doesn't guarantee similar success in the tournament.
* Some teams significantly underperform in the tournament compared to their regular season close game win percentage, and vice versa.

<div style="text-align: center; padding: 30px; border: 3px solid #007bff; border-radius: 15px; background-color: #f8f9fa; box-shadow: 5px 5px 10px #ddd;">

  <h2 style="color: #007bff; font-family: 'Arial', sans-serif;">Thank You!</h2>

  <p style="font-size: 18px; line-height: 1.6; color: #333; font-family: 'Verdana', sans-serif;">Thank you for staying with me to the end of this exploratory data analysis! I hope you found the insights helpful and the visualizations informative.</p>

  <p style="font-size: 16px; color: #555; font-family: 'Roboto', sans-serif;">If you have any thoughts on how this notebook could be improved, please share your feedback in the comments. I appreciate your input!</p>

  <p style="font-size: 16px; color: #555; font-family: 'Roboto', sans-serif;">If you're interested in seeing how this data can be used for prediction, stay tuned for my upcoming prediction notebook. And if you found this notebook valuable, please consider upvoting it. Your support is greatly appreciated!</p>
</div>