In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

In [None]:
# Load the raw player table from People.csv (path is relative to the working directory)
# and render it as the cell output.
players_info = pd.read_csv("People.csv")
players_info

In [None]:
# Load the raw salary table from Salaries.csv (path is relative to the working directory).
salary_data = pd.read_csv("Salaries.csv")

# Render the frame as the cell output for a quick visual check.
salary_data

In [None]:
# Inspect which columns the player table offers before choosing the ones to keep.
players_info.columns.unique()

In [None]:
# Load the raw player-awards table from AwardsPlayers.csv (path is relative to the working
# directory) and render it as the cell output.
player_awards = pd.read_csv("AwardsPlayers.csv")
player_awards

In [None]:
# Load the raw batting table from Batting.csv (path is relative to the working directory)
# and list its columns.
batting = pd.read_csv("Batting.csv")
batting.columns

In [None]:
# Build the master analysis table used by every later cell.
#
# Each source frame is first trimmed to the columns the analysis needs, then joined:
#   1. bio columns  x salary records   (inner, on playerID)
#   2. ...          x awards           (left,  on playerID / lgID / yearID)
#   3. ...          x batting records  (inner, on playerID / lgID / teamID / yearID)
#
# NOTE: step 2 fans rows out. A player-season with k matching awards appears k times,
# each copy carrying the same salary, H and AB. Any later sum or mean over those
# columns must de-duplicate per season first.
player_awards = player_awards[["playerID", "awardID", "yearID", "lgID"]]

# `players_info` is overwritten below with the merged result. drop_duplicates() collapses
# the bio columns back to one row per distinct bio record, so re-running this cell does
# not multiply rows against the already-merged frame.
players_info = players_info[
    ["playerID", "weight", "height", "bats", "birthCountry", "nameFirst", "nameLast"]
].drop_duplicates()
batting = batting[["playerID", "yearID", "lgID", "teamID", "H", "AB"]]

# Inner join: players without any salary record are dropped.
players_info = pd.merge(players_info, salary_data, on="playerID")
# Left join: seasons without an award are kept, with awardID = NaN.
players_info = pd.merge(players_info, player_awards, on=["playerID", "lgID", "yearID"], how="left")
# Inner join: salary rows without a matching batting record are dropped.
players_info = pd.merge(players_info, batting, on=["playerID", "lgID", "teamID", "yearID"])
players_info

In [None]:
# Award-winning rows for players born in Puerto Rico: filter on birth country, then
# discard the rows that carry no award.
born_in_puerto_rico = players_info["birthCountry"] == "P.R."
PR_players = players_info.loc[born_in_puerto_rico].dropna(subset=["awardID"])
display(PR_players.head(3))

# Rank those players by how many award rows each one has, most awarded first.
awards_per_player = PR_players.groupby(["playerID"])["awardID"].count()
PR_players_based_on_awards = awards_per_player.reset_index().sort_values(
    by="awardID", ascending=False
)
PR_players_based_on_awards.head(3)

In [None]:
# Profile the most-awarded Puerto Rican player: awards per year and salary per year,
# each drawn as a line chart (top row) and a bar chart (bottom row).

# PR_players_based_on_awards is sorted by award count, descending, so row 0 is the top
# player. Select the column first and use .iloc for the positional lookup; `.iloc[0][0]`
# relied on integer-position fallback on a label-indexed Series, which pandas deprecates.
player_with_highest_number_of_awards = PR_players_based_on_awards["playerID"].iloc[0]

# Filter once and reuse the result for both the award and the salary summaries.
awards_per_year = PR_players[PR_players.playerID == player_with_highest_number_of_awards]
award_info = awards_per_year.groupby(["yearID"])["awardID"].count().reset_index()

# PR_players holds one row per award, so a season with k awards repeats that season's
# salary k times. Keep one row per season/team/league before summing, otherwise the
# plotted salary is multiplied by the number of awards won that year.
# Only seasons in which the player won an award are present in PR_players.
salary_info = (
    awards_per_year
    .drop_duplicates(subset=["yearID", "teamID", "lgID"])
    .groupby(["yearID"])["salary"]
    .sum()
    .reset_index()
)

# One tick per calendar year across each series' span.
award_year_ticks = np.arange(award_info["yearID"].min(), award_info["yearID"].max() + 1, 1)
salary_year_ticks = np.arange(salary_info["yearID"].min(), salary_info["yearID"].max() + 1, 1)

fig, axs = plt.subplots(2, 2, figsize=(15, 10))

axs[0, 0].plot(award_info["yearID"], award_info["awardID"])
axs[0, 0].set_title('Award Number')
axs[0, 0].set_xticks(award_year_ticks)
# Award counts are integers, so force whole-number y ticks on the line chart.
axs[0, 0].set_yticks(np.arange(award_info["awardID"].min(), award_info["awardID"].max() + 1, 1))

axs[0, 1].plot(salary_info["yearID"], salary_info["salary"])
axs[0, 1].set_title('Salary')
axs[0, 1].set_xticks(salary_year_ticks)

axs[1, 0].bar(award_info["yearID"], award_info["awardID"], color='green')
axs[1, 0].set_title('Award Number')
axs[1, 0].set_xticks(award_year_ticks)

axs[1, 1].bar(salary_info["yearID"], salary_info["salary"], color='blue')
axs[1, 1].set_title('Salary')
axs[1, 1].set_xticks(salary_year_ticks)

# Year labels overlap when horizontal; tilt them on every panel.
for panel in axs.flat:
    panel.tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()

In [None]:
# Relate each player's body-mass index to their aggregate batting average (H / AB).

# players_info repeats a season once per award won that season. Keep each
# player/season/team/league once so hits and at-bats are not counted multiple times.
BMI_impact_on_run = players_info.drop_duplicates(subset=["playerID", "yearID", "teamID", "lgID"])
BMI_impact_on_run = BMI_impact_on_run[["playerID", "teamID", "weight", "height", "H", "AB"]]

# Drop rows missing any input of the BMI or batting-average calculation.
BMI_impact_on_run = BMI_impact_on_run.dropna(subset=["H", "AB", "height", "playerID", "weight"])

# Total hits and at-bats per player across all retained seasons.
BMI_impact_on_run = BMI_impact_on_run.groupby(["playerID", "weight", "height"]).agg({"H": "sum", "AB": "sum"}).reset_index()

# 703 is the factor of the imperial BMI formula.
# NOTE(review): assumes weight is in pounds and height in inches - confirm against the data source.
BMI_impact_on_run['BMI'] = (BMI_impact_on_run.weight * 703) / (BMI_impact_on_run.height)**2
BMI_impact_on_run['battingAvg'] = (BMI_impact_on_run.H / BMI_impact_on_run.AB)

# Players with zero at-bats yield 0/0 = NaN; they have no batting average to plot.
BMI_impact_on_run = BMI_impact_on_run.dropna(subset=["battingAvg"])
display(BMI_impact_on_run)

# `s` sets the marker size. `size=` is a seaborn grouping variable and would add a
# meaningless one-entry legend instead of shrinking the points.
sns.scatterplot(x="BMI", y="battingAvg", data=BMI_impact_on_run, s=8)
plt.xlabel("BMI")
plt.ylabel("Batting average")
plt.title("Batting average vs. BMI")
plt.show()

In [None]:
# List the distinct league ids in the salary table; the cells below filter on "NL" and "AL".
salary_data.lgID.unique()

In [None]:
# Split the seasons after 2012 into one frame per league.
recent_seasons = players_info[players_info["yearID"] > 2012]
national_league = recent_seasons[recent_seasons["lgID"] == "NL"]
american_league = recent_seasons[recent_seasons["lgID"] == "AL"]

In [None]:
# Compare the average salary per batting style (B / L / R) between the two leagues.
categories = ["B", "L", "R"]

# The league frames repeat a season once per award won that season. Keep each
# player/season/team/league once so award winners are not over-weighted in the mean.
NL_bats_salary = (
    national_league
    .drop_duplicates(subset=["playerID", "yearID", "teamID", "lgID"])
    .groupby("bats")["salary"]
    .mean()
)
AL_bats_salary = (
    american_league
    .drop_duplicates(subset=["playerID", "yearID", "teamID", "lgID"])
    .groupby("bats")["salary"]
    .mean()
)
display(NL_bats_salary)
display(AL_bats_salary)

# Look the means up by label, in a fixed order. Integer indexing on a string-indexed
# Series is deprecated positional fallback, and it breaks if a league lacks one of the
# styles. reindex() yields NaN for a missing style instead.
NL_bats_salary = NL_bats_salary.reindex(categories).tolist()
AL_bats_salary = AL_bats_salary.reindex(categories).tolist()


data = {
    'Bats': ['National League'] * len(categories) + ['American League'] * len(categories),
    'Batting_Style': categories * 2,
    'Salary': NL_bats_salary + AL_bats_salary
}

# Create the bar plot using Seaborn (explicit DataFrame keeps the long-form layout unambiguous)
sns.barplot(x='Bats', y='Salary', hue='Batting_Style', data=pd.DataFrame(data))

# Customize the plot
plt.xlabel('League')
plt.ylabel('Salary')
plt.title('Comparison of Average Salaries by Batting Style and League')
# Anchor the legend outside the axes so it does not cover the bars.
plt.legend(title='Batting Style', bbox_to_anchor=(1, 1), loc='upper left')
# Show the plot
plt.show()


In [None]:
# Rebind the per-league frames to cover every season (no year filter this time).
league_id = players_info["lgID"]
national_league = players_info.loc[league_id.eq("NL")]
american_league = players_info.loc[league_id.eq("AL")]

In [None]:
# Reduce each league frame to the (year, salary) pairs used for the yearly totals.
# The league frames repeat a season once per award won that season, so de-duplicate per
# player/season/team/league *before* dropping the key columns. Otherwise the per-year
# salary sums would count award winners' salaries several times.
national_league_spent_money = national_league.drop_duplicates(
    subset=["playerID", "yearID", "teamID", "lgID"]
)[["yearID", "salary"]]
american_league_spent_money = american_league.drop_duplicates(
    subset=["playerID", "yearID", "teamID", "lgID"]
)[["yearID", "salary"]]


In [None]:
# Render the National League (yearID, salary) rows for a visual check.
national_league_spent_money

In [None]:
# Render the American League (yearID, salary) rows for a visual check.
american_league_spent_money

In [None]:
# Total salary per year for each league, returned as a flat frame with the columns
# `yearID` and `salary` (as_index=False keeps the group key as a regular column).
national_league_spent_money = national_league_spent_money.groupby("yearID", as_index=False)["salary"].sum()
american_league_spent_money = american_league_spent_money.groupby("yearID", as_index=False)["salary"].sum()

In [None]:
# Plot the yearly salary totals of both leagues on shared axes.
# The frames are relabelled in place so each league has its own value column.
national_league_spent_money.columns = ["Year", "NA_spent_salary"]
american_league_spent_money.columns = ["Year", "AL_spent_salary"]

# (frame, value column, line colour, legend label) for each league, drawn in this order.
league_lines = (
    (national_league_spent_money, "NA_spent_salary", "red", "NL"),
    (american_league_spent_money, "AL_spent_salary", "blue", "AL"),
)
for league_totals, value_column, line_color, league_label in league_lines:
    sns.lineplot(x="Year", y=value_column, data=league_totals, color=line_color, label=league_label)

plt.xlabel("Year")
plt.ylabel("The Amount of money spent per year")
plt.show()

In [None]:
# Salary records for seasons 1990 through 2013.
# NOTE(review): the upper bound is exclusive, so 2014 itself is NOT included despite the
# variable name - confirm which range is intended.
in_year_range = (salary_data.yearID >= 1990) & (salary_data.yearID < 2014)

# reset_index returns a new frame; the result must be assigned or the call does nothing.
# drop=True renumbers the rows from 0 without adding the old index as a column.
among_years_1990_2014 = salary_data[in_year_range].reset_index(drop=True)
among_years_1990_2014

In [None]:
# Rows where the salary exceeds one million.
earns_over_a_million = players_info["salary"].gt(1000000)
expensive_players = players_info.loc[earns_over_a_million]

# Of those, keep the rows whose birth country is anything other than "USA".
born_outside_usa = expensive_players["birthCountry"].ne("USA")
expensive_players_not_USA = expensive_players.loc[born_outside_usa]

expensive_players_not_USA

In [None]:
# Count each high-earning non-US player once, then tally players per birth country.
unique_non_US_player = expensive_players_not_USA.drop_duplicates(subset=["playerID", "birthCountry"])

# value_counts() orders countries from most to fewest players; name the two output
# columns while flattening the result into a frame.
countries_count = (
    unique_non_US_player["birthCountry"]
    .value_counts()
    .rename_axis("Countries")
    .reset_index(name="Count")
)
countries_count

In [None]:
# Show the per-country player counts twice, side by side: a bar chart and a pie chart.
fig, axs = plt.subplots(1, 2, figsize=(25, 10))
bar_ax, pie_ax = axs

country_names = countries_count["Countries"]
player_counts = countries_count["Count"]

bar_ax.bar(country_names, player_counts)
bar_ax.tick_params(axis='x', rotation=45)  # tilt the country labels so they do not overlap

pie_ax.pie(player_counts, labels=country_names)

plt.tight_layout()
plt.show()