# Read the data from the file and understand the data set

In [None]:
# Load the NFL offensive statistics CSV into a pandas DataFrame.
# pandas is the analysis tool; pathlib assembles the file path.
import pandas as pd
from pathlib import Path

# Location of the CSV, resolved relative to the current working directory
file_path = (
    Path.cwd() / '..' / 'restricted' / 'assets' / 'nfl_stats' / 'nfl_offensive_stats.csv'
)

# Parse the CSV and rename the 'position ' column (note the trailing space)
# to 'position' so later cells can refer to it by the clean name.
df = pd.read_csv(file_path).rename(columns={'position ': 'position'})

# Preview the first 5 rows of the DataFrame
df.head()

In [None]:
# Add up pass_yds over every row whose player is "Aaron Rodgers"
rodgers_rows = df[df['player'].eq('Aaron Rodgers')]
sum_pass_yds = rodgers_rows['pass_yds'].sum()

# Report the total passing yards for Aaron Rodgers
print(f"Total passing yards for Aaron Rodgers: {sum_pass_yds}")

In [None]:
# Total pass_yds per player, restricted to rows whose position is 'QB',
# ranked from the highest total to the lowest.
qb_pass_yds = (
    df.loc[df['position'] == 'QB']
    .groupby('player')['pass_yds']
    .sum()
    .reset_index()
    .sort_values(by='pass_yds', ascending=False)
)

# Show the five highest totals
print("Top 5 players by passing yards (QB):")
print(qb_pass_yds.head())

In [None]:
# Bar chart of total passing yards for the quarterbacks with more than 4000
# passing yards.
# The chart is configured entirely through the pandas plotting API: the title,
# axis labels and tick rotation are passed to DataFrame.plot instead of being
# applied afterwards through pyplot's implicit "current axes" state.
import matplotlib.pyplot as plt

# Keep only the players whose summed passing yards exceed 4000
qb_pass_yds_filtered = qb_pass_yds[qb_pass_yds['pass_yds'] > 4000]

qb_pass_yds_filtered.plot(
    kind='bar',
    x='player',
    y='pass_yds',
    legend=False,
    title='Top QB Passing Yards (Over 4000 Yards)',
    xlabel='Player',
    ylabel='Passing Yards',
    rot=90,  # vertical player names so long labels do not overlap
)

# pyplot is used only to display the finished figure
plt.show()