In [1]:
import numpy as np
import pandas as pd

import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
plt.style.use('fivethirtyeight')

## Load Data 

In [2]:
player_stats = pd.read_csv('player-game-stats.csv')

In [3]:
player_stats.head()

Unnamed: 0,gameId,gameDate,teamId,teamMarket,playerId,fullName,mins,ptsScored,reb,orb,drb,stl,blk,tov,fgm,fga,fgm2,fga2,fgm3,fga3
0,1977819,2021-11-13,103301,Arkansas,1687713,Chance Moore,1.93,0,0,0,0,0,0,0,0,0,0,0,0,0
1,1977819,2021-11-13,103301,Arkansas,950721,Trey Wade,16.23,5,2,1,1,0,1,0,2,4,1,2,1,2
2,1977819,2021-11-13,103301,Arkansas,942800,JD Notae,25.93,18,1,0,1,2,0,0,7,13,3,6,4,7
3,1977819,2021-11-13,103301,Arkansas,1596219,Khalen Robinson,9.72,3,0,0,0,1,0,1,1,3,0,1,1,2
4,1977819,2021-11-13,103301,Arkansas,1596216,Davonte Davis,29.0,10,3,1,2,2,0,2,4,10,2,7,2,3


### 250 minutes minimum

Use logical subsetting on dataframe to have qualified players

In [4]:
minutes_played_df = player_stats.groupby('fullName')[['mins']].agg(np.sum)
qualified = minutes_played_df[minutes_played_df['mins']>=250].reset_index()
qualified_players = qualified['fullName']
player_stats_qualified = player_stats[player_stats['fullName'].isin(qualified_players)]

## Q1 for one game 

In [5]:
player_stats_qualified['pts/min'] = player_stats_qualified['ptsScored'] / player_stats_qualified['mins']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  player_stats_qualified['pts/min'] = player_stats_qualified['ptsScored'] / player_stats_qualified['mins']


In [6]:
player_stats_qualified.sort_values('pts/min', ascending = False)['fullName'][0:3]

5504     Clarence Nadolny
24067          Alyn Breed
8329         Bijan Cortes
Name: fullName, dtype: object

## for season 

In [7]:
player_stats_qualified.groupby('fullName')[['pts/min']].agg(np.mean).sort_values('pts/min',ascending=False)[0:3]

Unnamed: 0_level_0,pts/min
fullName,Unnamed: 1_level_1
Keegan Murray,0.776261
Zach Edey,0.763278
Tari Eason,0.699757


## Q2

Group by Name and aggregate by sum to get full season stats. Then logical subset to find qualified players

In [8]:
#minimum 20 att
attempts_df = player_stats_qualified.groupby('fullName')[['fga3']].agg(np.sum)
qualified_att = attempts_df[attempts_df['fga3']>=20].reset_index()
qualified_players2 = qualified_att['fullName']
player_stats_qualified2 = player_stats_qualified[player_stats_qualified['fullName'].isin(qualified_players2)]

In [9]:
player_stats_qualified2['3pt%'] = (player_stats_qualified2['fgm3'] / player_stats_qualified2['fga3']) *100

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  player_stats_qualified2['3pt%'] = (player_stats_qualified2['fgm3'] / player_stats_qualified2['fga3']) *100


In [10]:
player_stats_qualified2.groupby('fullName')[['3pt%']].agg(np.mean).sort_values('3pt%',ascending=False)[0:3]

Unnamed: 0_level_0,3pt%
fullName,Unnamed: 1_level_1
Shane Dezonie,53.333333
Olivier Nkamhoua,53.205128
Harrison Prieto,51.388889


## Q3

aggregate rows using variance calculation

In [11]:
names_by_variance = player_stats_qualified2.groupby('fullName')[['3pt%']].agg(np.var).sort_values('3pt%',ascending=False)[0:3]
names_by_variance.rename(columns = {'3pt%' : '3pt var'})

Unnamed: 0_level_0,3pt var
fullName,Unnamed: 1_level_1
Brock Cunningham,2368.373275
Ethan Morton,2367.816092
Ahmad Rand,2344.771242


## Q4
group by team and game date and aggregate points by the max. Retrieve names

In [13]:
gpby_games=player_stats.groupby(['teamId','gameDate']).agg({'ptsScored':np.max,'fullName':'min'}).reset_index()
gpby_games

Unnamed: 0,teamId,gameDate,ptsScored,fullName
0,103257,2021-11-09,23,Brandon Newman
1,103257,2021-11-12,27,Brandon Newman
2,103257,2021-11-16,20,Brandon Newman
3,103257,2021-11-20,23,Brandon Newman
4,103257,2021-11-21,21,Brandon Newman
...,...,...,...,...
2561,104509,2022-03-15,14,Adam Kunkel
2562,104509,2022-03-20,16,Adam Kunkel
2563,104509,2022-03-22,16,Adam Kunkel
2564,104509,2022-03-29,18,Adam Kunkel


Now group by team to find number of team games.... then group by both team and name to find count of each player being lead scorer

In [14]:
gpby_teams=gpby_games.groupby('teamId').size().to_frame()
#gpby_teams.columns = ['# team games']

In [15]:
gpby_names_team = gpby_games.groupby(['fullName','teamId']).size().to_frame()
gpby_names_team

Unnamed: 0_level_0,Unnamed: 1_level_0,0
fullName,teamId,Unnamed: 2_level_1
A.J. Hoggard,103903,36
A.J. Reeves,104112,28
AJ Griffin,103549,39
AJ Wilson,104229,22
Aaron Cash,104320,37
...,...,...
Jaheam Cornwall,104078,1
Jamaine Mann,104399,7
Jordan Geronimo,103730,17
Jordan Wright,104399,2


Merge the two dataframes on teamId to easily access how many of a given team's games a player was the leading scorer

In [16]:
#pd.merge(left = gpby_names_team , right = gpby_teams, left_on = "teamId", right_on="teamId")
final_df = gpby_names_team.merge(gpby_teams, how="left", left_on = "teamId", right_index = True)


In [17]:
final_df = final_df.rename(columns = {'0_x' : 'times leading scorer', '0_y': '# team games'})

In [18]:
final_df['% games leading scorer'] = (final_df['times leading scorer'] / final_df['# team games'])*100 

In [19]:
final_df.sort_values('% games leading scorer', ascending=False)[0:3]

Unnamed: 0_level_0,Unnamed: 1_level_0,times leading scorer,# team games,% games leading scorer
fullName,teamId,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
A.J. Hoggard,103903,36,36,100.0
Alondes Williams,104413,35,35,100.0
Brandon Johnson,103525,31,31,100.0
