#### Which traditional stat is most important?
* Data Needed: Player name, points, assists, rebounds, FG%, team margin

* Find correlation between winning and an imputed stats (points, assists, rebounds, FG%).
  Scatter Plot

In [25]:
# dependencies
import requests
import json
import os
import pandas as pd
import numpy as np

In [26]:
# consuming data file
filepath = os.path.join('nba_season_stats_kaggle.csv')

tradStats_df = pd.read_csv(filepath)
tradStats_df.head()

Unnamed: 0.1,Unnamed: 0,Year,Player,Pos,Age,Tm,G,GS,MP,PER,...,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS
0,0,1950.0,Curly Armstrong,G-F,31.0,FTW,63.0,,,,...,0.705,,,,176.0,,,,217.0,458.0
1,1,1950.0,Cliff Barker,SG,29.0,INO,49.0,,,,...,0.708,,,,109.0,,,,99.0,279.0
2,2,1950.0,Leo Barnhorst,SF,25.0,CHS,67.0,,,,...,0.698,,,,140.0,,,,192.0,438.0
3,3,1950.0,Ed Bartels,F,24.0,TOT,15.0,,,,...,0.559,,,,20.0,,,,29.0,63.0
4,4,1950.0,Ed Bartels,F,24.0,DNN,13.0,,,,...,0.548,,,,20.0,,,,27.0,59.0


### Data Cleansing and Checks

In [27]:
# check all columns and names
tradStats_df.columns

Index(['Unnamed: 0', 'Year', 'Player', 'Pos', 'Age', 'Tm', 'G', 'GS', 'MP',
       'PER', 'TS%', '3PAr', 'FTr', 'ORB%', 'DRB%', 'TRB%', 'AST%', 'STL%',
       'BLK%', 'TOV%', 'USG%', 'blanl', 'OWS', 'DWS', 'WS', 'WS/48', 'blank2',
       'OBPM', 'DBPM', 'BPM', 'VORP', 'FG', 'FGA', 'FG%', '3P', '3PA', '3P%',
       '2P', '2PA', '2P%', 'eFG%', 'FT', 'FTA', 'FT%', 'ORB', 'DRB', 'TRB',
       'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS'],
      dtype='object')

In [28]:
# checking counts for each year
tradStats_df['Year'].value_counts()

2015.0    651
2011.0    625
2014.0    611
2008.0    595
2017.0    595
         ... 
1959.0    110
1955.0    109
1957.0    107
1956.0    106
1961.0    104
Name: Year, Length: 68, dtype: int64

In [29]:
# reducing the dataset fields to only ibnterested columns
reduced_df = tradStats_df.loc[:, ['Player', 'Year','PTS', 'AST', 'ORB', 'DRB', 'TRB', 'FG', 'FG%']]
reduced_df

Unnamed: 0,Player,Year,PTS,AST,ORB,DRB,TRB,FG,FG%
0,Curly Armstrong,1950.0,458.0,176.0,,,,144.0,0.279
1,Cliff Barker,1950.0,279.0,109.0,,,,102.0,0.372
2,Leo Barnhorst,1950.0,438.0,140.0,,,,174.0,0.349
3,Ed Bartels,1950.0,63.0,20.0,,,,22.0,0.256
4,Ed Bartels,1950.0,59.0,20.0,,,,21.0,0.256
...,...,...,...,...,...,...,...,...,...
24686,Cody Zeller,2017.0,639.0,99.0,135.0,270.0,405.0,253.0,0.571
24687,Tyler Zeller,2017.0,178.0,42.0,43.0,81.0,124.0,78.0,0.494
24688,Stephen Zimmerman,2017.0,23.0,4.0,11.0,24.0,35.0,10.0,0.323
24689,Paul Zipser,2017.0,240.0,36.0,15.0,110.0,125.0,88.0,0.398


In [32]:
# only pick 10 years of dataset from 2007 to 2017
tenYears_df = reduced_df.loc[reduced_df['Year'] >= 2007,:]
tenYears_df

Unnamed: 0,Player,Year,PTS,AST,ORB,DRB,TRB,FG,FG%
18226,Shareef Abdur-Rahim,2007.0,793.0,109.0,122.0,276.0,398.0,310.0,0.474
18227,Hassan Adams,2007.0,174.0,13.0,35.0,42.0,77.0,75.0,0.556
18228,Maurice Ager,2007.0,69.0,7.0,1.0,20.0,21.0,22.0,0.314
18229,LaMarcus Aldridge,2007.0,565.0,24.0,144.0,168.0,312.0,241.0,0.503
18230,Malik Allen,2007.0,242.0,16.0,42.0,77.0,119.0,107.0,0.415
...,...,...,...,...,...,...,...,...,...
24686,Cody Zeller,2017.0,639.0,99.0,135.0,270.0,405.0,253.0,0.571
24687,Tyler Zeller,2017.0,178.0,42.0,43.0,81.0,124.0,78.0,0.494
24688,Stephen Zimmerman,2017.0,23.0,4.0,11.0,24.0,35.0,10.0,0.323
24689,Paul Zipser,2017.0,240.0,36.0,15.0,110.0,125.0,88.0,0.398


In [34]:
# checking data counts
tenYears_df.count()

Player    6455
Year      6455
PTS       6455
AST       6455
ORB       6455
DRB       6455
TRB       6455
FG        6455
FG%       6422
dtype: int64

In [35]:
# checking data types
tenYears_df.dtypes

Player     object
Year      float64
PTS       float64
AST       float64
ORB       float64
DRB       float64
TRB       float64
FG        float64
FG%       float64
dtype: object

In [36]:
# converting Year float type to object then checking afterwards
tenYears_df['Year'] = tenYears_df['Year'].astype(object)
tenYears_df.dtypes

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tenYears_df['Year'] = tenYears_df['Year'].astype(object)


Player     object
Year       object
PTS       float64
AST       float64
ORB       float64
DRB       float64
TRB       float64
FG        float64
FG%       float64
dtype: object

In [37]:
# check NaN values before replacing it with 0
tenYears_df.isnull().values.any()

True

In [46]:
# replace NaN with 0 values and check afterwards
tenYears_df = tenYears_df.replace(np.NaN,0)
tenYears_df.isnull().values.any()

False

In [52]:
tenYears_df.reset_index(drop=True)

Unnamed: 0,Player,Year,PTS,AST,ORB,DRB,TRB,FG,FG%
0,Shareef Abdur-Rahim,2007.0,793.0,109.0,122.0,276.0,398.0,310.0,0.474
1,Hassan Adams,2007.0,174.0,13.0,35.0,42.0,77.0,75.0,0.556
2,Maurice Ager,2007.0,69.0,7.0,1.0,20.0,21.0,22.0,0.314
3,LaMarcus Aldridge,2007.0,565.0,24.0,144.0,168.0,312.0,241.0,0.503
4,Malik Allen,2007.0,242.0,16.0,42.0,77.0,119.0,107.0,0.415
...,...,...,...,...,...,...,...,...,...
6450,Cody Zeller,2017.0,639.0,99.0,135.0,270.0,405.0,253.0,0.571
6451,Tyler Zeller,2017.0,178.0,42.0,43.0,81.0,124.0,78.0,0.494
6452,Stephen Zimmerman,2017.0,23.0,4.0,11.0,24.0,35.0,10.0,0.323
6453,Paul Zipser,2017.0,240.0,36.0,15.0,110.0,125.0,88.0,0.398


In [64]:
dfgrouped = tenYears_df.groupby(['Player'])['PTS', 'AST', 'ORB', 'DRB', 'TRB', 'FG', 'FG%'].sum().reset_index()
dfgrouped

  dfgrouped = tenYears_df.groupby(['Player'])['PTS', 'AST', 'ORB', 'DRB', 'TRB', 'FG', 'FG%'].sum().reset_index()


Unnamed: 0,Player,PTS,AST,ORB,DRB,TRB,FG,FG%
0,A.J. Hammons,48.0,4.0,8.0,28.0,36.0,17.0,0.405
1,A.J. Price,1656.0,613.0,74.0,335.0,409.0,600.0,3.197
2,Aaron Brooks,7839.0,2485.0,323.0,1037.0,1360.0,2829.0,6.047
3,Aaron Gordon,1981.0,311.0,316.0,765.0,1081.0,760.0,1.374
4,Aaron Gray,1235.0,251.0,518.0,889.0,1407.0,509.0,5.596
...,...,...,...,...,...,...,...,...
1205,Zach LaVine,2817.0,666.0,72.0,529.0,601.0,1045.0,1.333
1206,Zach Randolph,14168.0,1578.0,2592.0,5426.0,8018.0,5647.0,6.114
1207,Zaza Pachulia,5211.0,1031.0,1710.0,2854.0,4564.0,1839.0,5.210
1208,Zoran Dragic,56.0,10.0,10.0,6.0,16.0,22.0,1.026


In [66]:
# saving output
dfgrouped.to_csv("output/traditional_stats_clean2.csv", encoding="utf-8", index=True, header=True)