In [2]:
import pandas as pd
import os
import glob

In [3]:
# Root folder whose subfolders contain the CSV exports.
main_path = r"C:\Users\vaugh\Desktop\basketball-pf-research"

# Recursively collect the full path of every CSV file under main_path.
all_files = glob.glob(os.path.join(main_path, "**/*.csv"), recursive=True)

# One dict per stat family, keyed by file name without directory or extension
# (e.g. '2014shooting', '2014advanced', '2017-18').
shooting_stats_dfs = {}
advanced_stats_dfs = {}
tracking_stats_dfs = {}

# Not populated in this cell; kept so that any later cell referring to it still works.
df_list = []

for csv_path in all_files:
    # The key and the category test use only the file's base name, but the file
    # itself must be read through its full path: the CSVs live in subfolders,
    # not in the notebook's working directory.
    stem = os.path.splitext(os.path.basename(csv_path))[0]
    frame = pd.read_csv(csv_path)  # parse each file exactly once

    if 'shooting' in stem:
        shooting_stats_dfs[stem] = frame
    elif 'advanced' in stem:
        advanced_stats_dfs[stem] = frame
    else:
        # Anything that is neither shooting nor advanced is treated as tracking data.
        tracking_stats_dfs[stem] = frame

In [9]:
import difflib

def _column_key(name):
    """Return a column label with all whitespace removed and case folded, for comparison."""
    return "".join(str(name).split()).casefold()


def _drop_named_columns(df, targets):
    """Drop, in place, every column of ``df`` whose normalised name equals a target's."""
    wanted = {_column_key(target) for target in targets}
    matched = [col for col in df.columns if _column_key(col) in wanted]
    df.drop(columns=matched, inplace=True)


def clean_tracking_dfs(df):
    """Clean one nba.com tracking-stats DataFrame in place.

    Steps, in order:
      1. Drop columns that are not needed.
      2. Rename the drive-related columns by appending ' - DRIVES'.
      3. Add per-drive rate columns ('FGA per drive', 'FTA per drive',
         'AST per drive', 'TOV per drive'), rounded to 3 decimals.
      4. Drop the remaining raw columns that the rates were derived from.

    Columns to drop are matched by name, ignoring whitespace and letter case.
    A target with no such match is skipped. Earlier versions used
    ``difflib.get_close_matches`` here, which silently removed *unrelated*
    columns whenever a target was absent (e.g. 'PTS%' resolved to 'PTS',
    'OREB Chances' to 'OREB Chance%').
    NOTE(review): if a raw column name differs from its target by more than
    whitespace/case it will now be left in the frame -- confirm against the
    raw CSV headers.

    The function is safe to call again on an already-cleaned frame: when all
    four per-drive columns already exist it returns without touching ``df``.

    Args:
        df: DataFrame to clean. Must contain 'DRIVES' plus the raw 'FGA',
            'FTA', 'AST' and 'TO' columns (unless it was already cleaned).

    Returns:
        None. ``df`` is modified in place.

    Raises:
        KeyError: if a column needed for the per-drive rates is missing.
    """
    # (new column, source column after the rename below)
    per_drive_specs = (
        ('FGA per drive', 'FGA - DRIVES'),
        ('FTA per drive', 'FTA - DRIVES'),
        ('AST per drive', 'AST - DRIVES'),
        ('TOV per drive', 'TO - DRIVES'),
    )

    # Re-run guard: the source columns are gone after a first pass, so a second
    # pass would otherwise fail half-way and leave the frame partially modified.
    if all(label in df.columns for label, _ in per_drive_specs):
        return

    # Drop unnecessary columns in the nba.com dataframes
    col_to_drop = ['Unnamed: 0', 'PTS%', 'FT%', 'Contested OREB', 'Contested OREB%', 'OREB Chances', 'Deferred OREB Chances', 'Adjusted OREB Chance%', 'AVG OREB Distance', 'Contested DREB', 'Contested DREB%', 'DREB Chances', 'Deferred DREB Chances', 'Adjusted DREB Chance%', 'AVG DREB Distance', 'DFGM', 'DFGA', 'DFG%']
    _drop_named_columns(df, col_to_drop)

    # Rename columns in the nba.com dataframes for better understanding
    df.rename(columns = {'FGM': 'FGM - DRIVES', 'FGA': 'FGA - DRIVES', 'FG%': 'FG% - DRIVES', 'FTM': 'FTM - DRIVES', 'FTA': 'FTA - DRIVES', 'PASS': 'PASS - DRIVES', 'PASS%': 'PASS% - DRIVES', 'AST': 'AST - DRIVES', 'AST%': 'AST% - DRIVES', 'TO': 'TO - DRIVES', 'TOV%': 'TOV% - DRIVES', 'PF': 'PF - DRIVES', 'PF%': 'PF% - DRIVES'}, inplace=True)

    # Express the drive counting stats as rates per drive.
    drives = df['DRIVES']
    for label, source in per_drive_specs:
        df[label] = (df[source] / drives).round(3)

    # Remove the remaining unnecessary stat columns, including the raw drive
    # columns now replaced by the per-drive rates.
    more_col_to_drop = ['W', 'L', 'PTS PER ELBOW TOUCH', 'PTS PER POST TOUCH', 'PTS PER PAINT TOUCH', 'FGM - DRIVES', 'FGA - DRIVES', 'FG% - DRIVES', 'FTM - DRIVES', 'FTA - DRIVES', 'PASS - DRIVES', 'AST - DRIVES', 'TO - DRIVES', 'PF - DRIVES', 'PF% - DRIVES', 'eFG%']
    _drop_named_columns(df, more_col_to_drop)

# Run the cleaning routine over every nba.com tracking-stat dataframe
for tracking_df in tracking_stats_dfs.values():
    clean_tracking_dfs(tracking_df)

In [11]:
# Inspect the tracking frame stored under the '2017-18' key.
tracking_stats_dfs['2017-18']

Unnamed: 0,PLAYER,TEAM,GP,MIN,PTS,TOUCHES,FRONT CT TOUCHES,TIME OF POSS,AVG SEC PER TOUCH,AVG DRIB PER TOUCH,...,OREB,OREB Chance%,DREB,DREB Chance%,STL,BLK,FGA per drive,FTA per drive,AST per drive,TOV per drive
0,Aaron Gordon,ORL,58,32.9,17.6,59.9,31.8,2.4,2.44,1.65,...,1.5,33.5,6.4,64.2,1.0,0.8,0.481,0.192,0.058,0.096
1,Abdel Nader,BOS,47,11.1,3.1,13.3,9.9,0.5,2.31,1.46,...,0.5,31.0,2.2,68.2,0.3,0.2,0.455,0.045,0.091,0.136
2,Adreian Payne,ORL,5,8.5,4.2,12.4,6.8,0.4,1.77,0.68,...,0.7,40.0,2.3,63.6,0.4,0.0,1.000,0.000,0.000,0.000
3,Al Horford,BOS,69,31.5,12.8,63.5,40.0,2.3,2.20,1.06,...,1.4,38.8,5.9,62.5,0.6,1.1,0.448,0.069,0.103,0.034
4,Al-Farouq Aminu,POR,69,30.0,9.3,36.0,19.2,1.2,1.97,1.00,...,1.4,37.2,6.2,63.7,1.1,0.6,0.387,0.129,0.065,0.097
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
241,Travis Wear,LAL,17,13.4,4.4,19.5,9.8,0.4,1.38,0.53,...,0.0,0.0,2.6,59.0,0.2,0.3,0.250,0.250,0.000,0.000
242,Treveon Graham,CHA,63,16.7,4.3,19.5,11.7,0.5,1.68,0.83,...,0.8,38.2,1.5,44.6,0.5,0.0,0.444,0.167,0.111,0.000
243,Trevor Ariza,HOU,67,33.9,11.7,38.0,26.5,1.1,1.80,0.87,...,0.5,19.1,4.0,62.3,1.5,0.2,0.371,0.086,0.114,0.057
244,Trevor Booker,IND,67,17.1,6.4,34.0,19.3,0.9,1.66,0.60,...,1.8,40.5,3.1,54.1,0.4,0.3,0.615,0.077,0.077,0.077


In [15]:
# Drop the unnecessary 'Unnamed: 0' column from the advanced and shooting dataframes
# (presumably the index written out when the CSVs were saved -- confirm).
# errors='ignore' makes this cell safe to re-run: once the column is gone,
# a plain drop would raise KeyError.
for stats_dfs in (advanced_stats_dfs, shooting_stats_dfs):
    for stats_df in stats_dfs.values():
        stats_df.drop(columns=['Unnamed: 0'], inplace=True, errors='ignore')

In [16]:
# List the column labels of the shooting frame stored under the '2014shooting' key.
shooting_stats_dfs['2014shooting'].columns

Index(['Player', 'G', 'MP', 'FG%', 'Avg FG Distance',
       '% of FGA by Distance - 2P', '% of FGA by Distance - 0-3',
       '% of FGA by Distance - 3-10', '% of FGA by Distance - 10-16',
       '% of FGA by Distance - 16-3P', '% of FGA by Distance - 3P',
       'FG% by Distance - 2P', 'FG% by Distance - 0-3',
       'FG% by Distance - 3-10', 'FG% by Distance - 10-16',
       'FG% by Distance - 16-3P', 'FG% by Distance - 3P', '% of FG Ast'd - 2P',
       '% of FG Ast'd - 3P', 'Corner 3s - %3PA', 'Corner 3s - 3P%'],
      dtype='object')

In [17]:
# Inspect the advanced-stats frame stored under the '2014advanced' key.
advanced_stats_dfs['2014advanced']

Unnamed: 0,Player,G,MP,TS%,3PAr,FTr,USG%
0,Al Harrington,34,511,0.506,0.483,0.169,22.7
1,Amir Johnson,77,2214,0.592,0.108,0.234,16.4
2,Andray Blatche,73,1618,0.532,0.080,0.328,25.6
3,Andrei Kirilenko,45,857,0.532,0.031,0.744,14.6
4,Andrew Nicholson,76,1174,0.489,0.212,0.136,19.3
...,...,...,...,...,...,...,...
93,Trevor Booker,72,1553,0.563,0.005,0.189,14.4
94,Tristan Thompson,82,2594,0.528,0.001,0.445,17.5
95,Tyler Hansbrough,64,978,0.562,0.010,0.944,15.1
96,Udonis Haslem,46,653,0.523,0.000,0.301,13.9
