In [1]:
import pandas as pd
import math
import numpy as np
import itertools

In [2]:
#statsDF - initial stats dataframe
#gamesDF - initial games dataframe
#newGamesDF - formatted games dataframe with game date as index and rest of record in one row
#finalStats - formatted stats dataframe with player's name as the index
#mainDF: The dataframe where the main analysis will take place.

In [3]:
stats = '/home/acer/github/nba/stats.csv'
games = '/home/acer/github/nba/games.csv'

In [165]:
#Dump the stats dataframe into a csv and clean it up
cols = ['PLAYER', 'MIN', 'FGM','FGA', 'FG%', '3PM', '3PA','3P%','FTM','FTA','FT%','OREB','DREB','REB','AST','TOV','STL','BLK','PF','PTS','+/-']
statsDF = pd.read_csv(stats, header= None, names=cols, index_col=False, error_bad_lines=False, delim_whitespace=True)
statsDF[cols] = statsDF[cols].replace({'\$': '', ',': ''}, regex=True)
statsDF['PLAYER'] = statsDF['PLAYER'].str.replace('.','')
statsDF['PLAYER'] = statsDF['PLAYER'].str.replace('-','')
statsDF['MIN'] = statsDF['MIN'].replace({'\$': '', '-': ''}, regex=True)
#Drop the first line for the team name
statsDF.drop(0,inplace=True)

In [166]:
#The nba.com site changed its formatting so this is to remove non-existent rows

new_format_mask = 0
for index, row in statsDF.iterrows():
    if new_format_mask == 1:
        statsDF.drop(index, inplace=True)
        if row.iloc[0] == 'Totals:' and math.isnan(row.iloc[1]) == True:
            new_format_mask = 0 
        else:
            continue
    else:
        if row.iloc[0] == 'PLAYER' and row.iloc[1] != "MIN":
            new_format_mask += 1
        else:
            continue

for index, row in statsDF.iterrows():
    if row.iloc[0][3:4] == ":":
        statsDF.drop(index, inplace=True)

    

In [167]:
statsDF = statsDF[statsDF.PLAYER != 'PLAYER']
statsDF = statsDF[statsDF.PLAYER != 'Totals:']
statsDF = statsDF[statsDF.PLAYER != 'INACTIVE']
statsDF = statsDF.reset_index(drop=True)

In [168]:
for index, row in statsDF.iterrows():
    try: 
        if row.iloc[1].isalpha() and statsDF.iloc[index + 1][1].isalpha():
            home_team_length = index/2
            statsDF.drop(index, inplace=True)
            statsDF = statsDF.reset_index(drop=True)
            break
    except:
        continue

In [170]:
#Setting up the index on the main stats dataframe
mainDFIndex = []
for index, row in statsDF.iterrows():
    if row.iloc[0].isalpha() == True and row.iloc[1].isalpha() == True:
        mainDFIndex.append(str(row['PLAYER']) + " " + str(row['MIN']))
print(mainDFIndex)

['Maurice Harkless', 'AlFarouq Aminu', 'Jusuf Nurkic', 'CJ McCollum', 'Damian Lillard', 'Pat Connaughton', 'Evan Turner', 'Ed Davis', 'Shabazz Napier', 'Zach Collins', 'Jake Layman', 'Meyers Leonard', 'Caleb Swanigan', 'Khris Middleton', 'Giannis Antetokounmpo', 'Thon Maker', 'Tony Snell', 'Malcolm Brogdon', 'DeAndre Liggins', 'Mirza Teletovic', 'Greg Monroe', 'Matthew Dellavedova', 'Jason Terry', 'John Henson', 'Sterling Brown', 'DJ Wilson']


In [176]:
#Setup dicts from the DF
statsDF_Dict = statsDF.T.to_dict().values()

In [183]:
dict_list = []
#Iterate through the statsDF and only return lines with stats or lines where players sat
for index, i in enumerate(statsDF_Dict):
    if index == 1 or index % 2 != 0:
        dict_list.append(i)
    

In [185]:
dict_list

[{'+/-': nan,
  '3P%': '1',
  '3PA': '50.0',
  '3PM': '4',
  'AST': '1',
  'BLK': '3',
  'DREB': '3',
  'FG%': '2',
  'FGA': '50.0',
  'FGM': '10',
  'FT%': '0',
  'FTA': '50.0',
  'FTM': '2',
  'MIN': '5',
  'OREB': '3',
  'PF': '13',
  'PLAYER': '37:50',
  'PTS': '1',
  'REB': '0',
  'STL': '0',
  'TOV': '3'},
 {'+/-': nan,
  '3P%': '2',
  '3PA': '33.3',
  '3PM': '3',
  'AST': '0',
  'BLK': '1',
  'DREB': '6',
  'FG%': '1',
  'FGA': '14.3',
  'FGM': '7',
  'FT%': '1',
  'FTA': '100',
  'FTM': '2',
  'MIN': '1',
  'OREB': '5',
  'PF': '5',
  'PLAYER': '25:35',
  'PTS': '-1',
  'REB': '2',
  'STL': '1',
  'TOV': '0'},
 {'+/-': nan,
  '3P%': '3',
  '3PA': '0.0',
  '3PM': '1',
  'AST': '2',
  'BLK': '1',
  'DREB': '11',
  'FG%': '0',
  'FGA': '41.2',
  'FGM': '17',
  'FT%': '5',
  'FTA': '75.0',
  'FTM': '4',
  'MIN': '7',
  'OREB': '6',
  'PF': '17',
  'PLAYER': '31:58',
  'PTS': '-7',
  'REB': '3',
  'STL': '3',
  'TOV': '0'},
 {'+/-': nan,
  '3P%': '5',
  '3PA': '50.0',
  '3PM': '6',


In [187]:
finalStats = pd.DataFrame(dict_list)
finalStats.index = mainDFIndex
finalStats.drop(['+/-'], axis=1, inplace=True)
finalStats = finalStats.rename(columns={'PLAYER':'MIN','MIN':'FGM', 'FGM':'FGA', 'FGA':'FG%', 'FG%':'3PM', '3PM':'3PA', '3PA':'3P%', '3P%':'FTM', 'FTM':'FTA', 'FTA':'FT%', 'FT%':'OREB', 'OREB':'DREB', 'DREB':'REB', 'REB':'AST', 'AST':'TOV', 'TOV':'BLK', 'BLK':'PF', 'PF':'PTS', 'PTS':'+/-'})
#This will replace the nan's with DNP
finalStats = finalStats.fillna('NA')

In [188]:
finalStats

Unnamed: 0,FTM,3P%,3PA,TOV,PF,REB,3PM,FG%,FGA,OREB,FT%,FTA,FGM,DREB,PTS,MIN,+/-,AST,STL,BLK
Maurice Harkless,1.0,50.0,4.0,1.0,3.0,3.0,2.0,50.0,10,0.0,50.0,2.0,5.0,3.0,13.0,37:50,1.0,0.0,0.0,3.0
AlFarouq Aminu,2.0,33.3,3.0,0.0,1.0,6.0,1.0,14.3,7,1.0,100.0,2.0,1.0,5.0,5.0,25:35,-1.0,2.0,1.0,0.0
Jusuf Nurkic,3.0,0.0,1.0,2.0,1.0,11.0,0.0,41.2,17,5.0,75.0,4.0,7.0,6.0,17.0,31:58,-7.0,3.0,3.0,0.0
CJ McCollum,5.0,50.0,6.0,2.0,2.0,3.0,3.0,50.0,18,0.0,100.0,5.0,9.0,3.0,26.0,37:03,1.0,1.0,0.0,2.0
Damian Lillard,11.0,50.0,6.0,6.0,5.0,5.0,3.0,40.0,15,2.0,91.7,12.0,6.0,3.0,26.0,34:27,-5.0,4.0,0.0,4.0
Pat Connaughton,0.0,33.3,3.0,2.0,1.0,0.0,1.0,25.0,4,0.0,0.0,0.0,1.0,0.0,3.0,18:31,-2.0,1.0,0.0,0.0
Evan Turner,4.0,25.0,4.0,2.0,5.0,2.0,1.0,42.9,7,0.0,100.0,4.0,3.0,2.0,11.0,32:27,-4.0,7.0,0.0,1.0
Ed Davis,4.0,0.0,0.0,1.0,4.0,11.0,0.0,100,1,4.0,100.0,4.0,1.0,7.0,6.0,16:02,4.0,0.0,2.0,0.0
Shabazz Napier,1.0,0.0,1.0,0.0,0.0,1.0,0.0,50.0,2,1.0,50.0,2.0,1.0,0.0,3.0,6:07,-2.0,0.0,0.0,0.0
Zach Collins,,,,,,,,DECISION,COACH'S,,,,,,,DNP,,,,


In [189]:
#GAMES CELL: This prepares the games section into a dataframe
#Passing in csv file and getting rid of the unused rows
gamesDF = pd.read_csv(games, header= None, names=['a','b','c','d','e','f','g','h','i'], index_col=False, error_bad_lines=False, delim_whitespace=True)
gamesDF.columns = [col.replace(',', '') for col in gamesDF.columns]
#if you want to operate on multiple columns, put them in a list like so:
games_cols = ['a','b','c','d','e','f','g','h','i']
# pass them to df.replace(), specifying each char and it's replacement:
gamesDF[games_cols] = gamesDF[games_cols].replace({'\$': '', ',': ''}, regex=True)
# This will add the winner to column b of each 26th row
for index, row in gamesDF.iterrows():
    if index % 26 == 0 or index == 0:
        if int(gamesDF['a'].iloc[index+4]) > int(gamesDF['a'].iloc[index + 8]):
            gamesDF['b'].iloc[index] = gamesDF['a'].iloc[index + 1]
        else:
            gamesDF['b'].iloc[index] = gamesDF['a'].iloc[index + 5]
    else:
        pass

In [147]:
#This cell will arrange the gamesDF into the actual formatted games dataframe, newGamesDF
#Setting up the index on the main stats dataframe
dateIndex = []
month = ['OCT', 'NOV', 'DEC', 'JAN', 'FEB']
for index, row in gamesDF.iterrows():
    for i in month:
        if gamesDF.iloc[index][0] == i:
            dateIndex.append(gamesDF.iloc[9,0] + " " + gamesDF.iloc[9,1][1] + ", " + gamesDF.iloc[9,2])
#The next lines will pull the data from the cells in the gamesDF and into the newGamesDF dataframe
gamesData = []
games = 0
for index, row in gamesDF.iterrows():
    if index == 0 or index % 26 == 0:
        gamesData.append(gamesDF.iloc[index+11,0])
        gamesData.append(gamesDF.iloc[index+12,0])
        gamesData.append(gamesDF.iloc[index,1])
        gamesData.append(gamesDF.iloc[index+11,1])
        gamesData.append(gamesDF.iloc[index+11,2])
        gamesData.append(gamesDF.iloc[index+11,3])
        gamesData.append(gamesDF.iloc[index+11,4])
        gamesData.append(gamesDF.iloc[index+12,1])
        gamesData.append(gamesDF.iloc[index+12,2])
        gamesData.append(gamesDF.iloc[index+12,3])
        gamesData.append(gamesDF.iloc[index+12,4])
        gamesData.append(gamesDF.iloc[index+8,0])
        gamesData.append(gamesDF.iloc[index+4,0])
        gamesData.append(gamesDF.iloc[index+22,2])
        gamesData.append(gamesDF.iloc[index+22,4])
        gamesData.append(gamesDF.iloc[index+22,6])
        games += 1
        
gamesData = np.array(gamesData)
gamesData = gamesData.reshape(games,16)
#Create an empty dataframe, based on the games date as the index
newGamesDF = pd.DataFrame(data = gamesData, columns=['Home','Away','W/L','1st Qtr H','2nd Qtr H','3rd Qtr H','4th Qtr H','1st Qtr A','2nd Qtr A','3rd Qtr A','4th Qtr A','Total H','Total A', 'Ref1','Ref2','Ref3'])
newGamesDF

Unnamed: 0,Home,Away,W/L,1st Qtr H,2nd Qtr H,3rd Qtr H,4th Qtr H,1st Qtr A,2nd Qtr A,3rd Qtr A,4th Qtr A,Total H,Total A,Ref1,Ref2,Ref3
0,POR,MIL,MILWAUKEE,26,29,25,30,24,36,24,29,113,110,"Zarba""""","Barnaky""""",Voyard-Tadal


In [148]:
for index,row in finalStats.iterrows():
    finalStats['1st Qtr H'] = newGamesDF['1st Qtr H'][0]
    finalStats['2nd Qtr H'] = newGamesDF['2nd Qtr H'][0]
    finalStats['3rd Qtr H'] = newGamesDF['3rd Qtr H'][0]
    finalStats['4th Qtr H'] = newGamesDF['4th Qtr H'][0]
    finalStats['1st Qtr A'] = newGamesDF['1st Qtr A'][0]
    finalStats['2nd Qtr A'] = newGamesDF['2nd Qtr A'][0]
    finalStats['3rd Qtr A'] = newGamesDF['3rd Qtr A'][0]
    finalStats['4th Qtr A'] = newGamesDF['4th Qtr A'][0]
    finalStats['Total H'] = newGamesDF['Total H'][0]
    finalStats['Total A'] = newGamesDF['Total A'][0]
    finalStats['Ref1'] = newGamesDF['Ref1'][0]
    finalStats['Ref2'] = newGamesDF['Ref2'][0]
    finalStats['Ref3'] = newGamesDF['Ref3'][0]
    finalStats['W/L'] = newGamesDF['W/L'][0]
    finalStats['Home'] = newGamesDF['Home'][0]
    finalStats['Away'] = newGamesDF['Away'][0]
    

In [149]:
#This function will calculate the projected fantasy points per game
def fantasy_points(row):
    try:
        score = (int(row['3PM']) * 3) + (int(row['FGM']) * 2) + (int(row['REB']) * 1) + (int(row['AST']) * 1.5) + (int(row['BLK']) * 3) + (int(row['STL']) * 3) + (int(row['TOV']) * -1)
        return score
    except:
        pass

finalStats['Fantasy Score'] = finalStats.apply(fantasy_points, axis=1)
finalStats = finalStats.fillna('NA')

In [150]:
finalStats['Player']

KeyError: 'Player'

In [156]:
len(newDict)

22

In [179]:
statsDF

Unnamed: 0,PLAYER,MIN,FGM,FGA,FG%,3PM,3PA,3P%,FTM,FTA,...,OREB,DREB,REB,AST,TOV,STL,BLK,PF,PTS,+/-
0,Maurice,Harkless,F,,,,,,,,...,,,,,,,,,,
1,37:50,5,10,50.0,2.0,4.0,50.0,1.0,2.0,50.0,...,3.0,3.0,0.0,1.0,3.0,0.0,3.0,13.0,1.0,
2,Al-Farouq,Aminu,F,,,,,,,,...,,,,,,,,,,
3,25:35,1,7,14.3,1.0,3.0,33.3,2.0,2.0,100.0,...,5.0,6.0,2.0,0.0,0.0,1.0,1.0,5.0,-1.0,
4,Jusuf,Nurkic,C,,,,,,,,...,,,,,,,,,,
5,31:58,7,17,41.2,0.0,1.0,0.0,3.0,4.0,75.0,...,6.0,11.0,3.0,2.0,0.0,3.0,1.0,17.0,-7.0,
6,CJ,McCollum,G,,,,,,,,...,,,,,,,,,,
7,37:03,9,18,50.0,3.0,6.0,50.0,5.0,5.0,100.0,...,3.0,3.0,1.0,2.0,2.0,0.0,2.0,26.0,1.0,
8,Damian,Lillard,G,,,,,,,,...,,,,,,,,,,
9,34:27,6,15,40.0,3.0,6.0,50.0,11.0,12.0,91.7,...,3.0,5.0,4.0,6.0,4.0,0.0,5.0,26.0,-5.0,
