In [1]:
import pandas as pd
import numpy as np
from scipy import linalg as la

In [2]:
# Load the per-game box-score CSV into a DataFrame.
BOX_SCORE_CSV = "2016-17_officialBoxScore.csv"
df = pd.read_csv(BOX_SCORE_CSV)

In [3]:
# Count the nulls in every column, then keep only the columns that have any.
null_counts = df.isnull().sum()
missing_data = null_counts[null_counts > 0]
missing_data

Series([], dtype: int64)

In [4]:
# Capture the column labels as a plain Python list for inspection.
cols = list(df.columns)

In [5]:
# Display the loaded box-score frame.
df

Unnamed: 0,gmDate,gmTime,seasTyp,offLNm,offFNm,teamAbbr,teamConf,teamDiv,teamLoc,teamRslt,...,opptFIC40,opptOrtg,opptDrtg,opptEDiff,opptPlay%,opptAR,opptAST/TO,opptSTL/TO,poss,pace
0,2016-10-25,08:00,Regular,Lane,Karl,NY,East,Atlantic,Away,Loss,...,84.9585,116.5653,87.6731,28.8922,0.4592,20.8951,2.0667,80.0000,100.3729,100.3729
1,2016-10-25,08:00,Regular,Adams,Bennie,NY,East,Atlantic,Away,Loss,...,84.9585,116.5653,87.6731,28.8922,0.4592,20.8951,2.0667,80.0000,100.3729,100.3729
2,2016-10-25,08:00,Regular,Kennedy,Bill,NY,East,Atlantic,Away,Loss,...,84.9585,116.5653,87.6731,28.8922,0.4592,20.8951,2.0667,80.0000,100.3729,100.3729
3,2016-10-25,08:00,Regular,Lane,Karl,CLE,East,Central,Home,Win,...,41.6667,87.6731,116.5653,-28.8922,0.3478,12.9969,0.9444,33.3333,100.3729,99.9564
4,2016-10-25,08:00,Regular,Adams,Bennie,CLE,East,Central,Home,Win,...,41.6667,87.6731,116.5653,-28.8922,0.3478,12.9969,0.9444,33.3333,100.3729,99.9564
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7375,2017-04-12,10:30,Regular,Adams,Bennie,NO,West,Southwest,Away,Win,...,57.1875,100.9166,103.9441,-3.0275,0.4271,15.9744,0.8800,16.0000,99.0917,98.6805
7376,2017-04-12,10:30,Regular,Davis,Marc,NO,West,Southwest,Away,Win,...,57.1875,100.9166,103.9441,-3.0275,0.4271,15.9744,0.8800,16.0000,99.0917,98.6805
7377,2017-04-12,10:30,Regular,Guthrie,David,POR,West,Northwest,Home,Loss,...,65.0415,103.9441,100.9166,3.0275,0.4444,15.3563,1.8182,118.1818,99.0917,99.0917
7378,2017-04-12,10:30,Regular,Adams,Bennie,POR,West,Northwest,Home,Loss,...,65.0415,103.9441,100.9166,3.0275,0.4444,15.3563,1.8182,118.1818,99.0917,99.0917


In [6]:
# Display the list of column names captured from df.
cols

['gmDate',
 'gmTime',
 'seasTyp',
 'offLNm',
 'offFNm',
 'teamAbbr',
 'teamConf',
 'teamDiv',
 'teamLoc',
 'teamRslt',
 'teamMin',
 'teamDayOff',
 'teamPTS',
 'teamAST',
 'teamTO',
 'teamSTL',
 'teamBLK',
 'teamPF',
 'teamFGA',
 'teamFGM',
 'teamFG%',
 'team2PA',
 'team2PM',
 'team2P%',
 'team3PA',
 'team3PM',
 'team3P%',
 'teamFTA',
 'teamFTM',
 'teamFT%',
 'teamORB',
 'teamDRB',
 'teamTRB',
 'teamPTS1',
 'teamPTS2',
 'teamPTS3',
 'teamPTS4',
 'teamPTS5',
 'teamPTS6',
 'teamPTS7',
 'teamPTS8',
 'teamTREB%',
 'teamASST%',
 'teamTS%',
 'teamEFG%',
 'teamOREB%',
 'teamDREB%',
 'teamTO%',
 'teamSTL%',
 'teamBLK%',
 'teamBLKR',
 'teamPPS',
 'teamFIC',
 'teamFIC40',
 'teamOrtg',
 'teamDrtg',
 'teamEDiff',
 'teamPlay%',
 'teamAR',
 'teamAST/TO',
 'teamSTL/TO',
 'opptAbbr',
 'opptConf',
 'opptDiv',
 'opptLoc',
 'opptRslt',
 'opptMin',
 'opptDayOff',
 'opptPTS',
 'opptAST',
 'opptTO',
 'opptSTL',
 'opptBLK',
 'opptPF',
 'opptFGA',
 'opptFGM',
 'opptFG%',
 'oppt2PA',
 'oppt2PM',
 'oppt2P%',
 'o

In [6]:
# Keep only the columns needed to score a game. The raw frame repeats each
# team/game row (the displayed rows differ only in the official's name), so
# collapse the repeats and renumber the index.
score_columns = ['gmDate', 'teamAbbr', 'teamPTS', 'opptAbbr', 'opptPTS']
df_use = (
    df.loc[:, score_columns]
    .drop_duplicates(keep='first')
    .reset_index(drop=True)
)
df_use

Unnamed: 0,gmDate,teamAbbr,teamPTS,opptAbbr,opptPTS
0,2016-10-25,NY,88,CLE,117
1,2016-10-25,CLE,117,NY,88
2,2016-10-25,UTA,104,POR,113
3,2016-10-25,POR,113,UTA,104
4,2016-10-25,SA,129,GS,100
...,...,...,...,...,...
2455,2017-04-12,GS,109,LAL,94
2456,2017-04-12,SAC,95,LAC,115
2457,2017-04-12,LAC,115,SAC,95
2458,2017-04-12,NO,103,POR,100


In [7]:
# Reduce df_use to one [winner, loser, point margin] record per game.
#
# df_use holds every game twice, once from each team's side (e.g. the mirrored
# NY/CLE rows), so the mirror row has to be skipped. Games are identified by
# (date, winner, loser): de-duplicating on [winner, loser, margin] alone would
# also discard genuine rematches that happen to finish with the same margin.
# Columns are read by name via itertuples() instead of by position, because
# integer positions on a label-indexed row are deprecated in pandas.
games = []
seen_games = set()
for row in df_use.itertuples(index=False):
    if row.teamPTS < row.opptPTS:
        winner, loser = row.opptAbbr, row.teamAbbr
    else:
        # Equal scores fall through to this branch, so "team" counts as winner.
        winner, loser = row.teamAbbr, row.opptAbbr
    game_key = (row.gmDate, winner, loser)
    if game_key in seen_games:
        continue
    seen_games.add(game_key)
    ptsDiff = abs(row.teamPTS - row.opptPTS)
    games.append([winner, loser, ptsDiff])

In [8]:
# Display the [winner, loser, point margin] records.
games

[['CLE', 'NY', 29],
 ['POR', 'UTA', 9],
 ['SA', 'GS', 29],
 ['IND', 'DAL', 9],
 ['MIA', 'ORL', 12],
 ['BOS', 'BKN', 5],
 ['TOR', 'DET', 18],
 ['MEM', 'MIN', 4],
 ['CHA', 'MIL', 11],
 ['DEN', 'NO', 5],
 ['OKC', 'PHI', 6],
 ['SAC', 'PHO', 19],
 ['LAL', 'HOU', 6],
 ['ATL', 'WAS', 15],
 ['CHI', 'BOS', 6],
 ['LAC', 'POR', 8],
 ['SA', 'SAC', 8],
 ['CLE', 'TOR', 3],
 ['BKN', 'IND', 9],
 ['DET', 'ORL', 26],
 ['CHA', 'MIA', 6],
 ['OKC', 'PHO', 3],
 ['HOU', 'DAL', 8],
 ['UTA', 'LAL', 7],
 ['GS', 'NO', 8],
 ['ATL', 'PHI', 32],
 ['BOS', 'CHA', 6],
 ['CLE', 'ORL', 6],
 ['NY', 'MEM', 7],
 ['CHI', 'IND', 17],
 ['MIL', 'BKN', 2],
 ['SA', 'NO', 19],
 ['POR', 'DEN', 2],
 ['SAC', 'MIN', 3],
 ['LAC', 'UTA', 13],
 ['DET', 'MIL', 15],
 ['SA', 'MIA', 7],
 ['GS', 'PHO', 6],
 ['MEM', 'WAS', 9],
 ['OKC', 'LAL', 17],
 ['HOU', 'DAL', 1],
 ['ATL', 'SAC', 11],
 ['CHI', 'BKN', 30],
 ['TOR', 'DEN', 3],
 ['LAC', 'PHO', 18],
 ['CLE', 'HOU', 8],
 ['IND', 'LAL', 7],
 ['ORL', 'PHI', 2],
 ['DET', 'NY', 13],
 ['MIA', 'SAC',

In [9]:
# Spot-check: print every record whose winner is "CLE".
cle_wins = [game for game in games if game[0] == "CLE"]
for game in cle_wins:
    print(game)

['CLE', 'NY', 29]
['CLE', 'TOR', 3]
['CLE', 'ORL', 6]
['CLE', 'HOU', 8]
['CLE', 'BOS', 6]
['CLE', 'PHI', 1]
['CLE', 'WAS', 11]
['CLE', 'CHA', 7]
['CLE', 'TOR', 4]
['CLE', 'DET', 23]
['CLE', 'POR', 12]
['CLE', 'DAL', 38]
['CLE', 'PHI', 4]
['CLE', 'NY', 32]
['CLE', 'MIA', 30]
['CLE', 'CHA', 11]
['CLE', 'MEM', 17]
['CLE', 'LAL', 11]
['CLE', 'MIL', 6]
['CLE', 'MIL', 11]
['CLE', 'BKN', 20]
['CLE', 'GS', 1]
['CLE', 'CHA', 12]
['CLE', 'NO', 8]
['CLE', 'BKN', 8]
['CLE', 'PHO', 4]
['CLE', 'SAC', 12]
['CLE', 'PHO', 15]
['CLE', 'OKC', 16]
['CLE', 'MIN', 28]
['CLE', 'NY', 7]
['CLE', 'WAS', 5]
['CLE', 'IND', 15]
['CLE', 'DEN', 16]
['CLE', 'MIN', 8]
['CLE', 'IND', 9]
['CLE', 'NY', 15]
['CLE', 'MIL', 7]
['CLE', 'ATL', 5]
['CLE', 'ORL', 12]
['CLE', 'DET', 32]
['CLE', 'UTA', 8]
['CLE', 'LAL', 5]
['CLE', 'PHI', 17]
['CLE', 'IND', 5]
['CLE', 'ORL', 20]
['CLE', 'BOS', 23]


In [12]:
class Node:
    """A team in the results graph, tracking the margins it has lost by."""

    def __init__(self, name):
        """Create a node for the team called ``name`` with no losses recorded."""
        self.name = name
        # Maps opponent -> accumulated point margin lost to that opponent.
        self.losses = dict()

    def add_loss(self, oppteam, ptsDiff):
        """Add ``ptsDiff`` to the running total lost to ``oppteam``."""
        running_total = self.losses[oppteam] if oppteam in self.losses else 0
        self.losses[oppteam] = running_total + ptsDiff

In [13]:
# Build one Node per team and, for every game, add the margin to the loser's
# tally against the winner.
#
# Winners are registered as well: a team that never lost would otherwise have
# no node, and the later lookup of its matrix column would fail. Nodes are
# created only when first seen rather than constructing a throwaway Node on
# every iteration.
nodes = {}
for winner, loser, ptsDiff in games:
    if winner not in nodes:
        nodes[winner] = Node(winner)
    if loser not in nodes:
        nodes[loser] = Node(loser)
    nodes[loser].add_loss(winner, ptsDiff)

In [14]:
# Allocate a square, zero-filled matrix with one row and column per team node.
size = len(nodes)
TM = np.zeros(shape=(size, size))
size

30

In [15]:
# Alphabetical team order defines the matrix row/column order;
# id_team will map each team name to its index.
teams = sorted(nodes)
id_team = dict()

In [16]:
# Record each team's position in the sorted list.
for position, team in enumerate(teams):
    id_team[team] = position

In [17]:
# Fill the matrix: row = losing team, column = team it lost to,
# value = accumulated point margin of those losses.
for row_idx, team in enumerate(teams):
    for beaten_by, margin in nodes[team].losses.items():
        TM[row_idx, id_team[beaten_by]] = float(margin)

In [18]:
# Display the filled loss-margin matrix.
TM

array([[ 0., 24.,  2., 27.,  2.,  5.,  0.,  0., 59., 13.,  0., 19., 25.,
        22., 12., 41.,  3., 22., 18., 10.,  3., 30.,  0.,  2., 16.,  8.,
         1., 44., 52., 33.],
       [25.,  0., 36., 11., 71., 28., 16., 20.,  1., 33., 29., 42., 32.,
         7.,  9., 20., 33., 20.,  9., 19., 37., 15., 16.,  0., 34., 55.,
        17., 49., 27., 53.],
       [23.,  0.,  0.,  0.,  7., 29.,  0., 36.,  7., 16.,  1.,  0., 14.,
         0.,  0.,  0.,  3.,  0.,  1., 11.,  8.,  0.,  6.,  3.,  4., 13.,
        16., 25.,  0., 40.],
       [27.,  2., 31.,  0., 13., 30.,  0.,  0., 30., 25., 19., 37.,  8.,
         0., 15., 20., 10.,  5.,  8.,  5.,  0.,  0., 15., 17., 17., 22.,
         3.,  7.,  7., 20.],
       [22.,  1., 27., 12.,  0.,  0., 26., 21., 25., 31., 25., 27., 17.,
         6.,  7., 12., 57., 33.,  0., 37., 15.,  7., 10., 18.,  2., 19.,
         0.,  2.,  0., 17.],
       [19.,  0.,  4.,  0., 36.,  0.,  7., 13., 21., 35.,  5., 10., 49.,
         0.,  8., 39., 17.,  0.,  2.,  0.,  9.,  0.,

In [19]:
# Markov Matrix
# Normalise every row of TM to sum to 1 so it can act as a transition matrix.
row_totals = TM.sum(axis = 1, keepdims=True)
# A row that sums to zero would turn into NaNs (0/0) and break the
# eigen-decomposition that follows. Such rows are replaced by a uniform
# distribution over all teams, the usual PageRank treatment of a node with no
# outgoing weight. Rows with a non-zero total are divided exactly as before.
has_weight = row_totals != 0
safe_totals = np.where(has_weight, row_totals, 1.0)
uniform_share = 1.0 / max(TM.shape[1], 1)
MM = np.where(has_weight, TM / safe_totals, uniform_share)

In [20]:
# Display the row-normalised matrix.
MM

array([[0.        , 0.04868154, 0.0040568 , 0.05476673, 0.0040568 ,
        0.01014199, 0.        , 0.        , 0.11967546, 0.02636917,
        0.        , 0.03853955, 0.05070994, 0.04462475, 0.02434077,
        0.0831643 , 0.00608519, 0.04462475, 0.03651116, 0.02028398,
        0.00608519, 0.06085193, 0.        , 0.0040568 , 0.03245436,
        0.01622718, 0.0020284 , 0.08924949, 0.10547667, 0.06693712],
       [0.03272251, 0.        , 0.04712042, 0.01439791, 0.09293194,
        0.03664921, 0.02094241, 0.02617801, 0.0013089 , 0.04319372,
        0.03795812, 0.05497382, 0.04188482, 0.0091623 , 0.0117801 ,
        0.02617801, 0.04319372, 0.02617801, 0.0117801 , 0.02486911,
        0.04842932, 0.01963351, 0.02094241, 0.        , 0.04450262,
        0.07198953, 0.02225131, 0.06413613, 0.03534031, 0.06937173],
       [0.08745247, 0.        , 0.        , 0.        , 0.02661597,
        0.11026616, 0.        , 0.13688213, 0.02661597, 0.0608365 ,
        0.00380228, 0.        , 0.05323194, 0.

In [21]:
# PageRank
# The score vector is the left eigenvector of MM that belongs to eigenvalue 1
# (the stationary distribution of the row-stochastic matrix).
w, vl, vr = la.eig(MM, left = True)
# eig() gives no guarantee about the order of the eigenvalues, so pick the one
# closest to 1 explicitly instead of assuming it sits in column 0.
stationary_idx = np.argmin(np.abs(w - 1.0))
# The eigenvector may come back complex or with a flipped sign; only its
# magnitudes matter for ranking.
vl = np.absolute(vl[:, stationary_idx])

In [22]:
# Start with an empty ranking and rebuild the alphabetical team list, which
# matches the index order of the score vector.
rankings = list()
teams = sorted(id_team)

In [23]:
# Order the teams from highest to lowest score.
#
# A single argsort replaces the "take argmax, then zero that entry" loop,
# which hard-coded 30 teams, overwrote vl (so the cell gave different results
# when re-run) and could select the same index repeatedly once only zero
# scores were left. kind="stable" keeps the lower index first on ties, which
# is also what argmax does.
score_order = np.argsort(-vl, kind="stable")
rankings = [teams[k] for k in score_order]

In [24]:
# Keep only the highest-ranked teams.
TOP_N = 15
rankings = rankings[:TOP_N]

In [25]:
# Put the ranked team names into a one-column frame for display.
df_pagerank = pd.DataFrame(data=rankings, columns=["team"])

In [26]:
# Display the PageRank-based ranking.
df_pagerank

Unnamed: 0,team
0,GS
1,SA
2,LAC
3,MEM
4,UTA
5,HOU
6,DEN
7,TOR
8,CLE
9,OKC


In [27]:
# Load the standings CSV used as the comparison baseline.
STANDINGS_CSV = "2016-17_standings.csv"
df_standings = pd.read_csv(STANDINGS_CSV)

In [28]:
# Display the loaded standings frame.
df_standings

Unnamed: 0,stDate,teamAbbr,rank,rankOrd,gameWon,gameLost,stk,stkType,stkTot,gameBack,...,rel%Indx,mov,srs,pw%,pyth%13.91,wpyth13.91,lpyth13.91,pyth%16.5,wpyth16.5,lpyth16.5
0,2016-10-25,ATL,2,2nd,0,0,-,-,0,0.5,...,0.000000,0.0000,0.0000,0.5000,0.0000,0.0000,82.0000,0.0000,0.0000,82.0000
1,2016-10-25,BKN,2,2nd,0,0,-,-,0,0.5,...,0.000000,0.0000,0.0000,0.5000,0.0000,0.0000,82.0000,0.0000,0.0000,82.0000
2,2016-10-25,BOS,2,2nd,0,0,-,-,0,0.5,...,0.000000,0.0000,0.0000,0.5000,0.0000,0.0000,82.0000,0.0000,0.0000,82.0000
3,2016-10-25,CHA,2,2nd,0,0,-,-,0,0.5,...,0.000000,0.0000,0.0000,0.5000,0.0000,0.0000,82.0000,0.0000,0.0000,82.0000
4,2016-10-25,CHI,2,2nd,0,0,-,-,0,0.5,...,0.000000,0.0000,0.0000,0.5000,0.0000,0.0000,82.0000,0.0000,0.0000,82.0000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4795,2017-04-12,SA,2,2nd,61,21,L3,loss,3,6.0,...,0.559475,7.1951,6.6971,0.7369,0.7279,59.6878,22.3122,0.7627,62.5414,19.4586
4796,2017-04-12,SAC,12,12th,32,50,L1,loss,1,35.0,...,0.478375,-3.8780,-4.3858,0.3723,0.3740,30.6680,51.3320,0.3518,28.8476,53.1524
4797,2017-04-12,TOR,2,2nd,51,31,W4,win,4,2.0,...,0.524750,4.2073,3.7150,0.6385,0.6362,52.1684,29.8316,0.6599,54.1118,27.8882
4798,2017-04-12,UTA,4,4th,51,31,W2,win,2,16.0,...,0.530725,3.9390,3.4387,0.6297,0.6353,52.0946,29.9054,0.6589,54.0298,27.9702


In [29]:
# Narrow the standings to the date, team and rank columns.
rank_columns = ['stDate', 'teamAbbr', 'rank']
df_rank = df_standings.loc[:, rank_columns]

In [30]:
# Display the narrowed standings frame.
df_rank

Unnamed: 0,stDate,teamAbbr,rank
0,2016-10-25,ATL,2
1,2016-10-25,BKN,2
2,2016-10-25,BOS,2
3,2016-10-25,CHA,2
4,2016-10-25,CHI,2
...,...,...,...
4795,2017-04-12,SA,2
4796,2017-04-12,SAC,12
4797,2017-04-12,TOR,2
4798,2017-04-12,UTA,4


In [31]:
# Average each team's rank over all standings rows.
# Only the "rank" column is aggregated: df_rank also carries stDate, which is
# read from the CSV as text, and pandas >= 2.0 raises a TypeError when asked
# for the mean of a non-numeric column instead of silently dropping it.
# Selecting with a list keeps the result a one-column DataFrame.
ranked = df_rank.groupby("teamAbbr")[["rank"]].mean()

In [32]:
# Sort by average rank, best (lowest) first; ascending order is the default.
df_ranked = ranked.sort_values(by="rank")

In [33]:
# Display the teams ordered by average rank.
df_ranked

Unnamed: 0_level_0,rank
teamAbbr,Unnamed: 1_level_1
CLE,1.0625
GS,1.48125
SA,2.0625
TOR,2.9125
BOS,3.13125
HOU,3.51875
LAC,3.7625
ATL,4.86875
UTA,5.175
OKC,5.69375
