# Part I

Need data to talk about data, and a model to talk about models...

### The Data

In [1]:
import pandas as pd

df = pd.read_csv('data/basketball.csv', parse_dates=[4])
df = df.sort_values(['name', 'date']).reset_index(drop=True)

In [2]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15345 entries, 0 to 15344
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   name      15345 non-null  object        
 1   position  15345 non-null  object        
 2   minutes   15345 non-null  int64         
 3   points    15345 non-null  int64         
 4   date      15345 non-null  datetime64[ns]
dtypes: datetime64[ns](1), int64(2), object(2)
memory usage: 599.5+ KB


In [3]:
df.shape

(15345, 5)

In [4]:
df.sample(10)

Unnamed: 0,name,position,minutes,points,date
9114,L. Galloway,G,3,0,2021-01-03
9607,M. Carter-Williams,G,0,0,2021-03-19
909,Ben McLemore,G,0,0,2021-02-26
9758,M. Robinson,C-F,0,0,2021-03-11
4232,Ed Davis,C-F,0,0,2021-02-05
15294,Zeke Nnaji,F-C,0,0,2021-02-06
2002,Chris Paul,G,29,11,2020-12-31
1482,C. Felicio,F-C,0,0,2021-03-17
4936,Gary Trent,G-F,35,17,2021-03-11
12320,Reggie Jackson,G,13,5,2020-12-25


In [5]:
df[df['name'] == 'LeBron James'].head(3)

Unnamed: 0,name,position,minutes,points,date
9297,LeBron James,F,28,22,2020-12-22
9298,LeBron James,F,31,22,2020-12-25
9299,LeBron James,F,26,18,2020-12-27


In [6]:
lebron = df[df['name'] == 'LeBron James'].copy()

In [8]:
lebron['points_1'] = lebron['points'].shift(1)
lebron['points_2'] = lebron['points'].shift(2)

In [9]:
lebron.head(10)

Unnamed: 0,name,position,minutes,points,date,points_1,points_2
9297,LeBron James,F,28,22,2020-12-22,,
9298,LeBron James,F,31,22,2020-12-25,22.0,
9299,LeBron James,F,26,18,2020-12-27,22.0,22.0
9300,LeBron James,F,35,29,2020-12-28,18.0,22.0
9301,LeBron James,F,35,26,2020-12-30,29.0,18.0
9302,LeBron James,F,35,26,2021-01-01,26.0,29.0
9303,LeBron James,F,32,22,2021-01-03,26.0,26.0
9304,LeBron James,F,34,26,2021-01-05,22.0,26.0
9305,LeBron James,F,34,27,2021-01-07,26.0,22.0
9306,LeBron James,F,34,28,2021-01-08,27.0,26.0


In [10]:
df['points_1'] = df.groupby('name')['points'].shift(1)
df['points_2'] = df.groupby('name')['points'].shift(2)

In [11]:
df = df.dropna(subset=["points_1", "points_2"])

### The Objective

<font color="red">0 to 💯 real quick</font>

Predict \*points\* next game based on the points score for the last two games

In [12]:
target = 'points'
y = df[target]
X = df[['position', 'points_1', 'points_2']]

In [13]:
from sklearn.model_selection import train_test_split

`train_test_split` on time series data is a little different...

In [14]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, 
    test_size=0.10, 
    random_state=42, 
    shuffle=False
)

### The Model

In [15]:
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import LabelBinarizer, StandardScaler
from sklearn.impute import SimpleImputer
# new
from sklearn_pandas import DataFrameMapper

In [16]:
mapper = DataFrameMapper([
    (['position'], [SimpleImputer(strategy="most_frequent"), LabelBinarizer()]),
    (['points_1'], [SimpleImputer(), StandardScaler()]), 
    (['points_2'], [SimpleImputer(), StandardScaler()]),
], df_out=True)

In [17]:
Z_train = mapper.fit_transform(X_train)
Z_test = mapper.transform(X_test)

In [18]:
model = LinearRegression()
model.fit(Z_train, y_train)

LinearRegression()

In [19]:
model.score(Z_train, y_train)

0.4782386705540024

In [20]:
from sklearn.metrics import mean_squared_error

In [21]:
mean_squared_error(y_test, model.predict(Z_test)) ** (1/2)

5.936184414953741

### DataFrameMapper (How + Why)

https://github.com/scikit-learn-contrib/sklearn-pandas

```pip install sklearn-pandas```

In [22]:
demo = pd.DataFrame({
    'position': ['C', 'SF', 'SG', 'PG']
})

pd.get_dummies(demo)

Unnamed: 0,position_C,position_PG,position_SF,position_SG
0,1,0,0,0
1,0,0,1,0
2,0,0,0,1
3,0,1,0,0


In [23]:
demo_2 = pd.DataFrame({
    'position': ['C', 'SF-SG', 'SG', 'C']
})

pd.get_dummies(demo_2)

Unnamed: 0,position_C,position_SF-SG,position_SG
0,1,0,0
1,0,1,0
2,0,0,1
3,1,0,0


In [24]:
new = pd.DataFrame({
    'position': ['🍔']
})

pd.get_dummies(new)

Unnamed: 0,position_🍔
0,1


### What to do instead...

In [25]:
X_train.sample(5)

Unnamed: 0,position,points_1,points_2
8781,G,0.0,0.0
11423,F,0.0,4.0
13071,F,24.0,5.0
8485,F,0.0,0.0
10962,G-F,0.0,0.0


In [26]:
lb = LabelBinarizer()
lb.fit(X_train['position'])
lb.transform(X_train['position'])

array([[0, 0, 1, ..., 0, 0, 0],
       [0, 0, 1, ..., 0, 0, 0],
       [0, 0, 1, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 1, 0],
       [0, 0, 0, ..., 0, 1, 0],
       [0, 0, 0, ..., 0, 1, 0]])

In [27]:
lb.classes_

array(['C', 'C-F', 'F', 'F-C', 'F-G', 'G', 'G-F', 'PG'], dtype='<U3')

In [28]:
new = pd.DataFrame({
    'position': ['🍔']
})

In [29]:
lb.transform(new['position'])

array([[0, 0, 0, 0, 0, 0, 0, 0]])

In [30]:
new = pd.DataFrame({
    'position': [None]
})

# lb.transform(new['position'])

In [31]:
mapper = DataFrameMapper([
    (['position'], [SimpleImputer(strategy="most_frequent"), LabelBinarizer()]),
    (['points_1'], [SimpleImputer(), StandardScaler()]), 
    (['points_2'], [SimpleImputer(), StandardScaler()]),
], df_out=True)

In [32]:
mapper.fit(X_train)
mapper.transform(X_train)[:10]

Unnamed: 0,position_C,position_C-F,position_F,position_F-C,position_F-G,position_G,position_G-F,position_PG,points_1,points_2
2,0,0,1,0,0,0,0,0,-0.454679,-0.806253
3,0,0,1,0,0,0,0,0,0.128367,-0.455934
4,0,0,1,0,0,0,0,0,-0.804506,0.127931
5,0,0,1,0,0,0,0,0,-0.804506,-0.806253
6,0,0,1,0,0,0,0,0,-0.804506,-0.806253
7,0,0,1,0,0,0,0,0,-0.454679,-0.806253
8,0,0,1,0,0,0,0,0,-0.804506,-0.455934
9,0,0,1,0,0,0,0,0,-0.804506,-0.806253
10,0,0,1,0,0,0,0,0,-0.804506,-0.806253
11,0,0,1,0,0,0,0,0,-0.804506,-0.806253


But, maybe the best part about `mapper` is that you can put it in a pipeline...

In [33]:
from sklearn.pipeline import make_pipeline

pipe = make_pipeline(mapper, model)
pipe.fit(X_train, y_train)

Pipeline(steps=[('dataframemapper',
                 DataFrameMapper(df_out=True,
                                 features=[(['position'],
                                            [SimpleImputer(strategy='most_frequent'),
                                             LabelBinarizer()]),
                                           (['points_1'],
                                            [SimpleImputer(),
                                             StandardScaler()]),
                                           (['points_2'],
                                            [SimpleImputer(),
                                             StandardScaler()])])),
                ('linearregression', LinearRegression())])

### The Pickle 🥒

In [34]:
import pickle

with open('pickles/pipe.pkl', 'wb') as f:
    pickle.dump(pipe, f)

In [35]:
del pipe

In [36]:
with open('pickles/pipe.pkl', 'rb') as f:
    pipe = pickle.load(f)

In [37]:
pipe.score(X_train, y_train)

0.4782386705540024

In [38]:
pipe.predict(X_train)[:10]

array([2.88867188, 6.0546875 , 4.09960938, 1.56054688, 1.56054688,
       2.88867188, 2.51367188, 1.56054688, 1.56054688, 1.56054688])

In [39]:
X_train.sample(1).to_dict(orient='list')

{'position': ['G'], 'points_1': [2.0], 'points_2': [0.0]}

In [40]:
new = pd.DataFrame({
    'position': ['F'], 
    'points_1': [9.0], 
    'points_2': [8.0]
})

In [41]:
pipe.predict(new)

array([8.08398438])