In [209]:
import pandas as pd
import numpy as np

In [210]:
### Some stuff i learned about data:

## in player_raw::
# element_type indicates position as integer
# status is whether the player is available
# team indicates team (alphabetically) 
# team_id = ???
# id is the same id as above


## in player_df::
# opponent_team is id'd alphabetically
# each player is id'd by element
# gw indicates gameweek

In [211]:
player_df = pd.read_csv('/Users/petter/Documents/FPL/data/player_data/gw_17.csv')
team_df = pd.read_csv('/Users/petter/Documents/FPL/data/team_data/E0_17.csv')

#This list translates id ('element' in player_df) to name
player_idlist = pd.read_csv('/Users/petter/Documents/FPL/FPL_scraper/data/2017-18/player_idlist.csv')

player_raw = pd.read_csv('/Users/petter/Documents/FPL/FPL_scraper/data/2017-18/players_raw.csv')



In [212]:

## Prepare a dictionary and df to identify the teams by id
pl_teams = team_df.HomeTeam.unique()
pl_teams.sort()

team_dict = {name:id_nr for name,id_nr in zip(pl_teams,list(range(1,len(pl_teams)+1)))}
team_dict_df = pd.DataFrame.from_dict(team_dict,'index')
team_dict_df.columns = ['team_id']



In [213]:

player_df.drop_duplicates(inplace = True) #Some duplicates in the data
player_df.rename(columns={'element':'player_id'}, inplace=True)
player_df.drop('id',axis=1, inplace = True) #drop useless column

# Add some extra data to player_df from player_raw


merged_df = player_df.merge(player_raw[['element_type','status','team','id']],
                            how = 'left', left_on= 'player_id', right_on = 'id')

#merged_df = merged_df.drop(['']) 


In [214]:
# Do two merges to append home and away team id seperately
team_df = team_df.merge(team_dict_df, how = 'left', left_on = 'HomeTeam', right_index = True)
team_df.rename(columns={'team_id':'home_id'}, inplace=True)

team_df = team_df.merge(team_dict_df, how = 'left', left_on = 'AwayTeam', right_index = True)
team_df.rename(columns={'team_id':'away_id'}, inplace=True)



In [215]:
# Define and select the list of features we want to use from the team match stats

team_features = ['home_id','away_id','FTHG','FTAG','HS','AS','HST','AST','HF','AF','HC','AC','B365H','B365D','B365D']
team_df = team_df[team_features]

In [216]:
# Join match data to merged_df for home and away seperately, then concat them

home_df = merged_df[merged_df.was_home == True].merge(team_df, how = 'left',
                                                      left_on = ['team','opponent_team'], right_on = ['home_id','away_id'])

away_df = merged_df[merged_df.was_home == False].merge(team_df, how = 'left',
                                                      left_on = ['team','opponent_team'], right_on = ['away_id','home_id'])

full_df = pd.concat([home_df,away_df],axis = 0)

# Sort values and set multi-level index
full_df.dropna(inplace = True) # Get some nas from team-join for some reason?
full_df.sort_values(by=['player_id','gw'],inplace=True)
full_df.set_index(['player_id','gw'],inplace=True)

In [377]:
def create_dataset(train_raw, target_raw, window_size):
    '''
    Formats the train_raw data into input sequences of length window_size and with the
    columns of train_raw as features. 
    Output dataX as np.array with shape (n_samples, window_size, n_features),
    dataY are the corresponding targets
    '''
    
    dataX, dataY = [], []    
    
    n_samples = train_raw.shape[0]-window_size
    
    for i in range(n_samples):
            dataX.append(train_raw[i:i+window_size,:])
            dataY.append(target_raw[i+window_size])
    
    dataX = np.array(dataX).reshape((n_samples,window_size,train_raw.shape[1]))
    dataY = np.array(dataY)
    
    return dataX, dataY
 

In [400]:
# Columns we drop from the analysis
drop_columns = ['bps', 'ea_index','fixture',
                'ict_index','kickoff_time','kickoff_time_formatted',
                'loaned_out','loaned_in','opponent_team',
                'round', 'team_a_score', 'team_h_score', 'threat',
                'transfers_balance', 'transfers_in', 'transfers_out',
                'was_home', 'name', 'team', 'id', 'home_id', 'away_id','status']


train_df = full_df.drop(drop_columns,axis=1)

# Note: 
# Dropped columns that might be interesting: 
# transfer_blanace + transfer_in/transfer_out
# status (categorical variable, convert to one-hot maybe?)

In [401]:
window = 10



totX = np.empty((0,window,train_df.shape[1]))
totY = []

for p_id in train_df.index.get_level_values('player_id').unique():
    
    player_df = train_df.loc[p_id]
    if player_df.shape[0] >= window:
        #Only append values if player has enough gameweeks
        player_target = (player_df['total_points'].values/
                         player_df['value'].values)
        
        playerX, playerY = create_dataset(player_df.as_matrix(),
                                          player_target,
                                          window)
        
        totX = np.concatenate((totX,playerX), axis = 0)
        totY = np.concatenate((totY,playerY))



In [415]:

from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
from keras.layers.normalization import BatchNormalization
from keras.optimizers import Adam

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error






In [408]:
trainX, testX, trainY, testY = train_test_split(totX, totY, test_size = 0.25)

In [420]:
def getLSTM(prob_shape):
    model = Sequential()

    model.add(LSTM(128,activation='tanh',input_shape = prob_shape))
    model.add(Dense(256, activation='relu'))
    model.add(Dense(1, activation="linear"))

    optimizer = Adam(lr=0.01, decay=0.0)
    model.compile(loss='logcosh', optimizer=optimizer, metrics=['mae'])
    
    return model

In [None]:
x_shape = (totX.shape[1], totX.shape[2])

model = getLSTM(x_shape)


#model = getMiniModel()
model.summary()

batch_size = 1
earlyStopping = EarlyStopping(monitor='val_loss', patience=8, verbose=0, mode='min')
mcp_save = ModelCheckpoint('.mdl_wts.hdf5', save_best_only=True, monitor='val_loss', mode='min')
reduce_lr_loss = ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=5, verbose=1, epsilon=1e-4, mode='min')

model.fit(trainX, trainY, batch_size=batch_size, epochs=100, verbose=1,
          callbacks=[earlyStopping, mcp_save, reduce_lr_loss], validation_split=0.25)
  


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_7 (LSTM)                (None, 128)               91136     
_________________________________________________________________
dense_11 (Dense)             (None, 256)               33024     
_________________________________________________________________
dense_12 (Dense)             (None, 1)                 257       
Total params: 124,417
Trainable params: 124,417
Non-trainable params: 0
_________________________________________________________________
Train on 8662 samples, validate on 2888 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100

Epoch 00007: ReduceLROnPlateau reducing learning rate to 0.0009999999776482583.
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100

Epoch 00012: ReduceLROnPlateau reducing learning rate to 9.999999310821295e-05.
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoc

In [422]:
model.predict(testX)

array([[0.02315273],
       [0.02315273],
       [0.02315273],
       ...,
       [0.02315273],
       [0.02315273],
       [0.02315273]], dtype=float32)

In [423]:
np.mean(trainY)

0.025483001379376008