# Baseline Model

The notebook consists of three parts.
* Import and install related packages
* Data Preprocessing
* Model Construction

In [1]:
!pip install understat

Collecting understat
  Downloading understat-0.1.7-py3-none-any.whl (10 kB)
Collecting pytest-aiohttp==0.3.0
  Downloading pytest_aiohttp-0.3.0-py3-none-any.whl (3.9 kB)
Collecting pytest-cov==3.0.0
  Downloading pytest_cov-3.0.0-py3-none-any.whl (20 kB)
Collecting pytest-mock==3.6.0
  Downloading pytest_mock-3.6.0-py3-none-any.whl (12 kB)
Collecting aiohttp==3.7.4
  Downloading aiohttp-3.7.4-cp38-cp38-macosx_10_14_x86_64.whl (648 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m648.3/648.3 kB[0m [31m9.4 MB/s[0m eta [36m0:00:00[0m31m17.0 MB/s[0m eta [36m0:00:01[0m
[?25hCollecting pytest==6.2.0
  Downloading pytest-6.2.0-py3-none-any.whl (279 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m279.6/279.6 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m
[?25hCollecting beautifulsoup4==4.9.3
  Downloading beautifulsoup4-4.9.3-py3-none-any.whl (115 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━

## Install Packages

In [3]:
!pip install nest-asyncio

Collecting nest-asyncio
  Downloading nest_asyncio-1.5.6-py3-none-any.whl (5.2 kB)
Installing collected packages: nest-asyncio
Successfully installed nest-asyncio-1.5.6


In [67]:
import asyncio
import json
import aiohttp
from understat import Understat
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import *
import nest_asyncio
nest_asyncio.apply()
# __import__('IPython').embed()

## Loading and Preparing Data from Understat

I will use my team, Chelsea, as an example. Match results will be predicted from:
1. xG (predicted)
2. Previous record
3. Recent form (wins within the last 5 games)
4. Tactics (formation)
5. Players in the squad (rated by their recent performance via an evaluation system)
6. Rank in the table
7. Coach and other factors
   
<hr>

The xG expectation, in turn, will be predicted from:
1. Previous goals against the team
2. Recent goals (last five games)
3. Players
4. Game stats (possession and shots over the last five games)

### xG data preparation

* Acquire all match-related data such as result, time, and goals
* Find the statistics of each particular game (formation, shots, missed shots)
* Obtain the recent statistics and club state
* Count the number of players in the top-10 goal and assist rankings

In [69]:
# Notebook-wide settings
# Team title handed to get_match_result when the results are fetched from Understat.
team = 'Chelsea'
# NOTE(review): not referenced by any code visible in this notebook; presumably the
# "recent form" window in games (the plan above mentions five games) - confirm.
stats_interval = 5

#### Data Function

In [74]:
def get_match_result(team, season):
    '''
    Fetch the match results of one team for one season from Understat.

    Parameter:
    team - team title forwarded to Understat.get_team_results
    season - season forwarded to Understat.get_team_results

    Return the value produced by Understat.get_team_results, unmodified
    '''
    async def fetch_results():
        # The HTTP session lives only for the duration of this single request.
        async with aiohttp.ClientSession() as http_session:
            client = Understat(http_session)
            return await client.get_team_results(team, season)

    # Drive the coroutine to completion on the current event loop.
    return asyncio.get_event_loop().run_until_complete(fetch_results())

def get_playershot_data(match_id):
    '''
    Fetch the shot data of one match from Understat.

    Parameter:
    match_id - match id forwarded to Understat.get_match_shots

    Return the value produced by Understat.get_match_shots, unmodified
    '''
    async def fetch_shots():
        # The HTTP session lives only for the duration of this single request.
        async with aiohttp.ClientSession() as http_session:
            client = Understat(http_session)
            return await client.get_match_shots(match_id)

    # Drive the coroutine to completion on the current event loop.
    return asyncio.get_event_loop().run_until_complete(fetch_shots())

def get_player_data(match_id):
    '''
    Fetch the per-player data of one match from Understat.

    Parameter:
    match_id - match id forwarded to Understat.get_match_players

    Return the value produced by Understat.get_match_players, unmodified
    '''
    async def fetch_players():
        # The HTTP session lives only for the duration of this single request.
        async with aiohttp.ClientSession() as http_session:
            client = Understat(http_session)
            return await client.get_match_players(match_id)

    # Drive the coroutine to completion on the current event loop.
    return asyncio.get_event_loop().run_until_complete(fetch_players())

def get_goal_rank(league, season, find_team=False, team=None):
    '''
    Fetch the player list of a league season from Understat.

    Parameter:
    league - league name forwarded to Understat.get_league_players
    season - season forwarded to Understat.get_league_players
    find_team - when truthy, restrict the request with team_title=team;
                defaults to False so get_goal_rank(league, season) is valid
    team - value sent as team_title; only used when find_team is truthy

    Return the value produced by Understat.get_league_players, unmodified
    '''
    # Build the optional filter once instead of duplicating the whole request
    # in two branches. The keyword is omitted entirely when find_team is
    # falsy, exactly like the original unfiltered call.
    filters = {'team_title': team} if find_team else {}

    async def main():
        async with aiohttp.ClientSession() as session:
            understat = Understat(session)
            return await understat.get_league_players(league, season, **filters)

    loop = asyncio.get_event_loop()
    return loop.run_until_complete(main())



#### Data Processing Functions

In [105]:
def Constrcut_df_from_results(li):
    '''
    Flatten a list of match-result dicts into one DataFrame row per game.

    Parameter: li - List of game dicts; each one is read through the keys
    'id', 'side', 'h', 'a', 'result', 'datetime', 'xG' and 'goals'

    Return DataFrame with the columns

    id - game id
    oppenet - title of the opposing team
    opponent_id - id of the opposing team
    home - 0 when the game's 'side' is 'h', 1 otherwise
    result - game['result'] as given
    datetime - game['datetime'] as given
    xG - xG entry of the team's own side
    num_of_score_goals - goals entry of the team's own side
    num_of_conced_goals - goals entry of the opposing side
    '''
    columns = ['id', 'oppenet', 'opponent_id', 'home', 'result', 'datetime',
               'xG', 'num_of_score_goals', 'num_of_conced_goals']

    def flatten(game):
        # One tuple per game, ordered like `columns`.
        own_side = game['side']
        played_at_home = own_side == 'h'
        other_side = 'a' if played_at_home else 'h'
        rival = game[other_side]
        return (
            game['id'],
            rival['title'],
            rival['id'],
            0 if played_at_home else 1,
            game['result'],
            game['datetime'],
            game['xG'][own_side],
            game['goals'][own_side],
            game['goals'][other_side],
        )

    rows = [flatten(game) for game in li]
    return pd.DataFrame(rows, columns=columns)

def num_of_shots(id, home):
    '''
    Count the shot entries recorded for one side of a game.

    Parameter:
    id - Game id, forwarded to get_playershot_data
    home - 0 selects the 'h' side; any other value selects the 'a' side

    Return the number of shot entries on the selected side
    '''
    shots_by_side = get_playershot_data(id)
    side_key = 'a' if home != 0 else 'h'
    return len(shots_by_side[side_key])

# after merge data 
def get_stats_dict(league, season, type, n,find_team, team = None):
    '''
    Return the names of the top-n players for one statistic.

    Despite the name, the result is a list of player names, not a dict.

    Parameter:
    league - league name forwarded to get_goal_rank
    season - season forwarded to get_goal_rank
    type - key of the statistic to rank by (e.g. 'goals', 'assists')
    n - maximum number of names to return
    find_team - forwarded to get_goal_rank
    team - forwarded to get_goal_rank

    Return list of at most n player names, highest value first
    '''
    players = get_goal_rank(league, season, find_team, team)
    # Later entries overwrite earlier ones that share a player_name.
    stat_by_player = {player['player_name']: player[type] for player in players}
    # float() instead of int(): values arrive as strings, and decimal-valued
    # statistics such as 'xG' would make int() raise ValueError. Integer-valued
    # statistics rank exactly as before.
    ranked = sorted(stat_by_player, key=lambda name: float(stat_by_player[name]), reverse=True)
    return ranked[:n]
    
    
def get_player_stats(id_data, season, team_name=None):
    '''
    Build one row per listed player per match with shot/goal/assist counts
    and top-ranking flags.

    Parameter:
    id_data - DataFrame with the columns 'id', 'home' (0 selects the 'h' side,
              anything else the 'a' side) and 'datetime'; one row per match
    season - season used for the league/team ranking lookups
    team_name - team title used for the team-level rankings; when None the
                notebook-level `team` variable is used

    Return DataFrame with the columns

    id, datetime, home - copied from the matching row of id_data
    name - player name
    goal, assit, shot - counts derived from the match's shot list
    rank_goal - 1.0 if the player is in the league top 15 for goals, else 0.0
    rank_team_goal - 1.0 if the player is in the team top 5 for goals, else 0.0
    rank_assist - 1.0 if the player is in the league top 15 for assists, else 0.0
    rank_team_assist - 1.0 if the player is in the team top 5 for assists, else 0.0
    status - 'h' or 'a', the side the rows were taken from

    When id_data has no rows, an empty frame with the columns above except
    'status' is returned and no request is made.
    '''
    col_names = ['id', 'datetime', 'home', 'name', 'goal', 'assit', 'shot', 'rank_goal', 'rank_team_goal', 'rank_assist', 'rank_team_assist']

    if id_data.shape[0] == 0:
        return pd.DataFrame(columns=col_names)

    if team_name is None:
        # Fall back to the notebook-level setting. The original reused the name
        # `team` for the per-match player list, so a list of player names was
        # sent as the team filter.
        team_name = team

    league_top_n, team_top_n = 15, 5
    # The rankings depend only on season/team, so fetch them once rather than
    # once per match. Each column is paired with its own list explicitly.
    ranked_names = {
        'rank_goal': get_stats_dict('epl', season, 'goals', league_top_n, False),
        'rank_team_goal': get_stats_dict('epl', season, 'goals', team_top_n, True, team_name),
        'rank_assist': get_stats_dict('epl', season, 'assists', league_top_n, False),
        'rank_team_assist': get_stats_dict('epl', season, 'assists', team_top_n, True, team_name),
    }

    frames = []
    # Iterate over values rather than .loc[i] so any index on id_data works.
    for match_id, home, match_time in zip(id_data['id'], id_data['home'], id_data['datetime']):
        print(match_id)
        status = 'h' if home == 0 else 'a'
        player_data = get_player_data(match_id)[status]
        match_data = get_playershot_data(match_id)[status]
        squad = [entry['player'] for entry in player_data.values()]

        df = pd.DataFrame(np.zeros((len(squad), len(col_names))), columns=col_names)
        df['id'] = match_id
        df['datetime'] = match_time
        df['home'] = home
        df['status'] = status
        df['name'] = squad

        for shot in match_data:
            df.loc[df['name'] == shot['player'], 'shot'] += 1
            if shot['result'] == 'Goal':
                df.loc[df['name'] == shot['player'], 'goal'] += 1
                # An assister that matches no listed name leaves every row unchanged.
                df.loc[df['name'] == shot['player_assisted'], 'assit'] += 1

        # Flag by player name; testing the zero-filled cell value against the
        # name lists could never match.
        for col, names in ranked_names.items():
            df[col] = df['name'].isin(names).astype(float)

        frames.append(df)

    # Concatenate once; ignore_index gives a unique row index.
    return pd.concat(frames, axis=0, ignore_index=True)



    
    




#### Example

In [34]:
data = get_match_result(team, 2018)

In [38]:
df = Constrcut_df_from_results(data)

In [41]:
df.head()

Unnamed: 0,id,oppenet,opponent_id,home,result,datetime,xG,num_of_score_goals,num_of_conced_goals
0,9200,Huddersfield,219,1,w,2018-08-11 17:00:00,2.31427,3,0
1,9212,Arsenal,83,0,w,2018-08-18 19:30:00,1.71799,3,2
2,9225,Newcastle United,86,1,w,2018-08-26 18:00:00,1.49669,2,1
3,9231,Bournemouth,73,0,w,2018-09-01 14:00:00,1.16408,2,0
4,9241,Cardiff,227,0,w,2018-09-15 14:00:00,2.67145,4,1


In [48]:
df['id']

0     9200
1     9212
2     9225
3     9231
4     9241
5     9255
6     9264
7     9275
8     9277
9     9295
10    9305
11    9314
12    9323
13    9334
14    9339
15    9352
16    9366
17    9373
18    9377
19    9394
20    9403
21    9413
22    9426
23    9430
24    9443
25    9454
26    9474
27    9482
28    9494
29    9503
30    9514
31    9462
32    9525
33    9531
34    9543
35    9550
36    9563
37    9572
Name: id, dtype: object

In [49]:
# Adds each match's shot count for the side given by 'home'. num_of_shots calls
# get_playershot_data once per row, so this cell fetches shot data once per match.
df['shots_attempt'] = df.apply(lambda x: num_of_shots(x['id'], x['home']), axis = 1)

In [52]:
df

Unnamed: 0,id,oppenet,opponent_id,home,result,datetime,xG,num_of_score_goals,num_of_conced_goals,shots_attempt
0,9200,Huddersfield,219,1,w,2018-08-11 17:00:00,2.31427,3,0,13
1,9212,Arsenal,83,0,w,2018-08-18 19:30:00,1.71799,3,2,24
2,9225,Newcastle United,86,1,w,2018-08-26 18:00:00,1.49669,2,1,15
3,9231,Bournemouth,73,0,w,2018-09-01 14:00:00,1.16408,2,0,24
4,9241,Cardiff,227,0,w,2018-09-15 14:00:00,2.67145,4,1,18
5,9255,West Ham,81,1,d,2018-09-23 12:30:00,1.88344,0,0,17
6,9264,Liverpool,87,0,d,2018-09-29 16:30:00,1.77638,1,1,10
7,9275,Southampton,74,1,w,2018-10-07 13:15:00,3.54699,3,0,21
8,9277,Manchester United,89,0,d,2018-10-20 11:30:00,2.21163,2,2,21
9,9295,Burnley,92,1,w,2018-10-28 12:30:00,3.01621,4,0,24


In [53]:
df.to_csv('../../../data/Chelsea_2018.csv', index = None)

In [106]:
# NOTE(review): `df` was built from get_match_result(team, 2018), but the rankings
# are requested for season=2017 - confirm that previous-season ranks are intended.
player_df = get_player_stats(df, season=2017)

9200
9212
9225
9231
9241
9255
9264
9275
9277
9295
9305
9314
9323
9334
9339
9352
9366
9373
9377
9394
9403
9413
9426
9430
9443
9454
9474
9482
9494
9503
9514
9462
9525
9531
9543
9550
9563
9572


In [59]:
# get_goal_rank takes find_team as its third argument; False requests the
# league-wide player list without a team filter.
rank = get_goal_rank('epl', 2017, False)
rank

[{'id': '1250',
  'player_name': 'Mohamed Salah',
  'games': '36',
  'time': '2954',
  'goals': '32',
  'xG': '25.136502970010042',
  'assists': '10',
  'xA': '8.344477602280676',
  'shots': '143',
  'key_passes': '62',
  'yellow_cards': '1',
  'red_cards': '0',
  'position': 'F M S',
  'team_title': 'Liverpool',
  'npg': '31',
  'npxG': '23.6141653098166',
  'xGChain': '35.301465447992086',
  'xGBuildup': '5.898578152060509'},
 {'id': '647',
  'player_name': 'Harry Kane',
  'games': '37',
  'time': '3094',
  'goals': '30',
  'xG': '26.859890587627888',
  'assists': '2',
  'xA': '3.8204412199556828',
  'shots': '183',
  'key_passes': '34',
  'yellow_cards': '5',
  'red_cards': '0',
  'position': 'F S',
  'team_title': 'Tottenham',
  'npg': '28',
  'npxG': '24.576384104788303',
  'xGChain': '28.51526607386768',
  'xGBuildup': '7.9616343677043915'},
 {'id': '619',
  'player_name': 'Sergio Agüero',
  'games': '25',
  'time': '1985',
  'goals': '21',
  'xG': '18.56861797720194',
  'assists

In [60]:
!jupyter nbconvert --to html baseline.ipynb  

[NbConvertApp] Converting notebook baseline.ipynb to html
[NbConvertApp] Writing 352634 bytes to baseline.html


In [103]:
player_df.loc[:, 'id'].unique().shape

(38,)

In [110]:
player_df.to_csv('../../../data/Chelsea_player_2018.csv', index = None)