In [1]:
import requests
import pandas as pd
import numpy as np
from msal import ConfidentialClientApplication
from dotenv import load_dotenv
import os

In [2]:
# Lift pandas' display truncation: None means "no limit" for both columns and rows.
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, None)

## Steps to perform data cleaning
1. Get the players dimension data from the Player API
2. Get the team data from the Team API for connecting to player data via team code, clean the data to get only the team name and the team code
3. Get the player position dimension from the element types API
4. Loop through the player history data API endpoints, edit the data type of the date column and transform to YearMonth, YearQuarter

### 1. Get the players dim data

In [3]:
# Download the "bootstrap-static" payload; the following cells read its
# 'elements', 'teams' and 'element_types' entries.
url = 'https://fantasy.premierleague.com/api/bootstrap-static/'
# A timeout keeps the notebook from hanging forever on a stalled connection.
r = requests.get(url, timeout=30)
# Fail loudly on an HTTP error instead of trying to parse an error page as JSON.
r.raise_for_status()
# NOTE(review): the name `json` shadows the standard-library module name; it is kept
# because the cells below index into `json[...]`.
json = r.json()
json.keys()

dict_keys(['events', 'game_settings', 'phases', 'teams', 'total_players', 'elements', 'element_stats', 'element_types'])

In [4]:
# Player table built from the 'elements' entry of the bootstrap payload.
elements_df = pd.DataFrame(data=json['elements'])

In [5]:
# Columns of the player table that are kept for the export.
player_columns = [
    'code', 'first_name', 'form', 'id', 'now_cost', 'points_per_game',
    'second_name', 'team', 'team_code', 'total_points', 'web_name', 'minutes',
    'goals_scored', 'assists', 'clean_sheets', 'goals_conceded', 'own_goals',
    'penalties_saved', 'penalties_missed', 'yellow_cards', 'red_cards',
    'saves', 'influence', 'creativity', 'threat', 'element_type',
]
# .copy() turns the selection into an independent frame, so adding columns to it
# afterwards does not raise SettingWithCopyWarning.
elements_df = elements_df[player_columns].copy()

In [6]:
# Build "<first_name> <second_name>" with one vectorised string concatenation rather
# than a Python-level join per row, and attach it with assign(), which returns a new
# frame instead of writing into a column selection (no SettingWithCopyWarning).
elements_df = elements_df.assign(
    Full_name=elements_df['first_name'].str.cat(elements_df['second_name'], sep=' ')
)

### 2. Get the team data for connecting to player data via team code, get only the team name and the team code

In [7]:
# Team table built from the 'teams' entry of the bootstrap payload.
teams_df = pd.DataFrame(data=json['teams'])

In [8]:
# Reduce the team table to its id and the two name columns.
team_columns = ['id', 'name', 'short_name']
teams_df = teams_df.loc[:, team_columns]

### 3. Get the player position data from elements type API

In [9]:
# Position table built from the 'element_types' entry of the bootstrap payload.
element_types_df = pd.DataFrame(data=json['element_types'])

### 4. Loop through the player history data API with a for loop over the player ids, edit the data type of the date column and transform to YearMonth, YearQuarter

In [10]:
# Highest player id present in the players table; shown below as the cell output.
player_ids = elements_df['id']
max_player_id = player_ids.max()
max_player_id

770

In [11]:
# Download the per-fixture history of every player and combine it into one frame.
history_frames = []      # one DataFrame per player, concatenated once after the loop
failed_player_ids = []   # ids whose request or payload could not be used

# Iterate over the ids that actually exist in elements_df: range(1, max_player_id)
# leaves out the highest id and also requests ids that are not in the players table.
with requests.Session() as session:  # reuse one connection for all requests
    for player_id in elements_df['id']:
        summary_url = 'https://fantasy.premierleague.com/api/element-summary/' + str(player_id)
        try:
            summary_response = session.get(summary_url, timeout=30)
            summary_response.raise_for_status()
            # Local name on purpose: the bootstrap payload stored in `json` must stay
            # intact so that the earlier cells can be re-run.
            player_summary = summary_response.json()
            player_history_df = pd.DataFrame(player_summary['history'])
        except (requests.RequestException, ValueError, KeyError):
            # Best effort: skip this player, but remember the failure instead of hiding it.
            failed_player_ids.append(player_id)
            continue
        if not player_history_df.empty:
            history_frames.append(player_history_df)

# A single concat avoids re-copying the growing frame on every iteration.
all_player_history_df = (
    pd.concat(history_frames, ignore_index=True) if history_frames else pd.DataFrame()
)

if failed_player_ids:
    print("Skipped player ids (request or parsing failed):", failed_player_ids)

# Guard against the case where nothing could be downloaded (no 'element' column).
if 'element' in all_player_history_df.columns:
    n = len(pd.unique(all_player_history_df['element']))
else:
    n = 0
print("No.of.unique player id:", 
      n)

No.of.unique player id: 767


In [None]:
# Work on an independent (deep) copy so the downloaded history frame stays untouched.
all_player_history_df_processed = all_player_history_df.copy(deep=True)

In [None]:
# Keep only the first 10 characters of each kickoff_time string.
kickoff_text = all_player_history_df_processed['kickoff_time']
all_player_history_df_processed['kickoff_time'] = kickoff_text.str.slice(0, 10)

In [None]:
# Parse kickoff_time into datetimes and derive the month and quarter labels from it.
kickoff = pd.to_datetime(all_player_history_df_processed['kickoff_time'])
all_player_history_df_processed['kickoff_time'] = kickoff
all_player_history_df_processed['game_month'] = kickoff.dt.strftime('%Y%m')
# '%q' is a Period.strftime directive, not a datetime one: on datetimes it is left
# unexpanded or rejected depending on the platform. Convert to quarterly periods
# first so the label renders as e.g. '2023Q3'.
all_player_history_df_processed['game_quarter'] = kickoff.dt.to_period('Q').dt.strftime('%YQ%q')

In [None]:
# Columns of the history table that are kept for the export, in output order.
history_columns = [
    'element', 'fixture', 'kickoff_time', 'round', 'minutes',
    'goals_scored', 'assists', 'clean_sheets', 'goals_conceded',
    'own_goals', 'penalties_saved', 'penalties_missed', 'yellow_cards',
    'red_cards', 'saves', 'influence', 'creativity',
    'threat', 'game_month', 'game_quarter',
]
all_player_history_df_processed = all_player_history_df_processed.loc[:, history_columns]

### Export all dataframes to OneDrive for visualization in Power BI


In [None]:
# Frames to export, keyed by the name each one is uploaded under.
dfs = dict(
    dim_player_details=elements_df,
    dim_team=teams_df,
    dim_position=element_types_df,
    fact_player_history=all_player_history_df_processed,
)

In [None]:
# Upload every frame in `dfs` to OneDrive as "<key>.csv" through Microsoft Graph.
load_dotenv()
client_id = os.getenv('client_id')
client_secret = os.getenv('client_secret')
tenant_id = os.getenv('tenant_id')
# Kept for backward compatibility: nothing below reads this value, the MSAL client
# is configured through `authority` instead.
token_endpoint = f'https://login.microsoftonline.com/{tenant_id}/oauth2/token'

# os.getenv returns None for unset variables; stop early with a readable message.
missing_settings = [
    setting_name
    for setting_name, setting_value in (
        ('client_id', client_id),
        ('client_secret', client_secret),
        ('tenant_id', tenant_id),
    )
    if not setting_value
]
if missing_settings:
    raise RuntimeError('Missing environment variables: ' + ', '.join(missing_settings))

# Acquire the token once: neither the app nor the token depends on the file uploaded.
app = ConfidentialClientApplication(
    client_id,
    authority=f'https://login.microsoftonline.com/{tenant_id}',
    client_credential=client_secret,
)
token_response = app.acquire_token_for_client(scopes=['https://graph.microsoft.com/.default'])
if 'access_token' not in token_response:
    # Surface the failed token response instead of a bare KeyError on 'access_token'.
    raise RuntimeError(
        'Could not acquire a Microsoft Graph token: '
        + str(token_response.get('error_description', token_response))
    )
headers = {'Authorization': 'Bearer ' + token_response['access_token']}

# NOTE(review): acquire_token_for_client yields an app-only token, while `/me` refers
# to a signed-in user — confirm this endpoint works for this app registration.
for key in dfs:
    graph_api_endpoint = f'https://graph.microsoft.com/v1.0/me/drive/root:/Premier League Football API data/{key}.csv:/content'
    # Encode explicitly: a str body is sent as Latin-1 and fails on characters outside it.
    csv_data = dfs[key].to_csv(index=False).encode('utf-8')

    # Upload to OneDrive
    response = requests.put(graph_api_endpoint, headers=headers, data=csv_data, timeout=60)

    # Graph answers 201 when the file is created and 200 when an existing file is replaced.
    if response.status_code in (200, 201):
        print('File uploaded successfully!')
    else:
        print('Error uploading file. Status code:', response.status_code)
        print(response.text)

### 5. List of Pending items
- Remove redundant columns from all_player_history_df - done
- Change fixture date to time series and add new time columns - done
- Export all dataframes to CSV and upload to personal OneDrive for connection to Power BI (Get Data from Web) - done