In [None]:
import os
import datetime as dt
import glob
import numpy as np
from bs4 import BeautifulSoup
from lxml import html
import lxml
import re
import time
import pandas as pd
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support.ui import WebDriverWait  # available since 2.4.0

def set_download_path(my_path):
    """Return ``my_path``, creating it as a directory if nothing exists there.

    Missing parent directories are created as well. When something already
    exists at ``my_path`` the filesystem is left untouched.
    """
    if os.path.exists(my_path):
        return my_path

    os.makedirs(my_path, exist_ok=True)
    return my_path

def save_source_code(html_source, path, filename):
    """Write ``html_source`` to ``path + '/' + filename`` as UTF-8 bytes.

    Args:
        html_source: Text to save; it is encoded with ``.encode('utf-8')``.
        path: Directory to write into (joined to ``filename`` with ``'/'``).
        filename: Name of the file to create or overwrite.
    """
    # The context manager closes the handle even if encoding or writing
    # raises, which the previous manual open()/close() pair did not.
    with open(path + '/' + filename, 'wb') as out_file:
        out_file.write(html_source.encode('utf-8'))

def find_hrefs_by_regex(bs, reg_exp):
    """Return the result of ``bs.find_all`` for ``"a"`` tags filtered by href.

    ``reg_exp`` is compiled with ``re.compile`` and passed as the ``href``
    filter, so ``bs`` is expected to be a parsed document (or any object)
    offering a compatible ``find_all`` method.
    """
    href_pattern = re.compile(reg_exp)
    return bs.find_all("a", href=href_pattern)

def create_absolute_urls(base_url, rel_links):
    """Prefix each link's ``href`` with ``base_url`` and drop duplicates.

    Args:
        base_url: String prepended verbatim to every ``href`` (plain
            concatenation; no URL normalisation is performed).
        rel_links: Iterable of objects exposing an ``attrs`` mapping.
            Entries whose ``attrs`` has no ``'href'`` key are skipped.

    Returns:
        A list of the unique resulting URLs, in first-seen order.
    """
    # De-duplicate with a seen-set instead of list(set(...)) so the output
    # order is deterministic rather than dependent on string hashing.
    seen = set()
    links = []
    for link in rel_links:
        if 'href' not in link.attrs:
            continue
        url = base_url + link.attrs['href']
        if url not in seen:
            seen.add(url)
            links.append(url)
    return links

def fix_avg_age(input_str):
    """Re-insert a decimal point before the last character of the input.

    The argument is converted with ``str()``, a ``'.'`` is placed in front
    of its final character, and the result is parsed as a ``float``
    (e.g. ``253`` becomes ``25.3``).
    """
    digits = str(input_str)
    whole_part, tenths = digits[:-1], digits[-1:]
    return float('.'.join((whole_part, tenths)))

# Directory holding the saved squad-list pages; set_download_path creates it
# when it does not exist yet.
download_path = set_download_path(
    '/Users/matthewmurray/ds/metis/metisgh/premier_league/transfermarkt/pages/squad_lists'
)

# Glob pattern for the saved pages, and the files currently matching it.
file_path = '/'.join((download_path, '*.html'))
file_list = glob.glob(file_path)

# Accumulator for the per-file result tuples built further down.
data_list = []


In [None]:
# Build one (team, season, avg_time) record per saved squad-list page.
for html_path in file_list:
    # The slicing below implies file names shaped like
    # "<4-digit start year>-<team-slug>.html" -- TODO confirm with the scraper.
    base = os.path.basename(html_path)
    file_name = os.path.splitext(base)[0]

    # save_source_code in this file writes UTF-8, so decode explicitly
    # instead of relying on the platform default encoding. The handle gets
    # its own name (it used to shadow the loop variable) and is closed
    # before any parsing starts.
    with open(html_path, "r", encoding="utf-8") as page_file:
        page = page_file.read()

    bs = BeautifulSoup(page, "lxml")
    bs_tables = bs.find('table', class_='items')
    df = pd.read_html(str(bs_tables))[0]

    # Season label such as "2015-16": start year plus the two-digit
    # following year, wrapping 99 -> "00" and zero-padding single digits.
    season_1 = file_name[:4]
    season_2 = '%02d' % ((int(file_name[2:4]) + 1) % 100)
    season_name = season_1 + '-' + season_2

    team_name = file_name[5:].replace('-', ' ').title()

    # The values read from the 'Market value' column are treated as joining
    # dates from here on (presumably the parsed headers are shifted against
    # the data -- verify against a saved page). Entries equal to '-' become
    # 'None' and are dropped, exactly as before.
    data = df['Market value'].dropna()
    data = data.map(lambda x: x.replace('-', 'None'))
    data = data[~data.isin(['None'])]

    df = pd.DataFrame(data)
    df.columns = ['DATE_JOINED']
    # Reference date: 1 May of the year after the season's start year.
    df['SEASON_END_DATE'] = dt.datetime(int(season_1) + 1, 5, 1)
    df['DATE_JOINED'] = pd.to_datetime(df['DATE_JOINED'])
    df['DAYS_AT_TEAM'] = df['SEASON_END_DATE'] - df['DATE_JOINED']
    avg_days = df['DAYS_AT_TEAM'].mean()

    if pd.isnull(avg_days):
        # No usable dates on this page: record a missing value rather than
        # failing on (or mis-converting) NaT.
        avg_time = np.nan
    else:
        # Timedelta / timedelta64 yields a scalar (a plain float on current
        # pandas), so truncate with int() instead of calling .astype().
        whole_days = int(avg_days / np.timedelta64(1, 'D'))
        ans = whole_days / 365
        avg_time = float("%.1f" % ans)

    # create tuple of (team, season, avg_time)
    data_list.append((team_name, season_name, avg_time))

In [None]:
# Build the result frame from the collected (team, season, avg_time) tuples.
# Naming the columns here means an empty data_list still yields a frame with
# the three expected columns instead of a zero-column frame, on which a later
# three-name column assignment would raise a length mismatch.
cleaned_df = pd.DataFrame(data_list, columns=['TEAM', 'SEASON', 'AVG_TIME'])

In [None]:
# Label the frame's three columns, in the order of the tuples appended to
# data_list: team name, season label, average time value.
cleaned_df.columns = ['TEAM','SEASON','AVG_TIME']

In [None]:
# Export the result without the row index. The relative 'data' directory is
# created first so the write does not fail when it is missing, and the index
# flag uses the documented boolean form instead of relying on None being falsy.
os.makedirs('data', exist_ok=True)
cleaned_df.to_csv('data/team_continuity.csv', index=False)