In [2]:
from datetime import datetime, date
# html을 파싱
from bs4 import BeautifulSoup
# http request를 보내고 http response 를 받기 위해 urllib
import urllib.request
# regular expression
import re
import os
import sys
from dateutil.parser import parse
import csv

In [3]:
def CRAWL_DATA():
    """Crawl every league/season transfer page and write one CSV per page.

    Seasons are visited from 2018 down to 2010 (inclusive); for each season
    every league slug in the list below is fetched from soccernews.com. A page
    that could not be opened (``crawl`` returned ``None``) is skipped;
    otherwise its rows go through ``filter_price`` and are written by
    ``make_csv`` into the ``csv/`` directory.
    """
    base_url = 'https://www.soccernews.com/soccer-transfers/'
    out_dir = 'csv/'
    extension = '.csv'
    leagues = (
        'english-premier-league-transfers',
        'spanish-la-liga-transfers',
        'italian-serie-a-transfers',
        'german-bundesliga-transfers',
        'rest-of-europe-transfers',
    )
    for season_start in range(2018, 2009, -1):
        season_end = season_start + 1
        # URL suffix has the form "-<start>-<end>", e.g. "-2018-2019".
        season_suffix = '-%d-%d' % (season_start, season_end)
        print('* crawl data for season ', season_start, ' ~ ', season_end)
        for league in leagues:
            print('* crawl data for league : ' + league)
            url = base_url + league + season_suffix + '/'
            print('* created url : %s' % url)
            make_working_directory(out_dir)
            rows = crawl(url)
            if rows is None:
                # The page could not be opened; nothing to write for it.
                continue
            # The league slug doubles as the CSV file-name stem.
            make_csv(out_dir, league, season_start, extension,
                     filter_price(rows), league)

In [4]:
def make_working_directory(dir_name):
    """Create ``dir_name`` if nothing exists at that path yet.

    Parameters:
        dir_name: path of the directory to create. Missing parent
            directories are created as well.

    Prints a '* create directory' message only when the path did not exist
    beforehand. Calling it again for an existing path is a silent no-op.
    """
    if not os.path.exists(dir_name):
        print('* create directory ', dir_name)
        # makedirs handles nested paths; exist_ok avoids a crash if the
        # directory appears between the check above and this call.
        os.makedirs(dir_name, exist_ok=True)

In [5]:
def crawl(url):
    """Fetch ``url`` and return the first HTML table as rows of six strings.

    Parameters:
        url: address of the page to download.

    Returns:
        ``None`` when the URL could not be opened; otherwise a list of rows,
        each a list of exactly six lower-cased strings taken in order from
        the text nodes of the first ``<table>`` on the page. The list is
        empty when the page has no table. A trailing group of fewer than six
        values is discarded.
    """
    try:
        source_from_url = urllib.request.urlopen(url)
    except Exception:
        # Any failure to open the URL is reported with this message, not only
        # an HTTP 404. ``Exception`` (rather than a bare except) lets
        # KeyboardInterrupt / SystemExit stop a long crawl.
        print('* 404 ERROR')
        return None
    print('* crawl from web ' + url)
    # Parse with the lxml parser, then release the HTTP connection.
    with source_from_url:
        soup = BeautifulSoup(source_from_url, 'lxml', from_encoding='utf-8')
    res = []
    # limit=1: the result set holds at most the first table of the page.
    for found in soup.find_all('table', limit=1):
        tmp = found.find_all(text=True)
        # Drop newline-only and single-space text nodes, and any node that is
        # exactly 'Jan'.
        # NOTE(review): the reason for excluding 'Jan' is not visible in this
        # code - confirm against the page markup.
        filtered = [
            item for item in tmp
            if item != '\n' and item != ' ' and item != 'Jan'
        ]
        # Remove the header: the first five remaining text nodes.
        filtered = filtered[5:]

        # Group the remaining values into complete rows of six columns;
        # an incomplete trailing group is ignored.
        complete = len(filtered) - len(filtered) % 6
        for start in range(0, complete, 6):
            res.append([item.lower() for item in filtered[start:start + 6]])
    return res

In [6]:
def filter_price(res):
    """Normalise the price column (index 5) of each transfer row.

    Parameters:
        res: iterable of rows (lists) whose element 5 is the price.

    Returns:
        A new list containing the same row objects. Rows are modified in
        place:
          * 'free' becomes the integer 0;
          * a price containing 'm' (for example '€8.2m') becomes an integer
            number of currency units (8200000), after stripping '€';
          * 'undisclosed', 'loan' and anything else are left untouched;
          * a price that is not a string (for example one already converted
            by an earlier call) is left untouched, so the function can be
            applied twice without error.

    A price that contains 'm' but cannot be parsed as a number is reported
    with a printed message and left unchanged.
    """
    data = []
    print('* filter price')
    for item in res:
        price = item[5]
        if isinstance(price, str):
            price = price.replace('€', '')
            if price == 'undisclosed' or price == 'loan':
                pass
            elif price == 'free':
                item[5] = 0
            elif 'm' in price:
                try:
                    millions = float(price.split('m')[0])
                    # round() before int(): binary floats can land just below
                    # the intended integer (8.2 * 1000000 is
                    # 8199999.999999999), and int() alone would truncate.
                    item[5] = int(round(millions * 1000000))
                except (ValueError, OverflowError):
                    print('* error converting : ', item[5])
        data.append(item)
    return data

In [7]:
def make_csv(dir_name, file_name, year, file_type, res, league):
    """Write transfer rows to ``<dir_name><file_name>-<year><file_type>``.

    Parameters:
        dir_name: directory prefix, concatenated as-is (it must already end
            with a path separator).
        file_name: file-name stem.
        year: value appended to the stem after a '-'.
        file_type: extension including the dot, e.g. '.csv'.
        res: iterable of rows; the first six elements of each row are written.
        league: not used by this function; kept for caller compatibility.

    The file is opened in write mode (an existing file is overwritten), as
    UTF-8, and starts with a fixed six-column header row.

    Raises:
        IndexError: if a row has fewer than six elements. The file is still
            closed in that case.
    """
    full_file_name = dir_name + file_name + '-' + str(year) + file_type
    print('* make file : ' + full_file_name)
    header = ['transfer date', 'player name', 'position', 'from', 'to', 'price']
    # The with-block guarantees the handle is closed even if a row fails.
    # newline='' leaves line-ending handling to the csv module.
    with open(full_file_name, 'w', encoding='utf-8', newline='') as f:
        wr = csv.writer(f)
        wr.writerow(header)
        for transfer in res:
            wr.writerow([transfer[col] for col in range(len(header))])
    print('* done writing to %s' % full_file_name)

In [8]:
def print_list(l):
    """Print the number of items in ``l``, then each item with its 1-based position."""
    total = len(l)
    print('%d items in list' % total)
    for position, entry in enumerate(l, start=1):
        print(position, ' > ', entry)

In [9]:
# Run the whole crawl: performs one HTTP request per league and season
# (2018 down to 2010) and writes the resulting CSV files under csv/.
CRAWL_DATA()

* crawl data for season  2018  ~  2019
* crawl data for league : english-premier-league-transfers
* created url : https://www.soccernews.com/soccer-transfers/english-premier-league-transfers-2018-2019/
* create directory  csv/
* crawl from web https://www.soccernews.com/soccer-transfers/english-premier-league-transfers-2018-2019/
* filter price
* make file : csv/english-premier-league-transfers-2018.csv
* done writing to csv/english-premier-league-transfers-2018.csv
* crawl data for league : spanish-la-liga-transfers
* created url : https://www.soccernews.com/soccer-transfers/spanish-la-liga-transfers-2018-2019/
* crawl from web https://www.soccernews.com/soccer-transfers/spanish-la-liga-transfers-2018-2019/
* filter price
* make file : csv/spanish-la-liga-transfers-2018.csv
* done writing to csv/spanish-la-liga-transfers-2018.csv
* crawl data for league : italian-serie-a-transfers
* created url : https://www.soccernews.com/soccer-transfers/italian-serie-a-transfers-2018-2019/
* crawl 

* crawl from web https://www.soccernews.com/soccer-transfers/spanish-la-liga-transfers-2014-2015/
* filter price
* make file : csv/spanish-la-liga-transfers-2014.csv
* done writing to csv/spanish-la-liga-transfers-2014.csv
* crawl data for league : italian-serie-a-transfers
* created url : https://www.soccernews.com/soccer-transfers/italian-serie-a-transfers-2014-2015/
* crawl from web https://www.soccernews.com/soccer-transfers/italian-serie-a-transfers-2014-2015/
* filter price
* make file : csv/italian-serie-a-transfers-2014.csv
* done writing to csv/italian-serie-a-transfers-2014.csv
* crawl data for league : german-bundesliga-transfers
* created url : https://www.soccernews.com/soccer-transfers/german-bundesliga-transfers-2014-2015/
* crawl from web https://www.soccernews.com/soccer-transfers/german-bundesliga-transfers-2014-2015/
* filter price
* make file : csv/german-bundesliga-transfers-2014.csv
* done writing to csv/german-bundesliga-transfers-2014.csv
* crawl data for league

* 404 ERROR
* crawl data for league : rest-of-europe-transfers
* created url : https://www.soccernews.com/soccer-transfers/rest-of-europe-transfers-2010-2011/
* crawl from web https://www.soccernews.com/soccer-transfers/rest-of-europe-transfers-2010-2011/
* filter price
* make file : csv/rest-of-europe-transfers-2010.csv
* done writing to csv/rest-of-europe-transfers-2010.csv
