In [1]:
import datetime
import requests
from scrapy.http import TextResponse

In [2]:
import re

def text_cleaner(text):
    """Strip every string in ``text`` and drop the blank ones.

    Parameters
    ----------
    text : iterable of str
        Raw text fragments, e.g. the list returned by
        ``selector.xpath('.//text()').extract()``.

    Returns
    -------
    list of str
        The fragments that contain something other than whitespace, each
        stripped of leading/trailing whitespace, in their original order.
    """
    # A fragment is blank exactly when its stripped form is empty, so no
    # regular expression is needed.  Unlike the previous pattern-based
    # filter this also discards '' itself, which used to slip through.
    stripped = (item.strip() for item in text)
    return [item for item in stripped if item]

In [3]:
def money(value):
    """Convert an abbreviated amount such as ``'€1.5M'`` to an integer.

    The first run of digits/dots found in ``value`` is taken as the quantity;
    an immediately following ``M`` or ``K`` scales it by 1,000,000 or 1,000.
    Any fractional remainder after scaling is truncated.

    Parameters
    ----------
    value : str
        Text containing the amount.

    Returns
    -------
    int
        The amount expressed in whole units.

    Raises
    ------
    ValueError
        If ``value`` contains no parseable number.
    """
    # Imported locally so this cell does not depend on another import cell.
    from decimal import Decimal, InvalidOperation

    match = re.search(r'([\d\.]+)(M|K)?', value)
    if match is None:
        raise ValueError('no numeric amount found in %r' % (value,))

    digits, suffix = match.groups()

    try:
        # Decimal keeps the scaling exact.  With binary floats
        # 1.005 * 1000 == 1004.9999999999999, which int() truncates to 1004.
        quantity = Decimal(digits)
    except InvalidOperation:
        raise ValueError('malformed numeric amount in %r' % (value,))

    multiplier = {'M': 1000000, 'K': 1000}.get(suffix, 1)

    return int(quantity * multiplier)

## Teams


In [4]:
#r = requests.get('https://sofifa.com/team/112791')
#response = TextResponse(r.url, body=r.text, encoding='utf-8')

In [5]:
#team = {}
#team_basic(response, team)
#team_stats(response, team)
#team_lineup(response, team)
#team_info(response, team)
#team

In [6]:
def team_basic(response, team):
    """Fill ``team`` with the header fields of a team page.

    Reads the text of the ``div.info`` element and stores 'Name', 'Team ID',
    'League' and 'Nationality' in ``team``.

    Parameters
    ----------
    response : object exposing Scrapy's ``css``/``xpath`` selector API
        The team page.
    team : dict
        Updated in place.

    Raises
    ------
    ValueError
        If the first text fragment is not of the form ``<name> (ID: <digits>)``.
    """
    info = response.css('div.info')
    clean = text_cleaner(info.xpath('.//text()').extract())

    # Accept any characters in the name: a fixed whitelist fails on names
    # containing characters outside it (e.g. '/').
    # DOTALL keeps newlines inside the name acceptable, as the old whitelist
    # (which included whitespace) did.
    header = re.match(r'(.+) \(ID: (\d+)\)', clean[0], re.DOTALL)
    if header is None:
        raise ValueError('unexpected team header: %r' % (clean[0],))

    team['Name'], team['Team ID'] = header.groups()
    team['League'] = clean[1]
    team['Nationality'] = info.xpath('.//a[contains(@href, "teams?na=")]/@title').extract_first()

In [7]:
import operator
    
def team_lineup(response, team):
    """Store the team's lineup under ``team['Lineup']``.

    Every ``div.field-player`` element becomes one entry holding the player's
    ID, the coordinates parsed from the element's inline ``top``/``left``
    style and, for players that have instruction rows, their position and
    instructions.  Entries are ordered by descending ``(y, x)`` and keyed
    ``'01'``, ``'02'``, ...

    Parameters
    ----------
    response : object exposing Scrapy's ``xpath`` selector API
        The team page.
    team : dict
        Updated in place.
    """
    lineup = []

    for position in response.xpath('.//div[@class="field-player"]'):
        pos = {}

        # Raw strings keep the regex escapes away from Python's own
        # string-escape processing.
        y, x = position.xpath('./@style').re(r'top:([\d\.\-]+)px;left:([\d\.\-]+)px;')

        pos['Player ID'] = position.xpath('./a/@href').re_first(r'\d+')
        pos['y'], pos['x'] = float(y), float(x)

        rows = position.xpath('.//span[@class="nowrap row"]')

        if not rows:
            # A player without any rows is recorded as the goalkeeper.
            pos['Position'] = 'GK'
        else:
            pos['Instructions'] = {}

            for row in rows:
                instruction, value = text_cleaner(row.xpath('.//text()').extract())

                # The row whose second fragment mentions "instructions" names
                # the position in its first fragment; every other row is an
                # instruction/value pair.
                if re.search('instructions', value, re.IGNORECASE):
                    pos['Position'] = instruction
                else:
                    pos['Instructions'][instruction] = value.lower()

        lineup.append(pos)

    lineup.sort(key=operator.itemgetter('y', 'x'), reverse=True)

    team['Lineup'] = {'%02d' % number: pos for number, pos in enumerate(lineup, start=1)}

In [8]:
def team_stats(response, team):
    """Copy the four headline team ratings from the page into ``team``.

    For each of 'Overall', 'Attack', 'Midfield' and 'Defence' the text of the
    first ``<span>`` whose parent contains that label is converted to ``int``
    and stored under the label.

    Parameters
    ----------
    response : object exposing Scrapy's ``xpath`` selector API
        The team page.
    team : dict
        Updated in place.
    """
    query = '//span[contains(.., $concept)]/text()'

    for label in ('Overall', 'Attack', 'Midfield', 'Defence'):
        rating = response.xpath(query, concept=label).extract_first()
        team[label] = int(rating)

In [9]:
def team_info(response, team):
    """Fill ``team`` with the entries of the team page's info list.

    Stores, when present on the page: 'Home stadium', 'Transfer budget EUR',
    the two average ages and the two prestige values.  The ID fields
    ('Rival team ID', 'Captain ID', the set-piece taker IDs) are always
    stored and are ``None`` when no link is found.

    Parameters
    ----------
    response : object exposing Scrapy's ``xpath`` selector API
        The team page.
    team : dict
        Updated in place.
    """
    def first_text(query, **variables):
        """Return the first non-blank text fragment for ``query``, or None."""
        fragments = text_cleaner(response.xpath(query, **variables).extract())
        return fragments[0] if fragments else None

    # Every text field is optional: a missing entry is skipped rather than
    # raising IndexError, the same way budget and prestige were already
    # treated.
    stadium = first_text('//li[contains(label, "Home stadium")]/text()')
    if stadium is not None:
        team['Home stadium'] = stadium

    budget = first_text('//li[contains(label, "Transfer budget")]/text()')
    if budget is not None:
        team['Transfer budget EUR'] = money(budget)

    for concept in ['Starting 11 average age', 'Whole team average age']:
        age = first_text('//li[contains(label, $concept)]/text()', concept=concept)
        if age is not None:
            team[concept] = float(age)

    for concept in ['International prestige', 'Domestic prestige']:
        raw = response.xpath('//li[contains(label, $concept)]/span/text()', concept=concept).extract_first()
        if raw:
            team[concept] = int(raw)

    for concept in ['Rival team', 'Captain', 'Short free kick', 'Long free kick', 'Penalties', 'Left corner', 'Right corner']:
        # The ID is the first run of digits in the linked href.
        team[concept + ' ID'] = response.xpath('//li[contains(label, $concept)]/a/@href', concept=concept).re_first(r'\d+')

## Players

In [10]:
#r = requests.get('https://sofifa.com/player/190871')
#response = TextResponse(r.url, body=r.text, encoding='utf-8')

In [11]:
#player = {}
#basic(response, player)
#stats(response, player)
#body(response, player)
#composed_attributes(response, player)
#teams(response, player)
#attributes(response, player)
#traits_and_specialities(response, player)
#player

### Header — Basic info

In [12]:
def basic(response, player):
    """Fill ``player`` with the header fields of a player page.

    Keys written, in this order:

    - 'Known as'
    - 'Player ID'
    - 'Name'
    - 'Positions'   (dict mapping each listed position to True)
    - 'Age'
    - 'Birth date'  (formatted as YYYY-MM-DD)
    - 'Height cm'
    - 'Weight kg'
    - 'Nationality'

    Parameters
    ----------
    response : object exposing Scrapy's ``css``/``xpath`` selector API
        The player page.
    player : dict
        Updated in place.

    Raises
    ------
    ValueError
        If the first or the last text fragment of ``div.info`` does not have
        the expected layout.
    """
    info = response.css('div.info')
    clean = text_cleaner(info.xpath('.//text()').extract())

    # Any characters are accepted in the display name; a fixed whitelist
    # fails on names containing characters outside it.
    header = re.match(r'(.+) \(ID: (\d+)\)', clean[0], re.DOTALL)
    if header is None:
        raise ValueError('unexpected player header: %r' % (clean[0],))

    player['Known as'], player['Player ID'] = header.groups()
    player['Name'] = clean[1]

    # Everything between the name and the final details line is a position.
    player['Positions'] = {position: True for position in clean[2:-1]}

    # The pattern expects height in cm and weight in kg.
    details = re.match(r'Age (\d+) \((\w{3} \d{1,2}, \d{4})\) (\d{3})cm (\d{2,3})kg', clean[-1])
    if details is None:
        raise ValueError('unexpected player details: %r' % (clean[-1],))

    age, born, height, weight = details.groups()

    player['Age']        = int(age)
    player['Birth date'] = datetime.datetime.strptime(born, '%b %d, %Y').strftime('%Y-%m-%d')
    player['Height cm']  = int(height)
    player['Weight kg']  = int(weight)

    player['Nationality'] = info.xpath('.//a[contains(@href, "players?na=")]/@title').extract_first()

### Subheader — Stats

In [13]:
def stats(response, player):
    """Fill ``player`` with the sub-header figures of a player page.

    Keys written, in this order:

    - 'Overall rating' (int)
    - 'Potential'      (int)
    - 'Value EUR'      (int, parsed by ``money`` from the 'Value' entry)
    - 'Wage EUR'       (int, parsed by ``money`` from the 'Wage' entry)

    Parameters
    ----------
    response : object exposing Scrapy's ``xpath`` selector API
        The player page.
    player : dict
        Updated in place.
    """
    def first_span_text(label):
        """Text of the first <span> whose parent contains ``label``."""
        return response.xpath('//span[contains(.., $concept)]/text()', concept=label).extract_first()

    fields = (
        ('Overall rating', 'Overall rating', int),
        ('Potential',      'Potential',      int),
        ('Value EUR',      'Value',          money),
        ('Wage EUR',       'Wage',           money),
    )

    for key, label, convert in fields:
        player[key] = convert(first_span_text(label))

### First column — Body

In [14]:
def body(response, player):
    """Fill ``player`` with the first-column entries of a player page.

    Each entry is looked up through the ``<li>`` containing its label and is
    skipped when nothing is found.  Keys written when present:
    'International reputation', 'Weak foot', 'Skill moves' (ints),
    'Body type', 'Preferred foot' (strings), 'Work rate' (dict with
    'Attacking' and 'Defensive') and 'Release clause EUR' (int).

    Parameters
    ----------
    response : object exposing Scrapy's ``xpath`` selector API
        The player page.
    player : dict
        Updated in place.
    """
    def fragments_for(label):
        """Raw text fragments of the <li> (and its spans) containing ``label``."""
        item = response.xpath('//li[contains(., $concept)]', concept=label)
        return item.xpath('(./span|.)/text()').extract()

    numeric = ('International reputation', 'Weak foot', 'Skill moves')
    textual = ('Body type', 'Preferred foot')

    for label in numeric + textual:
        found = fragments_for(label)
        if not found:
            continue

        first = text_cleaner(found)[0]
        player[label] = int(first) if label in numeric else first

    found = fragments_for('Work rate')
    if found:
        # The entry reads "<attacking> / <defensive>".
        attacking, defensive = text_cleaner(found)[0].split(' / ')
        player['Work rate'] = {'Attacking': attacking, 'Defensive': defensive}

    found = fragments_for('Release clause')
    if found:
        player['Release clause EUR'] = money(text_cleaner(found)[0])

### Second column — Composed attributes

In [15]:
def composed_attributes(response, player):
    """Store the composed attributes found in the page's inline script.

    Looks for the ``<script>`` whose text contains "PHY" and collects every
    ``point<XXX> = <number>`` assignment in it into
    ``player['Composed attributes']`` as ``{XXX: int}``.  Nothing is written
    when no such script exists.

    Parameters
    ----------
    response : object exposing Scrapy's ``xpath`` selector API
        The player page.
    player : dict
        Updated in place.
    """
    raw = response.xpath('//script[contains(text(), "PHY")]/text()').extract_first()

    if not raw:
        return

    # Raw string: the backslashes belong to the regex, not to a Python
    # string escape.
    player['Composed attributes'] = {
        attribute: int(value)
        for attribute, value in re.findall(r'point(\w{3}) = (\d{1,3})', raw)
    }

### Third and fourth columns — Teams

In [16]:
def teams(response, player):
    """Store the player's club and national-team details in ``player['Teams']``.

    Every ``<ul>`` that is an ancestor of a link to a team page is read as
    one team column.  A column that has a 'Contract valid until' entry is
    stored under 'Club', any other column under 'National'; a later column
    of the same kind overwrites an earlier one.

    Parameters
    ----------
    response : object exposing Scrapy's ``xpath`` selector API
        The player page.
    player : dict
        Updated in place.
    """
    concepts = [
        'Position',
        'Jersey number',
        'Joined',
        'Loaned from',
        'Contract valid until',
    ]

    player['Teams'] = {}

    for team_column in response.xpath('//a[contains(@href, "team/")]/ancestor::ul'):

        data = {}
        data['Team ID'] = team_column.xpath('.//@href[contains(., "team/")]').re_first(r'\d+')

        for concept in concepts:
            raw = (
                team_column
                .xpath('.//li[contains(label, $concept)]', concept=concept)
                .xpath('./span | .')
                .xpath('./text() | a/@href')
                .extract()
            )

            if not raw:
                continue

            clean = text_cleaner(raw)[0]

            if concept == 'Joined':
                data[concept] = datetime.datetime.strptime(clean, '%b %d, %Y').strftime('%Y-%m-%d')
            elif concept == 'Loaned from':
                # Keep only the first run of digits of the extracted value
                # as the lending team's ID.
                data[concept] = re.search(r'\d+', clean).group(0)
            elif concept == 'Contract valid until':
                # this is because in a few cases the date is like "Dec 31, 2017" instead of "2017"
                data[concept] = clean[-4:]
            else:
                data[concept] = clean

        if 'Contract valid until' in data:  # team type is "club"
            player['Teams']['Club'] = data
        else:  # team type is "national"
            player['Teams']['National'] = data

### 5th to 7th columns — Attributes

In [17]:
def attributes(response, player):
    """Store the player's individual attribute ratings, grouped by category.

    ``player['Attributes']`` becomes ``{group: {attribute: int}}``.  Each
    rating is the text of the first ``<span>`` whose parent contains the
    attribute's name.

    Parameters
    ----------
    response : object exposing Scrapy's ``xpath`` selector API
        The player page.
    player : dict
        Updated in place.
    """
    layout = (
        ('Attacking',   ('Crossing', 'Finishing', 'Heading accuracy', 'Short passing', 'Volleys')),
        ('Skill',       ('Dribbling', 'Curve', 'Free kick accuracy', 'Long passing', 'Ball control')),
        ('Movement',    ('Acceleration', 'Sprint speed', 'Agility', 'Reactions', 'Balance')),
        ('Power',       ('Shot power', 'Jumping', 'Stamina', 'Strength', 'Long shots')),
        ('Mentality',   ('Aggression', 'Interceptions', 'Positioning', 'Vision', 'Penalties', 'Composure')),
        ('Defending',   ('Marking', 'Standing tackle', 'Sliding tackle')),
        ('Goalkeeping', ('GK diving', 'GK handling', 'GK kicking', 'GK positioning', 'GK reflexes')),
    )
    query = '//span[contains(.., $concept)]/text()'

    ratings = player['Attributes'] = {}

    for group, names in layout:
        ratings[group] = {}

        for name in names:
            ratings[group][name] = int(response.xpath(query, concept=name).extract_first())

### 8th column — Traits and specialities

In [18]:
def traits_and_specialities(response, player):
    """Store the player's traits and specialities as ``{name: True}`` dicts.

    For each of the 'Traits' and 'Specialities' headings, the items of the
    first ``<ul>`` following the matching ``<h5>`` are stripped and stored
    under ``player[heading]``.  A heading with no items is not written.

    Parameters
    ----------
    response : object exposing Scrapy's ``xpath`` selector API
        The player page.
    player : dict
        Updated in place.
    """
    for heading in ('Traits', 'Specialities'):
        items = response.xpath('//h5[text()=$concept]/following::ul[1]/li/text()', concept=heading).extract()

        if items:
            player[heading] = {item.strip(): True for item in items}

## Crawler

In [19]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

from scrapy.crawler import CrawlerProcess

import logging
import datetime
import scrapy

In [20]:
# Output paths used as FEED_URI by PlayerSpider and TeamSpider respectively.
PLAYERS_FILE = 'players_158904.json'
TEAMS_FILE = 'teams_158904.json'

In [21]:
class PlayerSpider(scrapy.Spider):
    """Walk the player listings and emit one dict per player page.

    Items are exported as JSON to ``PLAYERS_FILE``.
    """

    name = 'Player'

    start_urls = [
        'https://sofifa.com/players?units=mks&currency=EUR',
        'https://sofifa.com/players/free?units=mks&currency=EUR',
    ]

    custom_settings = {
        'LOG_LEVEL': logging.WARNING,
        'FEED_FORMAT': 'json',
        'FEED_URI': PLAYERS_FILE,
    }

    def parse(self, response):
        """Schedule every player page linked from a listing, then the next listing page."""
        player_links = '//a[contains(@href, "player/")]/@href'
        next_page_links = '//a[contains(@href, "offset=") and contains(text(), "Next")]/@href'

        for link in response.xpath(player_links):
            yield response.follow(link, callback=self.parse_player)

        for link in response.xpath(next_page_links):
            yield response.follow(link, callback=self.parse)

    def parse_player(self, response):
        """Build the player dict by running each extractor over the page."""
        player = {}

        extractors = (basic, stats, body, teams, attributes, traits_and_specialities)
        for extractor in extractors:
            extractor(response, player)

        yield player

In [22]:
class TeamSpider(scrapy.Spider):
    """Walk the national and club team listings and emit one dict per team page.

    Items are exported as JSON to ``TEAMS_FILE``.
    """

    name = 'Team'

    start_urls = [
        'https://sofifa.com/teams/national?units=mks&currency=EUR',
        'https://sofifa.com/teams/club?units=mks&currency=EUR',
    ]

    custom_settings = {
        'LOG_LEVEL': logging.WARNING,
        'FEED_FORMAT': 'json',
        'FEED_URI': TEAMS_FILE,
    }

    def parse(self, response):
        """Schedule every team page linked from a listing, then the next listing page."""
        team_links = '//a[contains(@href, "team/")]/@href'
        next_page_links = '//a[contains(@href, "offset=") and contains(text(), "Next")]/@href'

        for link in response.xpath(team_links):
            yield response.follow(link, callback=self.parse_team)

        for link in response.xpath(next_page_links):
            yield response.follow(link, callback=self.parse)

    def parse_team(self, response):
        """Build the team dict by running each extractor over the page."""
        team = {}

        extractors = (team_basic, team_stats, team_lineup, team_info)
        for extractor in extractors:
            extractor(response, team)

        yield team

In [23]:
import os

#if os.path.exists(PLAYERS_FILE):
#    os.remove(PLAYERS_FILE)

# Delete the previous teams feed, if any, before crawling again.
# Attempting the removal directly avoids the window between an existence
# check and the delete in which the file could disappear.
try:
    os.remove(TEAMS_FILE)
except FileNotFoundError:
    pass

In [24]:
# Create the crawler process; the only setting passed in is the USER_AGENT
# string below.
process = CrawlerProcess({
    'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'
})

# Only TeamSpider is scheduled in this cell; PlayerSpider is defined above but
# not crawled here.
process.crawl(TeamSpider)
# NOTE(review): process.start() presumably blocks until the crawl finishes and
# cannot be called a second time in the same kernel — confirm against the
# Scrapy documentation.
process.start()

2017-11-14 18:08:33 [scrapy.utils.log] INFO: Scrapy 1.4.0 started (bot: scrapybot)
2017-11-14 18:08:33 [scrapy.utils.log] INFO: Overridden settings: {'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'}


<Deferred at 0x7ff118ad9518>

In [25]:
import pandas as pd
# Load the feed written to TEAMS_FILE and preview only the first rows
# instead of rendering the whole frame.
dfjson = pd.read_json(TEAMS_FILE)
dfjson.head(10)

Unnamed: 0,Attack,Captain ID,Defence,Domestic prestige,Home stadium,International prestige,League,Left corner ID,Lineup,Long free kick ID,...,Nationality,Overall,Penalties ID,Right corner ID,Rival team ID,Short free kick ID,Starting 11 average age,Team ID,Transfer budget EUR,Whole team average age
0,84,155862,86,,Santiago Bernabéu,9,International,193747.0,"{'03': {'Player ID': '155862', 'x': 109.5, 'y'...",193747.0,...,Spain,86,168542,168542.0,1343,193747.0,27.82,1362,,27.61
1,76,187489,75,,Sanderson Park (FIFA 13 NEW generic),5,International,201884.0,"{'03': {'Player ID': '183129', 'x': 109.5, 'y'...",201884.0,...,Republic of Ireland,75,201884,201884.0,1318,201884.0,27.27,1355,,29.17
2,82,163824,76,,Ivy Lane,4,International,186561.0,"{'03': {'Player ID': '163824', 'x': 183.0, 'y'...",186561.0,...,Wales,75,173731,186561.0,1318,173731.0,27.82,1367,,26.48
3,77,148119,75,,Waldstadion (Fussballstadion),6,International,147621.0,"{'03': {'Player ID': '229773', 'x': 183.0, 'y'...",187366.0,...,Russia,76,187491,147621.0,1353,147621.0,27.91,1357,,28.00
4,75,155887,74,,CenturyLink Field,6,International,227796.0,"{'03': {'Player ID': '204082', 'x': 109.5, 'y'...",155887.0,...,United States,75,176237,227796.0,1386,176237.0,26.91,1387,,27.48
5,75,188768,77,,Aloha Park (Generic Oceania 2),6,International,215798.0,"{'03': {'Player ID': '221641', 'x': 109.5, 'y'...",188768.0,...,Ivory Coast,76,197853,226060.0,1395,188768.0,23.82,111112,,24.96
6,81,181459,79,,Stadion Neder,6,International,203980.0,"{'03': {'Player ID': '192774', 'x': 109.5, 'y'...",202151.0,...,Greece,77,183483,203980.0,1365,203980.0,26.82,1338,,26.65
7,75,179645,75,,Waldstadion (Fussballstadion),5,International,190460.0,"{'03': {'Player ID': '202849', 'x': 109.5, 'y'...",190460.0,...,Denmark,78,201943,190460.0,1363,190460.0,26.27,1331,,26.04
8,78,189606,77,,Stadion Olympik,4,International,197445.0,"{'03': {'Player ID': '201922', 'x': 93.75, 'y'...",197445.0,...,Austria,77,184200,197445.0,1337,197445.0,26.09,1322,,25.35
9,78,171897,75,,Estadio Azteca,7,International,226045.0,"{'03': {'Player ID': '173432', 'x': 272.25, 'y...",171897.0,...,Mexico,78,178224,226045.0,1387,171897.0,28.27,1386,,27.17
