In [1]:
import datetime
import requests
from scrapy.http import TextResponse

In [2]:
import re

def text_cleaner(text):
    return [item.strip() for item in text if not re.match('^(\r\n|\s)+$', item)]

In [3]:
def money(value):
    
    quantity, multiplier = re.search('([\d\.]+)(M|K)?', value).groups()
    quantity = float(quantity)
    
    if multiplier:
        multiplier = {'M':1000000, 'K':1000}[multiplier]
    else:
        multiplier = 1
    
    return int(quantity * multiplier)

## Teams


In [4]:
#r = requests.get('https://sofifa.com/team/112791')
#response = TextResponse(r.url, body=r.text, encoding='utf-8')

In [5]:
#team = {}
#team_basic(response, team)
#team_stats(response, team)
#team_lineup(response, team)
#team_info(response, team)
#team

In [6]:
def team_basic(response, team):
    raw   = response.css('div.info').xpath('.//text()').extract()
    clean = text_cleaner(raw)

    team['Name'], team['Team ID'] = re.match('([\w\s\.\-\'\(\)\&]+) \(ID: (\d+)\)', clean[0]).groups()
    team['League'] = clean[1]
    team['Nationality'] = response.css('div.info').xpath('.//a[contains(@href, "teams?na=")]/@title').extract_first()

In [7]:
def team_lineup(response, team):
    team['Lineup'] = []

    for position in response.xpath('.//div[@class="field-player"]'):
        pos = {}
 
        y, x = position.xpath('./@style').re('top:([\d\.\-]+)px;left:([\d\.\-]+)px;')
        
        pos['Player ID']   = position.xpath('./a/@href').re_first('\d+')
        pos['y'], pos['x'] = float(y), float(x)
        
        rows = position.xpath('.//span[@class="nowrap row"]')

        if len(rows) == 0:
            pos['Position'] = 'GK'
        else:
            pos['Instructions'] = {}

            for row in rows:
                instruction, value = text_cleaner(row.xpath('.//text()').extract())

                if re.search('instructions', value, re.IGNORECASE):
                    pos['Position'] = instruction
                else:
                    pos['Instructions'][instruction] = value.lower()

        team['Lineup'].append(pos)

In [8]:
def team_stats(response, team):
    extract = lambda stat: response.xpath('//span[contains(.., $concept)]/text()', concept=stat).extract_first()

    stats = ['Overall', 'Attack', 'Midfield', 'Defence']

    for stat in stats:
        team[stat] = int(extract(stat))

In [9]:
def team_info(response, team):
     
    raw = response.xpath('//li[contains(label, "Home stadium")]/text()').extract()
    team['Home stadium'] = text_cleaner(raw)[0]
    
    raw = response.xpath('//li[contains(label, "Transfer budget")]/text()').extract()
    
    if raw:
        team['Transfer budget €'] = money(text_cleaner(raw)[0])

    for concept in ['Starting 11 average age', 'Whole team average age']:
        raw = response.xpath('//li[contains(label, $concept)]/text()', concept=concept).extract()
        team[concept] = float(text_cleaner(raw)[0])
        
    for concept in ['International prestige', 'Domestic prestige']:
        raw = response.xpath('//li[contains(label, $concept)]/span/text()', concept=concept).extract_first()
        if raw:
            team[concept] = int(raw)
        
    for concept in ['Rival team', 'Captain', 'Short free kick', 'Long free kick', 'Penalties', 'Left corner', 'Right corner']:
        team[concept + ' ID'] = response.xpath('//li[contains(label, $concept)]/a/@href', concept=concept).re_first('\d+')

## Players

In [10]:
#r = requests.get('https://sofifa.com/player/232171')
#response = TextResponse(r.url, body=r.text, encoding='utf-8')

In [11]:
#player = {}
#basic(response, player)
#stats(response, player)
#body(response, player)
#composed_attributes(response, player)
#teams(response, player)
#attributes(response, player)
#traits_and_specialities(response, player)
#player

### Header — Basic info

In [12]:
def basic(response, player):
    
    # 'Known as'
    # 'Player ID'
    # 'Name'
    # 'Nationality' 
    # 'Positions'
    # 'Birth date'
    # 'Height [cm]'
    # 'Weight [kg]'

    raw   = response.css('div.info').xpath('.//text()').extract()
    clean = text_cleaner(raw)

    player['Known as'], player['Player ID'] = re.match('([\w\s\.\-\']+) \(ID: (\d+)\)', clean[0]).groups()
    player['Name'] = clean[1]
    player['Positions'] = {}
    
    for position in clean[2:-1]:
        player['Positions'][position] = True

    match = re.match('Age (\d+) \((\w{3} \d{1,2}, \d{4})\) (\d{3})cm (\d{2,3})kg', clean[-1]) 
    
    player['Age']        = int(match.group(1))
    player['Birth date'] = datetime.datetime.strptime(match.group(2), '%b %d, %Y').strftime('%Y-%m-%d')
    player['Height cm']  = int(match.group(3))
    player['Weight kg']  = int(match.group(4))
    
    player['Nationality'] = response.css('div.info').xpath('.//a[contains(@href, "players?na=")]/@title').extract_first()

### Subheader — Stats

In [13]:
def stats(response, player):
    
    # 'Overall rating'
    # 'Potential'
    # 'Value'
    # 'Wage'
    
    extract = lambda stat: response.xpath('//span[contains(.., $concept)]/text()', concept=stat).extract_first()
       
    player['Overall rating'] = int(  extract('Overall rating'))
    player['Potential']      = int(  extract('Potential'))
    player['Value €']        = money(extract('Value'))
    player['Wage €']         = money(extract('Wage'))

### First column — Body

In [14]:
def body(response, player):
    
    extract = lambda concept: response.xpath('//li[contains(., $concept)]', concept=concept).xpath('(./span|.)/text()').extract()
        
    for concept in ['International reputation', 'Weak foot', 'Skill moves']:
        raw = extract(concept)
        
        if len(raw) == 0: continue
            
        player[concept] = int(text_cleaner(raw)[0])
    
    
    for concept in ['Body type', 'Preferred foot']:
        raw = extract(concept)
        
        if len(raw) == 0: continue
            
        player[concept] = text_cleaner(raw)[0]
        
        
    raw = extract('Work rate')
    
    if len(raw) > 0:
        work_rate = {}
        work_rate['Attacking'], work_rate['Defensive'] = text_cleaner(raw)[0].split(' / ')
        player['Work rate'] = work_rate
    
    
    raw = extract('Release clause')
    
    if len(raw) > 0:
        player['Release clause €'] = money(text_cleaner(raw)[0])

### Second column — Composed attributes

In [15]:
def composed_attributes(response, player):
    raw = response.xpath('//script[contains(text(), "PHY")]/text()').extract_first()
    
    if not raw: return
    
    player['Composed attributes'] = {}
    
    for attribute, value in re.findall('point(\w{3}) = (\d{1,3})', raw):
        player['Composed attributes'][attribute] = int(value)

### Third and fourth columns — Teams

In [16]:
def teams(response, player):
    
    concepts = [
        'Position',
        'Jersey number',
        'Joined',
        'Loaned from',
        'Contract valid until',
    ]
    
    player['Teams'] = {}
    
    for team_column in response.xpath('//a[contains(@href, "team/")]/ancestor::ul'):

        data = {}
        data['Team ID'] = team_column.xpath('.//@href[contains(., "team/")]').re_first('\d+')

        for concept in concepts:
            raw = team_column.xpath('.//li[contains(label, $concept)]', concept=concept) \
                  .xpath('./span | .').xpath('./text() | a/@href').extract()

            if len(raw) == 0: continue

            clean = text_cleaner(raw)[0]

            if   concept == 'Joined':
                data[concept] = datetime.datetime.strptime(clean, '%b %d, %Y').strftime('%Y-%m-%d')
            elif concept == 'Loaned from':
                data[concept] = re.search('\d+', clean).group(0)
            elif concept == 'Contract valid until':
                # this is because in a few cases the date is like "Dec 31, 2017" instead of "2017"
                data[concept] = clean[-4:]
            else:
                data[concept] = clean
            
        if 'Contract valid until' in data: # team type is "club"
            player['Teams']['Club'] = data
        else: # team type is "national"
            player['Teams']['National'] = data

### 5th to 7th columns — Attributes

In [17]:
def attributes(response, player):

    attributes = {
      'Attacking': [
        'Crossing',
        'Finishing',
        'Heading accuracy',
        'Short passing',
        'Volleys',
      ],
      'Skill': [
         'Dribbling',
         'Curve',
         'Free kick accuracy',
         'Long passing',
         'Ball control',
      ],
      'Movement': [
        'Acceleration',
        'Sprint speed',
        'Agility',
        'Reactions',
        'Balance',
      ],
      'Power': [
        'Shot power',
        'Jumping',
        'Stamina',
        'Strength',
        'Long shots',    
      ],
      'Mentality': [
        'Aggression',
        'Interceptions',
        'Positioning',
        'Vision',
        'Penalties',
        'Composure',
      ],
      'Defending': [
        'Marking',
        'Standing tackle',
        'Sliding tackle',
      ],
      'Goalkeeping': [
        'GK diving',
        'GK handling',
        'GK kicking',
        'GK positioning',
        'GK reflexes',
      ],
    }
    
    player['Attributes'] = {}

    for group in attributes:
        player['Attributes'][group] = {}
        
        for attribute in attributes[group]:
            player['Attributes'][group][attribute] = \
                int(response.xpath('//span[contains(.., $concept)]/text()', concept=attribute).extract_first())

### 8th column — Traits and specialities

In [18]:
def traits_and_specialities(response, player):

    for group in ['Traits', 'Specialities']:
        skills = response.xpath('//div[contains(h5, $concept)]/ul/li/text()', concept=group).extract()
        
        if len(skills) == 0: continue
        
        player[group] = {}
        
        for skill in skills:
            player[group][skill] = True

## Crawler

In [19]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

from scrapy.crawler import CrawlerProcess

import logging
import datetime
import scrapy

In [20]:
class PlayerSpider(scrapy.Spider):
    name = 'Player'
    
    start_urls = [
        'https://sofifa.com/players?units=mks&currency=EUR',
    ]
    
    custom_settings = {
        'LOG_LEVEL':   logging.WARNING,
        'FEED_FORMAT': 'json',
        'FEED_URI':    'players_158904.json'
    }
    
    def parse(self, response):
        # follow links to player pages
        for href in response.xpath('//a[contains(@href, "player/")]/@href'):
            yield response.follow(href, self.parse_player)

        # follow pagination links
        for href in response.xpath('//a[contains(@href, "offset=") and contains(text(), "Next")]/@href'):
            yield response.follow(href, self.parse)      

            
    def parse_player(self, response):
        player = {}

        basic(response, player)
        stats(response, player)
        body(response, player)
        teams(response, player)
        attributes(response, player)
        traits_and_specialities(response, player)

        yield player

In [21]:
class TeamSpider(scrapy.Spider):
    name = 'Team'
    
    start_urls = [
        'https://sofifa.com/teams/national?units=mks&currency=EUR',
        'https://sofifa.com/teams/club?units=mks&currency=EUR'
    ]
    
    custom_settings = {
        'LOG_LEVEL':   logging.WARNING,
        'FEED_FORMAT': 'json',
        'FEED_URI':    'teams_158904.json'
    }
    
    def parse(self, response):
        # follow links to player pages
        for href in response.xpath('//a[contains(@href, "team/")]/@href'):
            yield response.follow(href, self.parse_team)

        # follow pagination links
        for href in response.xpath('//a[contains(@href, "offset=") and contains(text(), "Next")]/@href'):
            yield response.follow(href, self.parse)      

            
    def parse_team(self, response):
        team = {}

        team_basic(response, team)
        team_stats(response, team)
        team_lineup(response, team)
        team_info(response, team)

        yield team

In [22]:
process = CrawlerProcess({
    'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'
})

process.crawl(PlayerSpider)
process.start()

2017-11-12 19:59:33 [scrapy.utils.log] INFO: Scrapy 1.4.0 started (bot: scrapybot)
2017-11-12 19:59:33 [scrapy.utils.log] INFO: Overridden settings: {'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'}


<Deferred at 0x7f1d3759f278>

In [23]:
import pandas as pd
dfjson = pd.read_json('players_158904.json')
dfjson

Unnamed: 0,Age,Attributes,Birth date,Body type,Height cm,International reputation,Known as,Name,Nationality,Overall rating,...,Release clause €,Skill moves,Specialities,Teams,Traits,Value €,Wage €,Weak foot,Weight kg,Work rate
0,32,"{'Movement': {'Acceleration': 89, 'Sprint spee...",1985-02-05,C. Ronaldo,185,5,Cristiano Ronaldo,C. Ronaldo dos Santos Aveiro,Portugal,94,...,195800000.0,5,,"{'Club': {'Position': 'LS', 'Jersey number': '...","{'Flair': True, 'Speedster': True, 'Skilled dr...",95500000,565000,4,80,"{'Defensive': 'Low', 'Attacking': 'High'}"
1,28,"{'Movement': {'Acceleration': 62, 'Sprint spee...",1988-12-16,Normal,191,4,M. Hummels,Mats Hummels,Germany,88,...,79200000.0,2,,"{'Club': {'Position': 'LCB', 'Jersey number': ...","{'Playmaker': True, 'Long passer': True, 'Tack...",48000000,215000,3,92,"{'Defensive': 'Medium', 'Attacking': 'High'}"
2,31,"{'Movement': {'Acceleration': 64, 'Sprint spee...",1986-02-16,Lean,186,3,D. Godín,Diego Godín,Uruguay,88,...,82000000.0,2,,"{'Club': {'Position': 'LCB', 'Jersey number': ...","{'Power header': True, 'Aerial threat': True, ...",40000000,125000,3,78,"{'Defensive': 'High', 'Attacking': 'Medium'}"
3,35,"{'Movement': {'Acceleration': 63, 'Sprint spee...",1981-10-03,Normal,195,5,Z. Ibrahimović,Zlatan Ibrahimović,Sweden,88,...,50000000.0,4,,"{'Club': {'Position': 'SUB', 'Jersey number': ...","{'Flair': True, 'Technical dribbler': True, 'B...",27000000,240000,4,95,"{'Defensive': 'Low', 'Attacking': 'Medium'}"
4,24,"{'Movement': {'Acceleration': 69, 'Sprint spee...",1992-11-05,Normal,165,3,M. Verratti,Marco Verratti,Italy,87,...,121300000.0,3,,"{'Club': {'Position': 'RCM', 'Jersey number': ...","{'Playmaker': True, 'Acrobat': True, 'Tacticia...",63000000,130000,4,60,"{'Defensive': 'High', 'Attacking': 'Medium'}"
5,31,"{'Movement': {'Acceleration': 72, 'Sprint spee...",1986-01-08,Normal,173,4,David Silva,David Josué Jiménez Silva,Spain,88,...,96200000.0,4,,"{'Club': {'Position': 'LCM', 'Jersey number': ...","{'Flair': True, 'Playmaker': True, 'Avoids usi...",52000000,285000,2,67,"{'Defensive': 'Low', 'Attacking': 'High'}"
6,26,"{'Movement': {'Acceleration': 82, 'Sprint spee...",1991-03-29,Normal,168,3,N. Kanté,N'Golo Kanté,France,87,...,101100000.0,3,"{'Tackling ': True, 'Tactician ': True}","{'Club': {'Position': 'RCM', 'Jersey number': ...",,52500000,190000,3,68,"{'Defensive': 'High', 'Attacking': 'Medium'}"
7,33,"{'Movement': {'Acceleration': 87, 'Sprint spee...",1984-01-23,Normal,180,4,A. Robben,Arjen Robben,Netherlands,88,...,59400000.0,4,,"{'Club': {'Position': 'RM', 'Jersey number': '...","{'Chip shot': True, 'Avoids using weaker foot'...",36000000,225000,2,80,"{'Defensive': 'Low', 'Attacking': 'High'}"
8,32,"{'Movement': {'Acceleration': 39, 'Sprint spee...",1984-07-14,Normal,193,3,S. Handanovič,Samir Handanovič,Slovenia,88,...,57800000.0,1,,"{'Club': {'Position': 'GK', 'Jersey number': '...","{'1-on-1 rush': True, 'Puncher': True, 'GK lon...",34000000,120000,2,92,"{'Defensive': 'Medium', 'Attacking': 'Medium'}"
9,30,"{'Movement': {'Acceleration': 65, 'Sprint spee...",1986-12-26,Lean,188,4,H. Lloris,Hugo Lloris,France,88,...,72200000.0,1,,"{'Club': {'Position': 'GK', 'Jersey number': '...","{'Comes for crosses': True, 'Avoids using weak...",38000000,165000,1,82,"{'Defensive': 'Medium', 'Attacking': 'Medium'}"
