In [1]:
# load packages
import io
import os
from collections import defaultdict

import networkx as nx
import pandas as pd
import requests
from lxml import html

In [2]:
# scrape links to event pages
# The completed-events listing is requested page by page (?page=1, 2, ...).
# Scraping stops at the first page that yields no event links.
event_links = []
with requests.Session() as session:
    i = 1
    while True:
        url = 'http://ufcstats.com/statistics/events/completed?page={}'.format(i)
        # A timeout keeps a stalled connection from hanging the notebook forever.
        r = session.get(url, timeout=30)
        # Fail loudly on HTTP errors: an error page contains no event links and
        # would otherwise be mistaken for the end of the listing, silently
        # truncating event_links.
        r.raise_for_status()
        tree = html.fromstring(r.content)
        links = tree.xpath('//tr[@class="b-statistics__table-row"]/td/i/a/@href')
        if not links:
            break
        event_links.extend(links)
        i += 1

In [6]:
# scrape individual fights
# For every event page, read the first HTML table (one row per fight), attach
# the two fighter names and the event's date / location / title, and keep the
# per-event table. The tables are concatenated once after the loop instead of
# re-copying the accumulated frame on every iteration.
event_tables = []
with requests.Session() as session:
    for url in event_links:
        # Bound the wait and fail loudly on HTTP errors rather than trying to
        # parse an error page.
        r = session.get(url, timeout=30)
        r.raise_for_status()
        tree = html.fromstring(r.content)

        # The XPath returns the names as one flat list with two consecutive
        # entries per fight: even positions -> fighter_0, odd -> fighter_1.
        fighters = [name.strip() for name in tree.xpath('//tr/td[2]/p/a/text()')]
        fighter_0 = fighters[0::2]
        fighter_1 = fighters[1::2]

        # Wrap the markup in StringIO: passing a literal HTML string to
        # read_html is deprecated in recent pandas releases.
        table = pd.read_html(io.StringIO(r.text))[0].drop(['Fighter'], axis = 1)

        # Event-level metadata, repeated on every fight row of this event.
        date = tree.xpath('//li[@class="b-list__box-list-item"][1]/text()')[1].strip()
        location = tree.xpath('//li[@class="b-list__box-list-item"][2]/text()')[1].strip()
        event = tree.xpath('//h2/span/text()')[0].strip()

        table['date'] = date
        table['location'] = location
        table['event'] = event

        # Raises ValueError if the number of names does not match the number
        # of table rows, so a misaligned page cannot slip through unnoticed.
        table['fighter_0'] = fighter_0
        table['fighter_1'] = fighter_1

        event_tables.append(table)

# pd.concat raises on an empty list, so fall back to an empty frame when no
# events were scraped.
fights = pd.concat(event_tables) if event_tables else pd.DataFrame()
    

In [7]:
# abbreviate weightclasses
# The concatenated per-event tables each carry their own row index; replace it
# with one continuous 0..n-1 index.
fights = fights.reset_index(drop = True)

# Full weight-class name -> short code.
weight_dict = {
    'Heavyweight' : 'HW',
    'Light Heavyweight' : 'LHW',
    'Middleweight' : 'MW',
    'Welterweight' : 'WW',
    'Lightweight' : 'LW',
    'Featherweight' : 'FW',
    'Bantamweight' : 'BW',
    'Flyweight' : 'FLW',
    "Women's Featherweight" : 'WFW',
    "Women's Bantamweight" : 'WBW',
    "Women's Flyweight" : 'WFLW',
    "Women's Strawweight" : 'WSW',
    'Catch Weight' : 'CW',
    'Super Heavyweight' : 'SHW',
    'Open Weight' : 'OW'
}

# Values that are already abbreviations are accepted and passed through, so
# re-running this cell on an already-converted frame no longer raises KeyError.
# Anything that is neither a known name nor a known abbreviation still raises
# KeyError, as before.
unknown_classes = set(fights['Weight class']) - set(weight_dict) - set(weight_dict.values())
if unknown_classes:
    raise KeyError('unmapped weight classes: {}'.format(sorted(unknown_classes, key = str)))

fights['Weight class'] = fights['Weight class'].map(lambda x : weight_dict.get(x, x))
fights

Unnamed: 0,W/L,Str,Td,Sub,Pass,Weight class,Method,Round,Time,date,location,event,fighter_0,fighter_1
0,win,68 25,1 0,0 0,3 0,HW,KO/TKO Punches,2,3:00,"May 16, 2020","Jacksonville, Florida, USA",UFC Fight Night: Overeem vs. Harris,Alistair Overeem,Walt Harris
1,win,84 90,1 0,0 0,1 0,WSW,S-DEC,3,5:00,"May 16, 2020","Jacksonville, Florida, USA",UFC Fight Night: Overeem vs. Harris,Claudia Gadelha,Angela Hill
2,win,79 80,1 0,0 0,0 0,FW,S-DEC,3,5:00,"May 16, 2020","Jacksonville, Florida, USA",UFC Fight Night: Overeem vs. Harris,Dan Ige,Edson Barboza
3,win,66 41,0 0,0 0,0 0,MW,U-DEC,3,5:00,"May 16, 2020","Jacksonville, Florida, USA",UFC Fight Night: Overeem vs. Harris,Krzysztof Jotko,Eryk Anders
4,win,101 92,0 2,0 0,0 1,FW,U-DEC,3,5:00,"May 16, 2020","Jacksonville, Florida, USA",UFC Fight Night: Overeem vs. Harris,Song Yadong,Marlon Vera
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5591,win,11 0,0 0,0 0,0 0,OW,KO/TKO,1,0:59,"November 12, 1993","Denver, Colorado, USA",UFC 1: The Beginning,Gerard Gordeau,Kevin Rosier
5592,win,1 4,1 0,2 0,0 0,OW,SUB Heel Hook,1,1:49,"November 12, 1993","Denver, Colorado, USA",UFC 1: The Beginning,Ken Shamrock,Patrick Smith
5593,win,0 0,1 0,0 0,2 0,OW,SUB Other,1,2:18,"November 12, 1993","Denver, Colorado, USA",UFC 1: The Beginning,Royce Gracie,Art Jimmerson
5594,win,15 12,0 0,0 0,0 0,OW,KO/TKO,1,4:20,"November 12, 1993","Denver, Colorado, USA",UFC 1: The Beginning,Kevin Rosier,Zane Frazier


In [8]:
# Persist the scraped fights to fights.csv in the current working directory.
# index=True is the pandas default, spelled out here: the row index is written
# as the first, unnamed column of the file.
fights.to_csv('fights.csv', index = True)