Lucas Bouchard

# NCAA Tournament Project: Web Scraping, graphing, and networking 



In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
#import seaborn as sb
import requests
import networkx as nx
from bs4 import BeautifulSoup, NavigableString

# Getting the data

Tournament page data

In [2]:
raw = requests.get('http://www.sports-reference.com/cbb/postseason/2017-ncaa.html').text
soup = BeautifulSoup(raw, 'lxml')

Parsing out all the divisions.

In [104]:
east_soup = soup.find_all('div',{'id':'east'})
east_soup[0]
midwest_soup = soup.find_all('div',{'id':'midwest'})
south_soup = soup.find_all('div',{'id':'south'})
west_soup = soup.find_all('div',{'id':'west'})

In [105]:
east_soup[0].find_all('a')[5].text

'Wisconsin'

In [106]:
teams_href_list = list()
for link in east_soup[0].find_all('a'):
    if len(link.text) > 0:
        teams_href_list.append(link['href'])

teams_href_list        

['/cbb/schools/villanova/2017.html',
 '/cbb/boxscores/2017-03-16-villanova.html',
 '/cbb/schools/mount-st-marys/2017.html',
 '/cbb/boxscores/2017-03-16-villanova.html',
 '/cbb/boxscores/2017-03-16-villanova.html',
 '/cbb/schools/wisconsin/2017.html',
 '/cbb/boxscores/2017-03-16-wisconsin.html',
 '/cbb/schools/virginia-tech/2017.html',
 '/cbb/boxscores/2017-03-16-wisconsin.html',
 '/cbb/boxscores/2017-03-16-wisconsin.html',
 '/cbb/schools/virginia/2017.html',
 '/cbb/boxscores/2017-03-16-virginia.html',
 '/cbb/schools/north-carolina-wilmington/2017.html',
 '/cbb/boxscores/2017-03-16-virginia.html',
 '/cbb/boxscores/2017-03-16-virginia.html',
 '/cbb/schools/florida/2017.html',
 '/cbb/boxscores/2017-03-16-florida.html',
 '/cbb/schools/east-tennessee-state/2017.html',
 '/cbb/boxscores/2017-03-16-florida.html',
 '/cbb/boxscores/2017-03-16-florida.html',
 '/cbb/schools/southern-methodist/2017.html',
 '/cbb/boxscores/2017-03-17-southern-methodist.html',
 '/cbb/schools/southern-california/2017.

# Parsing the HTML to get all the teams in each division


In [107]:
def get_teams(division_soup):
    teams_href_list = list()
    for link in division_soup[0].find_all('a'):
        if len(link.text) > 0 and 'schools' in link['href']:
            teams_href_list.append(link)
            
    return teams_href_list


# Using 'soup' to put each division in list to see all teams in 2017 tournament. 



In [108]:
east_teams_2017 = get_teams(east_soup)
midwest_teams_2017 = get_teams(midwest_soup)
south_teams_2017 = get_teams(south_soup)
west_teams_2017 = get_teams(west_soup)
teams_2017 = east_teams_2017 + midwest_teams_2017 + south_teams_2017 + west_teams_2017
len(teams_2017)






124

In [109]:
cleaned_teams = list()
for team in teams_2017:
    if team not in cleaned_teams:
        cleaned_teams.append(team)
len(cleaned_teams)        

64

# Looping through 'teams_2017' to make 'tournament_teams' by applying the '.text' to each element which extracts the cleaned name of the team and appends it to new list. 



In [110]:
tournament_teams = list()
for team in teams_2017:
    if team.text not in tournament_teams:
        tournament_teams.append(team.text)
tournament_teams        
        
        
        

['Villanova',
 "Mount St. Mary's",
 'Wisconsin',
 'Virginia Tech',
 'Virginia',
 'North Carolina-Wilmington',
 'Florida',
 'ETSU',
 'SMU',
 'USC',
 'Baylor',
 'New Mexico State',
 'South Carolina',
 'Marquette',
 'Duke',
 'Troy',
 'Kansas',
 'UC-Davis',
 'Miami (FL)',
 'Michigan State',
 'Iowa State',
 'Nevada',
 'Purdue',
 'Vermont',
 'Creighton',
 'Rhode Island',
 'Oregon',
 'Iona',
 'Michigan',
 'Oklahoma State',
 'Louisville',
 'Jacksonville State',
 'UNC',
 'Texas Southern',
 'Arkansas',
 'Seton Hall',
 'Minnesota',
 'Middle Tennessee',
 'Butler',
 'Winthrop',
 'Cincinnati',
 'Kansas State',
 'UCLA',
 'Kent State',
 'Dayton',
 'Wichita State',
 'Kentucky',
 'Northern Kentucky',
 'Gonzaga',
 'South Dakota State',
 'Northwestern',
 'Vanderbilt',
 'Notre Dame',
 'Princeton',
 'West Virginia',
 'Bucknell',
 'Maryland',
 'Xavier',
 'Florida State',
 'Florida Gulf Coast',
 "Saint Mary's (CA)",
 'VCU',
 'Arizona',
 'North Dakota']

# Getting the `href` out of a single team in `teams_2017`.

In [111]:
teams_2017[0]['href']

'/cbb/schools/villanova/2017.html'

# Using the `replace` string function to update the `href' so we can get 2017-schedual

In [112]:
'http://www.sports-reference.com' + teams_2017[0]['href'].replace('2017','2017-schedule')

'http://www.sports-reference.com/cbb/schools/villanova/2017-schedule.html'

# Making a `_url` variable that combines the domain ("http://www.sports-reference.com/") and the `href` updated to contain "2017-schedule.html". Then used requests' `get` and `text` methods to get the raw HTML, saving it as `team_raw.` Madee `team_soup` out of `team_raw,` and then found specific table containing the schedule 



In [113]:
raw = requests.get('http://www.sports-reference.com/cbb/schools/virginia/2017-schedule.html').text
soup = BeautifulSoup(raw, 'lxml')

schedule_table = soup.find_all('table',{'id':'schedule','class':"sortable stats_table"})
row = schedule_table[0].find_all('tr')[1]
row
#.find_all('td')

<tr><th class="right " data-stat="g" scope="row">1</th><td class="left " csk="2016-11-11" data-stat="date_game"><a href="/cbb/boxscores/2016-11-11-north-carolina-greensboro.html">Fri, Nov 11, 2016</a></td><td class="left " data-stat="time_game">7:00 pm/est</td><td class="left " data-stat="network"></td><td class="left " data-stat="game_type">REG</td><td class="left " data-stat="game_location">@</td><td class="left " data-stat="opp_name"><a href="/cbb/schools/north-carolina-greensboro/2017.html">North Carolina-Greensboro</a></td><td class="left " data-stat="conf_abbr"><a href="/cbb/conferences/southern/2017.html" title="Southern Conference">Southern</a></td><td class="left " data-stat="game_result">W</td><td class="right " data-stat="pts">76</td><td class="right " data-stat="opp_pts">51</td><td class="center " data-stat="overtimes"></td><td class="right " data-stat="wins">1</td><td class="right " data-stat="losses">0</td><td class="left " csk="1" data-stat="game_streak">W 1</td><td clas

In [114]:
schedule_table[0].find_all('td',{'data-stat':'date_game'})[0].text

'Fri, Nov 11, 2016'

# Parsing the `schedule_table`



In [115]:
schedule_table[0].find_all('td',{'data-stat':'opp_name'})

[<td class="left " data-stat="opp_name"><a href="/cbb/schools/north-carolina-greensboro/2017.html">North Carolina-Greensboro</a></td>,
 <td class="left " data-stat="opp_name"><a href="/cbb/schools/st-francis-ny/2017.html">St. Francis (NY)</a></td>,
 <td class="left " data-stat="opp_name"><a href="/cbb/schools/yale/2017.html">Yale</a></td>,
 <td class="left " data-stat="opp_name"><a href="/cbb/schools/grambling/2017.html">Grambling</a></td>,
 <td class="left " data-stat="opp_name"><a href="/cbb/schools/iowa/2017.html">Iowa</a></td>,
 <td class="left " data-stat="opp_name"><a href="/cbb/schools/providence/2017.html">Providence</a></td>,
 <td class="left " data-stat="opp_name"><a href="/cbb/schools/ohio-state/2017.html">Ohio State</a></td>,
 <td class="left " data-stat="opp_name"><a href="/cbb/schools/west-virginia/2017.html">West Virginia</a> <span class="note">(25)</span></td>,
 <td class="left " data-stat="opp_name"><a href="/cbb/schools/east-carolina/2017.html">East Carolina</a></td>,

# Parsing out the opponents and saving list of values as `opponents`.

In [116]:
opponents = list()
for opp in schedule_table[0].find_all('td',{'data-stat':'opp_name'}):
    opponents.append(opp.text)
opponents    

['North Carolina-Greensboro',
 'St. Francis (NY)',
 'Yale',
 'Grambling',
 'Iowa',
 'Providence',
 'Ohio State',
 'West Virginia\xa0(25)',
 'East Carolina',
 'Robert Morris',
 'University of California',
 'Louisville\xa0(6)',
 'Florida State\xa0(20)',
 'Pittsburgh',
 'Wake Forest',
 'Clemson',
 'Boston College',
 'Georgia Tech',
 'Notre Dame\xa0(14)',
 'Villanova\xa0(1)',
 'Virginia Tech',
 'Syracuse',
 'Louisville\xa0(4)',
 'Virginia Tech',
 'Duke\xa0(12)',
 'North Carolina\xa0(10)',
 'Miami (FL)',
 'North Carolina State',
 'North Carolina\xa0(5)',
 'Pittsburgh',
 'Pittsburgh',
 'Notre Dame\xa0(22)',
 'North Carolina-Wilmington',
 'Florida\xa0(20)']

# Parsing out the results and saving list of values as `results`.

In [117]:
results = list()
for sult in schedule_table[0].find_all('td',{'data-stat':'game_result'}):
    results.append(sult.text)
results    

['W',
 'W',
 'W',
 'W',
 'W',
 'W',
 'W',
 'L',
 'W',
 'W',
 'W',
 'W',
 'L',
 'L',
 'W',
 'W',
 'W',
 'W',
 'W',
 'L',
 'W',
 'L',
 'W',
 'L',
 'L',
 'L',
 'L',
 'W',
 'W',
 'W',
 'W',
 'L',
 'W',
 'L']

# Parsing out the team score and saving list of values as `team_scores`.

In [118]:
team_score = list()
for point in schedule_table[0].find_all('td',{'data-stat':'pts'}):
    team_score.append(point.text)
team_score    

['76',
 '72',
 '62',
 '90',
 '74',
 '63',
 '63',
 '57',
 '76',
 '79',
 '56',
 '61',
 '58',
 '76',
 '79',
 '77',
 '71',
 '62',
 '71',
 '59',
 '71',
 '62',
 '71',
 '78',
 '55',
 '41',
 '48',
 '70',
 '53',
 '67',
 '75',
 '58',
 '76',
 '39']

# Parsing out the opponent's score and saving list of values as `opp_scores`.

In [119]:
opp_score = list()
for point in schedule_table[0].find_all('td',{'data-stat':'opp_pts'}):
    opp_score.append(point.text)
opp_score    

['51',
 '32',
 '38',
 '34',
 '41',
 '52',
 '61',
 '66',
 '53',
 '39',
 '52',
 '53',
 '60',
 '88',
 '62',
 '73',
 '54',
 '49',
 '54',
 '61',
 '48',
 '66',
 '55',
 '80',
 '65',
 '65',
 '54',
 '55',
 '43',
 '42',
 '63',
 '71',
 '71',
 '65']

In [120]:
list1 = ['a','b','c']
list2 = [0,1,2]
list3 = ['alpha','beta','gamma']

zip(list1,list2,list3)

<zip at 0x20fba73fdc8>

In [121]:
list(zip(list1,list2,list3))

[('a', 0, 'alpha'), ('b', 1, 'beta'), ('c', 2, 'gamma')]

In [122]:
list1 = opponents
list2 = results
list3 = team_score
list4 = opp_score

team_results=list(zip(list1,list2,list3,list4))
team_results

[('North Carolina-Greensboro', 'W', '76', '51'),
 ('St. Francis (NY)', 'W', '72', '32'),
 ('Yale', 'W', '62', '38'),
 ('Grambling', 'W', '90', '34'),
 ('Iowa', 'W', '74', '41'),
 ('Providence', 'W', '63', '52'),
 ('Ohio State', 'W', '63', '61'),
 ('West Virginia\xa0(25)', 'L', '57', '66'),
 ('East Carolina', 'W', '76', '53'),
 ('Robert Morris', 'W', '79', '39'),
 ('University of California', 'W', '56', '52'),
 ('Louisville\xa0(6)', 'W', '61', '53'),
 ('Florida State\xa0(20)', 'L', '58', '60'),
 ('Pittsburgh', 'L', '76', '88'),
 ('Wake Forest', 'W', '79', '62'),
 ('Clemson', 'W', '77', '73'),
 ('Boston College', 'W', '71', '54'),
 ('Georgia Tech', 'W', '62', '49'),
 ('Notre Dame\xa0(14)', 'W', '71', '54'),
 ('Villanova\xa0(1)', 'L', '59', '61'),
 ('Virginia Tech', 'W', '71', '48'),
 ('Syracuse', 'L', '62', '66'),
 ('Louisville\xa0(4)', 'W', '71', '55'),
 ('Virginia Tech', 'L', '78', '80'),
 ('Duke\xa0(12)', 'L', '55', '65'),
 ('North Carolina\xa0(10)', 'L', '41', '65'),
 ('Miami (FL)', 

# Crawling all of the teams' tables

Here I create a loop that goes through each team in `teams_2017` and does all of the steps above in each pass.

0. Before the loop starts, has an empty list `all_results`
1. Checks to make sure the `team.text` isn't empty, skips urls that point to "tbd", or handles index position errors from pages with empty tables.
2. Takes the team object the url with the `href` replacement
3. Uses requests' `get` and `text` methods to get the raw HTML
4. Turns the raw HTML into Soup
5. Finds the table
6. Finds the dates, opponents, results, team score, and opponent scores
7. Zips them all together into `team_results`
8. For each result in `team_results` appends it to `all_results`


In [123]:
len(teams_2017)
cleaned_teams[0]['href'].replace('2017.html' , '2017_schedule.html')

'/cbb/schools/villanova/2017_schedule.html'

In [124]:
all_teams = list()
for team in cleaned_teams:
    url = 'http://www.sports-reference.com' + team['href'].replace('2017.html','2017-schedule.html')
    raw = requests.get(url).text
    soup = BeautifulSoup(raw,'lxml')
    schedule_table = soup.find_all('table',{'id':'schedule','class':'sortable stats_table'})
    all_results = list()
    all_results1 = list()
    result = list()
    all_results3 = list()
    all_results4 = list()
    for opp in schedule_table[0].find_all('td',{'data-stat':'date_game'}):
        all_results.append(opp.text)
    for opp in schedule_table[0].find_all('td',{'data-stat':'opp_name'}):
        all_results1.append(opp.text)
    for opp in schedule_table[0].find_all('td',{'data-stat':'game_result'}):
        result.append(opp.text)   
    for opp in schedule_table[0].find_all('td',{'data-stat':'pts'}):
        all_results3.append(opp.text)     
    for opp in schedule_table[0].find_all('td',{'data-stat':'opp_pts'}):
        all_results4.append(opp.text)      
    
    home_team = [team.text] * len(result)
    team_result = list(zip(home_team, all_results, all_results1, result, all_results3, all_results4))
    all_teams.append(team_result)
all_teams    
        

[[('Villanova', 'Fri, Nov 11, 2016', 'Lafayette', 'W', '88', '48'),
  ('Villanova', 'Mon, Nov 14, 2016', 'Purdue\xa0(15)', 'W', '79', '76'),
  ('Villanova', 'Thu, Nov 17, 2016', 'Western Michigan', 'W', '76', '65'),
  ('Villanova', 'Fri, Nov 18, 2016', 'Wake Forest', 'W', '96', '77'),
  ('Villanova', 'Sun, Nov 20, 2016', 'Central Florida', 'W', '67', '57'),
  ('Villanova', 'Wed, Nov 23, 2016', 'College of Charleston', 'W', '63', '47'),
  ('Villanova', 'Tue, Nov 29, 2016', 'Pennsylvania', 'W', '82', '57'),
  ('Villanova', 'Sat, Dec 3, 2016', "Saint Joseph's", 'W', '88', '57'),
  ('Villanova', 'Tue, Dec 6, 2016', 'La Salle', 'W', '89', '79'),
  ('Villanova', 'Sat, Dec 10, 2016', 'Notre Dame\xa0(23)', 'W', '74', '66'),
  ('Villanova', 'Tue, Dec 13, 2016', 'Temple', 'W', '78', '57'),
  ('Villanova', 'Wed, Dec 21, 2016', 'American', 'W', '90', '48'),
  ('Villanova', 'Wed, Dec 28, 2016', 'DePaul', 'W', '68', '65'),
  ('Villanova', 'Sat, Dec 31, 2016', 'Creighton\xa0(10)', 'W', '80', '70'),
 

In [125]:
reduced_results = list()
for team in all_teams:
    for result in team:
        if 'W' in result:
            reduced_results.append(result)
            

        
len(all_teams), len(reduced_results)

(64, 1646)

In [126]:
tournament_results = list()
for result in reduced_results:
    if result[2] in tournament_teams:
        tournament_results.append(result)
tournament_results        

[('Villanova', 'Sat, Jan 7, 2017', 'Marquette', 'W', '93', '81'),
 ('Villanova', 'Mon, Jan 16, 2017', 'Seton Hall', 'W', '76', '46'),
 ('Villanova', 'Sat, Feb 18, 2017', 'Seton Hall', 'W', '92', '70'),
 ('Villanova', 'Fri, Mar 10, 2017', 'Seton Hall', 'W', '55', '53'),
 ('Villanova', 'Sat, Mar 11, 2017', 'Creighton', 'W', '74', '60'),
 ('Villanova', 'Thu, Mar 16, 2017', "Mount St. Mary's", 'W', '76', '56'),
 ('Wisconsin', 'Sat, Dec 10, 2016', 'Marquette', 'W', '93', '84'),
 ('Wisconsin', 'Tue, Jan 17, 2017', 'Michigan', 'W', '68', '64'),
 ('Wisconsin', 'Sat, Jan 21, 2017', 'Minnesota', 'W', '78', '76'),
 ('Wisconsin', 'Sun, Mar 5, 2017', 'Minnesota', 'W', '66', '49'),
 ('Wisconsin', 'Sat, Mar 11, 2017', 'Northwestern', 'W', '76', '48'),
 ('Wisconsin', 'Thu, Mar 16, 2017', 'Virginia Tech', 'W', '84', '74'),
 ('Virginia Tech', 'Wed, Nov 30, 2016', 'Michigan', 'W', '73', '70'),
 ('Virginia Tech', 'Mon, Feb 27, 2017', 'Miami (FL)', 'W', '66', '61'),
 ('Virginia', 'Wed, Feb 1, 2017', 'Virgi

# Making a directed graph



In [28]:
g = nx.DiGraph()

for (team, opponent, date, score, opp_score) in tournament_results:
    differential = int(score) - int(opp_score)
    if differential > 0:
        if g.has_edge(team,opponent):
            g[team][opponent]['weight'] += differential
        else:
            g.add_edge(team, opponent, weight = abs(differential))
    else:
        print('Negative point differential for: {0}-{1}\n'.format(team,opponent))
        if g.has_edge(team,opponent):
            g[team][opponent]['weight'] += differential
        else:
            g.add_edge(team, opponent, weight = abs(differential))
        
print("There are {0} nodes and {1} edges in the network".format(g.number_of_nodes(), g.number_of_edges()))

nx.write_gexf(g,'tournament_schedule.gexf')

Negative point differential for: Creighton-Marquette

Negative point differential for: Michigan-UCLA

Negative point differential for: Texas Southern-Louisville

Negative point differential for: Winthrop-Florida State

There are 58 nodes and 222 edges in the network


In [60]:
import networkx as nx

In [61]:
g = nx.Graph()

In [62]:
g.add_node('a')

In [63]:
len(g)

1

In [64]:
g.nodes()

['a']

In [79]:
g.add_nodes_from(['b','c','d'])

In [80]:
g.nodes()

['b', 'c', 'a', 'd']

In [81]:
len(g)

4

In [82]:
g.remove_edge('a','b')

In [83]:
g.edges()

[('c', 'a'), ('a', 'd')]

In [98]:
g.add_edges_from([('a','b'),('a','c'),('a','d')])

In [99]:
g.edges()

[('b', 'a'), ('c', 'a'), ('a', 'd')]

In [104]:
g.add_edge('d','a')
g.edges()

[('b', 'a'), ('c', 'a'), ('a', 'd')]

In [None]:
g.edges()
np.array(nx)

# Making a DiGraph

In [100]:
dg = nx.DiGraph()

In [101]:
dg.add_edges_from([('a','b'),('a','c'),('a','d')])

In [102]:
dg.nodes()

['b', 'c', 'a', 'd']

In [103]:
dg.edges()

[('a', 'b'), ('a', 'c'), ('a', 'd')]

In [109]:
dg.add_edge('c','a')

In [110]:
dg.edges()

[('c', 'a'), ('a', 'b'), ('a', 'c'), ('a', 'd')]