In [1]:
from urllib import urlopen
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import re
import calendar
from string import ascii_lowercase
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

__all__ = ['tqdm', 'trange']

import sys
import time


def format_interval(t):
    mins, s = divmod(int(t), 60)
    h, m = divmod(mins, 60)
    if h:
        return '%d:%02d:%02d' % (h, m, s)
    else:
        return '%02d:%02d' % (m, s)


def format_meter(n, total, elapsed):
    # n - number of finished iterations
    # total - total number of iterations, or None
    # elapsed - number of seconds passed since start
    if n > total:
        total = None
    
    elapsed_str = format_interval(elapsed)
    rate = '%5.2f' % (n / elapsed) if elapsed else '?'
    
    if total:
        frac = float(n) / total
        
        N_BARS = 10
        bar_length = int(frac*N_BARS)
        bar = '#'*bar_length + '-'*(N_BARS-bar_length)
        
        percentage = '%3d%%' % (frac * 100)
        
        left_str = format_interval(elapsed / n * (total-n)) if n else '?'
        
        return '|%s| %d/%d %s [elapsed: %s left: %s, %s iters/sec]' % (
            bar, n, total, percentage, elapsed_str, left_str, rate)
    
    else:
        return '%d [elapsed: %s, %s iters/sec]' % (n, elapsed_str, rate)


class StatusPrinter(object):
    def __init__(self, file):
        self.file = file
        self.last_printed_len = 0
    
    def print_status(self, s):
        self.file.write('\r'+s+' '*max(self.last_printed_len-len(s), 0))
        self.file.flush()
        self.last_printed_len = len(s)


def tqdm(iterable, desc='', total=None, leave=False, file=sys.stderr,
         mininterval=0.5, miniters=1):
    """
    Get an iterable object, and return an iterator which acts exactly like the
    iterable, but prints a progress meter and updates it every time a value is
    requested.
    'desc' can contain a short string, describing the progress, that is added
    in the beginning of the line.
    'total' can give the number of expected iterations. If not given,
    len(iterable) is used if it is defined.
    'file' can be a file-like object to output the progress message to.
    If leave is False, tqdm deletes its traces from screen after it has
    finished iterating over all elements.
    If less than mininterval seconds or miniters iterations have passed since
    the last progress meter update, it is not updated again.
    """
    if total is None:
        try:
            total = len(iterable)
        except TypeError:
            total = None
    
    prefix = desc+': ' if desc else ''
    
    sp = StatusPrinter(file)
    sp.print_status(prefix + format_meter(0, total, 0))
    
    start_t = last_print_t = time.time()
    last_print_n = 0
    n = 0
    for obj in iterable:
        yield obj
        # Now the object was created and processed, so we can print the meter.
        n += 1
        if n - last_print_n >= miniters:
            # We check the counter first, to reduce the overhead of time.time()
            cur_t = time.time()
            if cur_t - last_print_t >= mininterval:
                sp.print_status(prefix + format_meter(n, total, cur_t-start_t))
                last_print_n = n
                last_print_t = cur_t
    
    if not leave:
        sp.print_status('')
        sys.stdout.write('\r')
    else:
        if last_print_n < n:
            cur_t = time.time()
            sp.print_status(prefix + format_meter(n, total, cur_t-start_t))
        file.write('\n')


def trange(*args, **kwargs):
    """A shortcut for writing tqdm(range()) on py3 or tqdm(xrange()) on py2"""
    try:
        f = xrange
    except NameError:
        f = range
    
    return tqdm(f(*args), **kwargs)

In [2]:
pages = []
def get_pages():
    for i in range(1996, 2017):
        pages.append("http://basketball.realgm.com/nba/draft/past_drafts/{}".format(i))
    return pages
get_pages()            

['http://basketball.realgm.com/nba/draft/past_drafts/1996',
 'http://basketball.realgm.com/nba/draft/past_drafts/1997',
 'http://basketball.realgm.com/nba/draft/past_drafts/1998',
 'http://basketball.realgm.com/nba/draft/past_drafts/1999',
 'http://basketball.realgm.com/nba/draft/past_drafts/2000',
 'http://basketball.realgm.com/nba/draft/past_drafts/2001',
 'http://basketball.realgm.com/nba/draft/past_drafts/2002',
 'http://basketball.realgm.com/nba/draft/past_drafts/2003',
 'http://basketball.realgm.com/nba/draft/past_drafts/2004',
 'http://basketball.realgm.com/nba/draft/past_drafts/2005',
 'http://basketball.realgm.com/nba/draft/past_drafts/2006',
 'http://basketball.realgm.com/nba/draft/past_drafts/2007',
 'http://basketball.realgm.com/nba/draft/past_drafts/2008',
 'http://basketball.realgm.com/nba/draft/past_drafts/2009',
 'http://basketball.realgm.com/nba/draft/past_drafts/2010',
 'http://basketball.realgm.com/nba/draft/past_drafts/2011',
 'http://basketball.realgm.com/nba/draft

In [4]:
urls = []
for element in tqdm(pages[0]):
    html = urlopen(element)  # get the html
    soup = BeautifulSoup(html, "lxml")
    table = soup.find_all('table')
    for stuff in table:
        rows = stuff.find_all('tr')[1:]
        for text in rows:
            links = text.find('a')
            print links

|----------| 0/55   0% [elapsed: 00:00 left: ?, ? iters/sec]

IOError: [Errno 2] The system cannot find the file specified: 'h'

In [3]:
urls = []
for element in tqdm(pages):
    html = urlopen(element)  # get the html
    soup = BeautifulSoup(html, "lxml")
    table = soup.find_all('table')
    for stuff in table:
        rows = stuff.find_all('tr')[1:]
        for text in rows:
            links = text.find('a').get('href')
            urls.append("http://basketball.realgm.com{}".format(links))

|#####-----| 11/21  52% [elapsed: 00:14 left: 00:12,  0.78 iters/sec]

KeyboardInterrupt: 

In [32]:
months = {"Jan": "January", 
          "Feb": "February", 
          "Mar": "March", 
          "Apr": "April", 
          "May": "May", 
          "Jun": "June", 
          "Jul": "July", 
          "Aug": "August",
          "Sep": "September",
          "Oct": "October",
          "Nov": "November",
          "Dec": "December"}      

In [33]:
test_url = "http://basketball.realgm.com/player/AJ-Price/Summary/1656"
html = urlopen(test_url) # get the html
soup = BeautifulSoup(html, "lxml")
searchtext = re.compile(r'NCAA Season Stats - Totals',re.IGNORECASE)
foundtext = soup.find('h2',text=searchtext)
table = foundtext.findNext('table')
rows = table.find_all('tr')[0]
column_headers = ([th.getText() for th in rows.find_all('th')])

In [34]:
data = []
url_data = []
name_data = []
pos_data = []
ht_data = []
wt_data = []
birthday_data = []

for url in tqdm(urls):
    html = urlopen(url)  # get the html
    soup = BeautifulSoup(html, "lxml")
    span = soup.find('span', class_="feature")
    if span:
        pos = span.get_text()
    else:
        pass
    h2 = soup.find('h2')
    if h2:
        nameposnum = h2.get_text()
        sep = '#'
        if sep in nameposnum:
            pre = nameposnum.split(sep, 1)[0]
            namepos = pre.strip()
            def rreplace(s, old, new, occurrence):
                li = s.rsplit(old, occurrence)
                return new.join(li)
            pre = rreplace(namepos, pos, '', 1)
            name = pre.strip()
        else:
            def rreplace(s, old, new, occurrence):
                li = s.rsplit(old, occurrence)
                return new.join(li)
            pre = rreplace(nameposnum, pos, '', 1)
            name = pre.strip()
            
    else:
        pass

    div = soup.find('div', class_='half-column-left')
    if div:
        searchtext = re.compile(r'Height:')
        foundtext = div.find('strong', text=searchtext)
        if foundtext:
            p = foundtext.parent
            heightweight = p.get_text()
            sep2 = '('
            post2 = heightweight.split(sep2, 1)[1]
            sep3 = 'cm)'
            ht = post2.split(sep3, 1)[0]
            weight = heightweight[-6:]
            sep4 = 'kg)'
            wt = weight.split(sep4, 1)[0]
            searchtext1 = re.compile(r'Born:')
            foundtext1 = div.find('strong', text=searchtext1)
            if foundtext1:
                p1 = foundtext1.parent
                born= p1.get_text()
                sep5 = ':'
                post5 = born.split(sep5, 1)[1]
                born1 = post5.strip()
                sep6 = ' '
                month = born1.split(sep6, 1)[0]
                month1 = months[month]
                monthdict = dict((v,k) for k,v in enumerate(calendar.month_name))
                value = monthdict[month1]
                month_dec = (float(value) - 1)/12
                sep7 = ' '
                post7 = born1.split(sep7, 1)[1]
                sep8 = ','
                day = post7.split(sep8, 1)[0]
                day_dec = (float(day) - 1)/365
                yearplus = post7.split(sep8, 1)[1]
                sep9 = ' ('
                year_raw = yearplus.split(sep9, 1)[0]
                year = year_raw.strip()
                year_dec = float(year) + month_dec + day_dec
                searchtext2 = re.compile(r'NCAA Season Stats - Totals')
                foundtext2 = soup.find('h2', text=searchtext2)
                if foundtext2:
                    table = foundtext2.findNext('table')
                    rows = table.find_all('tr')[1:]
                    data += ([[td.getText() for td in rows[i].find_all('td')] for i in range(len(rows))])
                    for row in rows:
                        url_data.append(url)
                        name_data.append(name)
                        pos_data.append(pos)
                        ht_data.append(ht)
                        wt_data.append(wt)
                        birthday_data.append(year_dec)
                else:
                    pass
#                 searchtext3 = re.compile(r'International Regular Season Stats - Totals')
#                 foundtext3 = soup.find('h2', text=searchtext3)
#                 if foundtext3:
#                     table = foundtext3.findNext('table')
#                     rows = table.find_all('tr')[1:]
#                     data += ([[td.getText() for td in rows[i].find_all('td')] for i in range(len(rows))])
#                     for row in rows:
#                         url_data.append(url)
#                         name_data.append(name)
#                         pos_data.append(pos)
#                         ht_data.append(ht)
#                         wt_data.append(wt)
#                         birthday_data.append(year_dec)
#                 else:
#                     pass
#                 searchtext4 = re.compile(r'FIBA Junior Team Events Stats')
#                 foundtext4 = soup.find('h2', text=searchtext4)
#                 if foundtext4:
#                     table = foundtext4.findNext('table')
#                     rows = table.find_all('tr')[1:]
#                     data += ([[td.getText() for td in rows[i].find_all('td')] for i in range(len(rows))])
#                     for row in rows:
#                         url_data.append(url)
#                         name_data.append(name)
#                         pos_data.append(pos)
#                         ht_data.append(ht)
#                         wt_data.append(wt)
#                         birthday_data.append(year_dec)
#                 else:
#                     pass
#                 searchtext5 = re.compile(r'FIBA Senior Team Events Stats')
#                 foundtext5 = soup.find('h2', text=searchtext5)
#                 if foundtext5:
#                     table = foundtext5.findNext('table')
#                     rows = table.find_all('tr')[1:]
#                     data += ([[td.getText() for td in rows[i].find_all('td')] for i in range(len(rows))])
#                     for row in rows:
#                         url_data.append(url)
#                         name_data.append(name)
#                         pos_data.append(pos)
#                         ht_data.append(ht)
#                         wt_data.append(wt)
#                         birthday_data.append(year_dec)
#                 else:
#                     pass
            else:
                pass
        else:
            pass
    else:
        pass
  
df = pd.DataFrame(data, columns=column_headers)
df['Name'] = name_data
df['Url'] = url_data
df['Position'] = pos_data
df['Height'] = ht_data
df['Weight'] = wt_data
df['Born'] = birthday_data
df['League'] = 'NCAA'
df1 = df[df['Season'] != 'AVERAGES']
df2 = df1[df1['Season'] != 'TOTAL']
df3 = df2[df2['Season'] != 'CAREER']
# df2['Season'] = df2['Year']
df3['Weight'] = df3['Weight'].str.replace('[^\w\s]','')
df3['Season'] = df3['Season'].str.replace('[^\w\s]','')
df3['Season'] = df3['Season'].map(lambda x: str(x)[:4])
df3['Season'] = pd.DataFrame(df3['Season'], dtype='int')
df3['Season'] += 1
df4 = df3.loc[df3['MIN'] != '-']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [35]:
test_url = "http://basketball.realgm.com/player/Dragan-Bender/Summary/41582"
html = urlopen(test_url) # get the html
soup = BeautifulSoup(html, "lxml")
searchtext = re.compile(r'International Regular Season Stats - Totals',re.IGNORECASE)
foundtext = soup.find('h2',text=searchtext)
table = foundtext.findNext('table')
rows = table.find_all('tr')[0]
column_headers = ([th.getText() for th in rows.find_all('th')])

In [36]:
data = []
url_data = []
name_data = []
pos_data = []
ht_data = []
wt_data = []
birthday_data = []

for url in tqdm(urls):
    html = urlopen(url)  # get the html
    soup = BeautifulSoup(html, "lxml")
    span = soup.find('span', class_="feature")
    if span:
        pos = span.get_text()
    else:
        pass
    h2 = soup.find('h2')
    if h2:
        nameposnum = h2.get_text()
        sep = '#'
        if sep in nameposnum:
            pre = nameposnum.split(sep, 1)[0]
            namepos = pre.strip()
            def rreplace(s, old, new, occurrence):
                li = s.rsplit(old, occurrence)
                return new.join(li)
            pre = rreplace(namepos, pos, '', 1)
            name = pre.strip()
        else:
            def rreplace(s, old, new, occurrence):
                li = s.rsplit(old, occurrence)
                return new.join(li)
            pre = rreplace(nameposnum, pos, '', 1)
            name = pre.strip()
            
    else:
        pass

    div = soup.find('div', class_='half-column-left')
    if div:
        searchtext = re.compile(r'Height:')
        foundtext = div.find('strong', text=searchtext)
        if foundtext:
            p = foundtext.parent
            heightweight = p.get_text()
            sep2 = '('
            post2 = heightweight.split(sep2, 1)[1]
            sep3 = 'cm)'
            ht = post2.split(sep3, 1)[0]
            weight = heightweight[-6:]
            sep4 = 'kg)'
            wt = weight.split(sep4, 1)[0]
            searchtext1 = re.compile(r'Born:')
            foundtext1 = div.find('strong', text=searchtext1)
            if foundtext1:
                p1 = foundtext1.parent
                born= p1.get_text()
                sep5 = ':'
                post5 = born.split(sep5, 1)[1]
                born1 = post5.strip()
                sep6 = ' '
                month = born1.split(sep6, 1)[0]
                month1 = months[month]
                monthdict = dict((v,k) for k,v in enumerate(calendar.month_name))
                value = monthdict[month1]
                month_dec = (float(value) - 1)/12
                sep7 = ' '
                post7 = born1.split(sep7, 1)[1]
                sep8 = ','
                day = post7.split(sep8, 1)[0]
                day_dec = (float(day) - 1)/365
                yearplus = post7.split(sep8, 1)[1]
                sep9 = ' ('
                year_raw = yearplus.split(sep9, 1)[0]
                year = year_raw.strip()
                year_dec = float(year) + month_dec + day_dec
#                 searchtext2 = re.compile(r'NCAA Season Stats - Totals')
#                 foundtext2 = soup.find('h2', text=searchtext2)
#                 if foundtext2:
#                     table = foundtext2.findNext('table')
#                     rows = table.find_all('tr')[1:]
#                     data += ([[td.getText() for td in rows[i].find_all('td')] for i in range(len(rows))])
#                     for row in rows:
#                         url_data.append(url)
#                         name_data.append(name)
#                         pos_data.append(pos)
#                         ht_data.append(ht)
#                         wt_data.append(wt)
#                         birthday_data.append(year_dec)
#                 else:
#                     pass
                searchtext3 = re.compile(r'International Regular Season Stats - Totals')
                foundtext3 = soup.find('h2', text=searchtext3)
                if foundtext3:
                    table = foundtext3.findNext('table')
                    rows = table.find_all('tr')[1:]
                    data += ([[td.getText() for td in rows[i].find_all('td')] for i in range(len(rows))])
                    for row in rows:
                        url_data.append(url)
                        name_data.append(name)
                        pos_data.append(pos)
                        ht_data.append(ht)
                        wt_data.append(wt)
                        birthday_data.append(year_dec)
                else:
                    pass
#                 searchtext4 = re.compile(r'FIBA Junior Team Events Stats')
#                 foundtext4 = soup.find('h2', text=searchtext4)
#                 if foundtext4:
#                     table = foundtext4.findNext('table')
#                     rows = table.find_all('tr')[1:]
#                     data += ([[td.getText() for td in rows[i].find_all('td')] for i in range(len(rows))])
#                     for row in rows:
#                         url_data.append(url)
#                         name_data.append(name)
#                         pos_data.append(pos)
#                         ht_data.append(ht)
#                         wt_data.append(wt)
#                         birthday_data.append(year_dec)
#                 else:
#                     pass
#                 searchtext5 = re.compile(r'FIBA Senior Team Events Stats')
#                 foundtext5 = soup.find('h2', text=searchtext5)
#                 if foundtext5:
#                     table = foundtext5.findNext('table')
#                     rows = table.find_all('tr')[1:]
#                     data += ([[td.getText() for td in rows[i].find_all('td')] for i in range(len(rows))])
#                     for row in rows:
#                         url_data.append(url)
#                         name_data.append(name)
#                         pos_data.append(pos)
#                         ht_data.append(ht)
#                         wt_data.append(wt)
#                         birthday_data.append(year_dec)
#                 else:
#                     pass
            else:
                pass
        else:
            pass
    else:
        pass
  
df = pd.DataFrame(data, columns=column_headers)
df['Name'] = name_data
df['Url'] = url_data
df['Position'] = pos_data
df['Height'] = ht_data
df['Weight'] = wt_data
df['Born'] = birthday_data
df1 = df[df['Team'] != 'All Teams']
df2 = df1[df1['League'] != 'All Leagues']
df2['Weight'] = df2['Weight'].str.replace('[^\w\s]','')
df2['Season'] = df2['Season'].str.replace('[^\w\s]','')
df2['Season'] = df2['Season'].map(lambda x: str(x)[:4])
df2['Season'] = pd.DataFrame(df2['Season'], dtype='int')
df2['Season'] += 1
df5 = df2

                                                                         



In [37]:
test_url = "http://basketball.realgm.com/player/Zoran-Dragic/Summary/24599"
html = urlopen(test_url) # get the html
soup = BeautifulSoup(html, "lxml")
searchtext = re.compile(r'FIBA Senior Team Events Stats',re.IGNORECASE)
foundtext = soup.find('h2',text=searchtext)
table = foundtext.findNext('table')
rows = table.find_all('tr')[0]
column_headers = ([th.getText() for th in rows.find_all('th')])

In [38]:
data = []
url_data = []
name_data = []
pos_data = []
ht_data = []
wt_data = []
birthday_data = []

for url in tqdm(urls):
    html = urlopen(url)  # get the html
    soup = BeautifulSoup(html, "lxml")
    span = soup.find('span', class_="feature")
    if span:
        pos = span.get_text()
    else:
        pass
    h2 = soup.find('h2')
    if h2:
        nameposnum = h2.get_text()
        sep = '#'
        if sep in nameposnum:
            pre = nameposnum.split(sep, 1)[0]
            namepos = pre.strip()
            def rreplace(s, old, new, occurrence):
                li = s.rsplit(old, occurrence)
                return new.join(li)
            pre = rreplace(namepos, pos, '', 1)
            name = pre.strip()
        else:
            def rreplace(s, old, new, occurrence):
                li = s.rsplit(old, occurrence)
                return new.join(li)
            pre = rreplace(nameposnum, pos, '', 1)
            name = pre.strip()
            
    else:
        pass

    div = soup.find('div', class_='half-column-left')
    if div:
        searchtext = re.compile(r'Height:')
        foundtext = div.find('strong', text=searchtext)
        if foundtext:
            p = foundtext.parent
            heightweight = p.get_text()
            sep2 = '('
            post2 = heightweight.split(sep2, 1)[1]
            sep3 = 'cm)'
            ht = post2.split(sep3, 1)[0]
            weight = heightweight[-6:]
            sep4 = 'kg)'
            wt = weight.split(sep4, 1)[0]
            searchtext1 = re.compile(r'Born:')
            foundtext1 = div.find('strong', text=searchtext1)
            if foundtext1:
                p1 = foundtext1.parent
                born= p1.get_text()
                sep5 = ':'
                post5 = born.split(sep5, 1)[1]
                born1 = post5.strip()
                sep6 = ' '
                month = born1.split(sep6, 1)[0]
                month1 = months[month]
                monthdict = dict((v,k) for k,v in enumerate(calendar.month_name))
                value = monthdict[month1]
                month_dec = (float(value) - 1)/12
                sep7 = ' '
                post7 = born1.split(sep7, 1)[1]
                sep8 = ','
                day = post7.split(sep8, 1)[0]
                day_dec = (float(day) - 1)/365
                yearplus = post7.split(sep8, 1)[1]
                sep9 = ' ('
                year_raw = yearplus.split(sep9, 1)[0]
                year = year_raw.strip()
                year_dec = float(year) + month_dec + day_dec
#                 searchtext2 = re.compile(r'NCAA Season Stats - Totals')
#                 foundtext2 = soup.find('h2', text=searchtext2)
#                 if foundtext2:
#                     table = foundtext2.findNext('table')
#                     rows = table.find_all('tr')[1:]
#                     data += ([[td.getText() for td in rows[i].find_all('td')] for i in range(len(rows))])
#                     for row in rows:
#                         url_data.append(url)
#                         name_data.append(name)
#                         pos_data.append(pos)
#                         ht_data.append(ht)
#                         wt_data.append(wt)
#                         birthday_data.append(year_dec)
#                 else:
#                     pass
#                 searchtext3 = re.compile(r'International Regular Season Stats - Totals')
#                 foundtext3 = soup.find('h2', text=searchtext3)
#                 if foundtext3:
#                     table = foundtext3.findNext('table')
#                     rows = table.find_all('tr')[1:]
#                     data += ([[td.getText() for td in rows[i].find_all('td')] for i in range(len(rows))])
#                     for row in rows:
#                         url_data.append(url)
#                         name_data.append(name)
#                         pos_data.append(pos)
#                         ht_data.append(ht)
#                         wt_data.append(wt)
#                         birthday_data.append(year_dec)
#                 else:
#                     pass
                searchtext4 = re.compile(r'FIBA Junior Team Events Stats')
                foundtext4 = soup.find('h2', text=searchtext4)
                if foundtext4:
                    table = foundtext4.findNext('table')
                    rows = table.find_all('tr')[1:]
                    data += ([[td.getText() for td in rows[i].find_all('td')] for i in range(len(rows))])
                    for row in rows:
                        url_data.append(url)
                        name_data.append(name)
                        pos_data.append(pos)
                        ht_data.append(ht)
                        wt_data.append(wt)
                        birthday_data.append(year_dec)
                else:
                    pass
                searchtext5 = re.compile(r'FIBA Senior Team Events Stats')
                foundtext5 = soup.find('h2', text=searchtext5)
                if foundtext5:
                    table = foundtext5.findNext('table')
                    rows = table.find_all('tr')[1:]
                    data += ([[td.getText() for td in rows[i].find_all('td')] for i in range(len(rows))])
                    for row in rows:
                        url_data.append(url)
                        name_data.append(name)
                        pos_data.append(pos)
                        ht_data.append(ht)
                        wt_data.append(wt)
                        birthday_data.append(year_dec)
                else:
                    pass
                searchtext6 = re.compile(r'Non-FIBA Events')
                foundtext6 = soup.find('h2', text=searchtext6)
                if foundtext6:
                    table = foundtext6.findNext('table')
                    rows = table.find_all('tr')[1:]
                    data += ([[td.getText() for td in rows[i].find_all('td')] for i in range(len(rows))])
                    for row in rows:
                        url_data.append(url)
                        name_data.append(name)
                        pos_data.append(pos)
                        ht_data.append(ht)
                        wt_data.append(wt)
                        birthday_data.append(year_dec)
                else:
                    pass
            else:
                pass
        else:
            pass
    else:
        pass

df = pd.DataFrame(data, columns=column_headers)
df['Name'] = name_data
df['Url'] = url_data
df['Position'] = pos_data
df['Height'] = ht_data
df['Weight'] = wt_data
df['Born'] = birthday_data
df['League'] = df['Event']
df1 = df[df['Year'] != 'AVERAGES']
df2 = df1[df1['Year'] != 'TOTAL']
df2['Season'] = df2['Year']
df2['Weight'] = df2['Weight'].str.replace('[^\w\s]','')
df2['Season'] = df2['Season'].str.replace('[^\w\s]','')
df2['Season'] = df2['Season'].map(lambda x: str(x)[:4])
df2['Season'] = pd.DataFrame(df2['Season'], dtype='int')
df2['Season'] += 1
df6 = df2

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [39]:
df7 = pd.concat([df4, df5, df6])
df7.to_csv('RealGMDraft.csv')

In [40]:
df7 = pd.DataFrame.from_csv('RealGMDraft.csv')
len(df7)

11648

In [41]:
df7.head()
df_NCAA = df7.loc[df7['League'] == 'NCAA']
print len(df_NCAA)

3551


In [42]:
df_NCAA.to_csv('df_NCAA.csv')