In [44]:
# imports:
from bs4 import BeautifulSoup
from urllib.request import urlopen, Request
import requests

# step 1 lets scrape the guided project website with all the posts:
url = "https://community.dataquest.io/c/share/guided-project/55"
html = urlopen(url)
soup = BeautifulSoup(html, 'html.parser')

# look for every 'a' tag with 'class' title raw-link raw-topic-link:
list_all = soup.find_all("a", class_="title raw-link raw-topic-link")

# check how many elements we've extracted:
len(list_all)

30

In [35]:
import codecs
import pandas as pd
# this is the file of the website, after scrolling all the way down:
file = codecs.open("dataquestProjects.html", "r", "utf-8")
# parse the file:
parser = BeautifulSoup(file, 'html.parser')

# look for every 'tr' tag, scrape its contents and create a pandas series from the list:
list_all = parser.find_all('tr')
series_4_df = pd.Series(list_all)

# create a dataframe from pandas series:
df = pd.DataFrame(series_4_df, columns=['content'])
df['content'] = df['content'].astype(str)
df.head()

Unnamed: 0,content
0,"<tr>\n<th>Topic</th>\n<th></th>\n<th class=""re..."
1,"<tr class=""topic-list-item"">\n<td class=""main-..."
2,"<tr class=""topic-list-item"">\n<td class=""main-..."
3,"<tr class=""topic-list-item"">\n<td class=""main-..."
4,"<tr class=""topic-list-item"">\n<td class=""main-..."


In [36]:
print(df['content'][32])
N = 32
df = df.iloc[N: , :]

<tr class="topic-list-item category-share-guided-project pinned ember-view" data-topic-id="548300" id="ember65">
<td class="main-link clearfix topic-list-data" colspan="1">
<span class="link-top-line"><div class="topic-statuses">
<span class="topic-status" title="This topic is pinned for you; it will display at the top of its category"><svg class="fa d-icon d-icon-thumbtack svg-icon pinned svg-string" xmlns="http://www.w3.org/2000/svg"><use href="#thumbtack"></use></svg></span></div>
<a aria-level="2" class="title raw-link raw-topic-link" data-topic-id="548300" href="https://community.dataquest.io/t/about-the-projects-category/548300" role="heading"><span dir="ltr">About the Projects category</span></a><span class="topic-post-badges"></span>
</span>
<div class="link-bottom-line">
</div>
</td>
<td class="posters topic-list-data">
<a class="latest single" data-user-card="nityesh" href="https://community.dataquest.io/u/nityesh"><img alt="" aria-label="nityesh - Original Poster, Most Recen

In [37]:
# remove 1st row:
df = df.iloc[1:,:]
# extract title, link and number of replies:
df['title'] = df['content'].str.extract('<span dir="ltr">(.*?)</span>')
df['link'] = df['content'].str.extract('href=\"(.*?)\" role')
df['replies'] = df['content'].str.extract("This topic has (.*) re").astype(int)
df['views'] = df['content'].str.extract("this topic has been viewed (.*?) times")
df['views'] = df['views'].str.replace(',','').astype(int)

# remove 1 generic post and posts with 0 replies:
df = df[df['replies']>0]
df = df[df['replies']<100]
df.head()


Unnamed: 0,content,title,link,replies,views
35,"<tr class=""topic-list-item category-share-guid...",Sharing: “Analyzing CIA Factbook Data Using SQL”,https://community.dataquest.io/t/sharing-analy...,1,9
36,"<tr class=""topic-list-item category-share-guid...",Sharing my Hacker News guided project,https://community.dataquest.io/t/sharing-my-ha...,2,55
37,"<tr class=""topic-list-item category-share-guid...",Guided Project 4: Exploring eBay Car Sales Data,https://community.dataquest.io/t/guided-projec...,6,308
38,"<tr class=""topic-list-item category-share-guid...",Guided Project: Heavy Traffic Indicators I-94,https://community.dataquest.io/t/guided-projec...,1,48
39,"<tr class=""topic-list-item category-share-guid...",Guided Project: Prison Break by oladejoalexander3,https://community.dataquest.io/t/guided-projec...,1,20


In [38]:
df['link'][36]

'https://community.dataquest.io/t/sharing-my-hacker-news-guided-project/561256'

In [45]:
# create a function for scraping the actual posts website:
def get_reply(one_link):
    response = requests.get(one_link)
    content = response.content
    parser = BeautifulSoup(content, 'html.parser')
    tag_numbers = parser.find_all("div", class_="post")
    # we're only going to scrape the content of the first reply (that's usually the feedback)
    feedback = tag_numbers[1].text
    return feedback

In [46]:
df.head(5)

Unnamed: 0,content,title,link,replies,views
35,"<tr class=""topic-list-item category-share-guid...",Sharing: “Analyzing CIA Factbook Data Using SQL”,https://community.dataquest.io/t/sharing-analy...,1,9
36,"<tr class=""topic-list-item category-share-guid...",Sharing my Hacker News guided project,https://community.dataquest.io/t/sharing-my-ha...,2,55
37,"<tr class=""topic-list-item category-share-guid...",Guided Project 4: Exploring eBay Car Sales Data,https://community.dataquest.io/t/guided-projec...,6,308
38,"<tr class=""topic-list-item category-share-guid...",Guided Project: Heavy Traffic Indicators I-94,https://community.dataquest.io/t/guided-projec...,1,48
39,"<tr class=""topic-list-item category-share-guid...",Guided Project: Prison Break by oladejoalexander3,https://community.dataquest.io/t/guided-projec...,1,20


In [47]:
# create a test dataframe to test scraping on 5 rows:
df_test = df[:5]

In [48]:
df_test.head(3)

Unnamed: 0,content,title,link,replies,views
35,"<tr class=""topic-list-item category-share-guid...",Sharing: “Analyzing CIA Factbook Data Using SQL”,https://community.dataquest.io/t/sharing-analy...,1,9
36,"<tr class=""topic-list-item category-share-guid...",Sharing my Hacker News guided project,https://community.dataquest.io/t/sharing-my-ha...,2,55
37,"<tr class=""topic-list-item category-share-guid...",Guided Project 4: Exploring eBay Car Sales Data,https://community.dataquest.io/t/guided-projec...,6,308


In [53]:


# we'll use a loop on all the elements of pd.Series (faster than using 'apply')
feedback_list = []
for el in df_test['link']:
    feedback_list.append(get_reply(el))
    # print(feedback_list)
df_test['feedback'] = feedback_list
df_test

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test['feedback'] = feedback_list


Unnamed: 0,content,title,link,replies,views,feedback
35,"<tr class=""topic-list-item category-share-guid...",Sharing: “Analyzing CIA Factbook Data Using SQL”,https://community.dataquest.io/t/sharing-analy...,1,9,\nwow that’s awesome\nthis way is much biter t...
36,"<tr class=""topic-list-item category-share-guid...",Sharing my Hacker News guided project,https://community.dataquest.io/t/sharing-my-ha...,2,55,\nHey Alvaro!\nWelcome to the community and co...
37,"<tr class=""topic-list-item category-share-guid...",Guided Project 4: Exploring eBay Car Sales Data,https://community.dataquest.io/t/guided-projec...,6,308,\nHi @israelogunmola\nWhat another well detail...
38,"<tr class=""topic-list-item category-share-guid...",Guided Project: Heavy Traffic Indicators I-94,https://community.dataquest.io/t/guided-projec...,1,48,"\nHey @rakhshanda_kaleem, congratulations for ..."
39,"<tr class=""topic-list-item category-share-guid...",Guided Project: Prison Break by oladejoalexander3,https://community.dataquest.io/t/guided-projec...,1,20,\nHi @oladejoalexander3\nWelcome to the commun...


In [54]:
def scrape_replies(df):
    feedback_list = []
    for el in df['link']:
        feedback_list.append(get_reply(el))
    df['feedback'] = feedback_list
    return df

df = scrape_replies(df)

In [57]:
df.head(4)

Unnamed: 0,content,title,link,replies,views,feedback
35,"<tr class=""topic-list-item category-share-guid...",Sharing: “Analyzing CIA Factbook Data Using SQL”,https://community.dataquest.io/t/sharing-analy...,1,9,\nwow that’s awesome\nthis way is much biter t...
36,"<tr class=""topic-list-item category-share-guid...",Sharing my Hacker News guided project,https://community.dataquest.io/t/sharing-my-ha...,2,55,\nHey Alvaro!\nWelcome to the community and co...
37,"<tr class=""topic-list-item category-share-guid...",Guided Project 4: Exploring eBay Car Sales Data,https://community.dataquest.io/t/guided-projec...,6,308,\nHi @israelogunmola\nWhat another well detail...
38,"<tr class=""topic-list-item category-share-guid...",Guided Project: Heavy Traffic Indicators I-94,https://community.dataquest.io/t/guided-projec...,1,48,"\nHey @rakhshanda_kaleem, congratulations for ..."


In [59]:
df.to_csv('dataquest.csv')