# Data Crawling

In [6]:
import pandas as pd
import matplotlib.pyplot as plt

## Let's learn some basic *HTML*!!

...

## Web Crawling With `BeautifulSoup4`

In [7]:
import requests
html = requests.get('https://stackoverflow.com/questions?tab=newest&pagesize=50&page=1')
html.text[:3000]        # A gigantic messy string. Extremely difficult to interact with.



In [8]:
from bs4 import BeautifulSoup
soup = BeautifulSoup(html.text)

## Finding Elements

In [9]:
all_links = soup.find_all('a')
print(all_links[5].text)


Stack Overflow
Public questions & answers



In [10]:
all_question_boxes = soup.find_all('div', attrs={'class': 's-post-summary'})
print(f'Number of question boxes in page: {len(all_question_boxes)}')
print('-------------------------------------------------------------')
print(all_question_boxes[1])

Number of question boxes in page: 50
-------------------------------------------------------------
<div class="s-post-summary js-post-summary" data-post-id="73362667" data-post-type-id="1" id="question-summary-73362667">
<div class="s-post-summary--stats js-post-summary-stats">
<div class="s-post-summary--stats-item s-post-summary--stats-item__emphasized" title="Score of 0">
<span class="s-post-summary--stats-item-number">0</span>
<span class="s-post-summary--stats-item-unit">votes</span>
</div>
<div class="s-post-summary--stats-item" title="0 answers">
<span class="s-post-summary--stats-item-number">0</span>
<span class="s-post-summary--stats-item-unit">answers</span>
</div>
<div class="s-post-summary--stats-item" title="3 views">
<span class="s-post-summary--stats-item-number">3</span>
<span class="s-post-summary--stats-item-unit">views</span>
</div>
</div>
<div class="s-post-summary--content">
<h3 class="s-post-summary--content-title">
<a class="s-link" href="/questions/73362667/e11

**Extracting Question Titles**

In [11]:
all_question_title_elements = soup.select('.s-post-summary--content-title > a')     # Notice how we accessed specific tags using CSS selectors
all_question_titles = [element.text for element in all_question_title_elements]
all_question_titles[:10]

['Slice multidimensional array',
 'E11000 duplicate key error collection when create',
 "Print all possible combination of words of length 10 from a list letters with repeating 'A' exactly twice",
 'I want to use the datetimepicker, but how do I make it Korean time when I input the time?',
 'How to get the count of repeated values in PowerShell from an Excel Sheet',
 'Replace the value located between two commas in Regex python',
 'Input Domain field on NTLM authentication with Azure/go-ntlmssp package',
 'How can I bind the curl process to submit when the button is clicked?',
 "Spring Boot app doesn't read configuration from Consul if started before Consul",
 'Asking for an input for no reason']

**Extracting Question Excerpts**

In [12]:
all_question_excerpt_elements = soup.find_all('div', 's-post-summary--content-excerpt')
all_question_excerpts = [element.text for element in all_question_excerpt_elements]
all_question_excerpts[:5]

['\r\n                I have an array (images_lst) having shape (250,500,500), it is basically a list of 250 images having dimensions 500X500. How do I select only the first dimension of the array to use it in a loop given ...\r\n            ',
 "\r\n                I'm developing a chat system with typescript and mongodb (using mongoose)\nWhen creating new message I get the following error:\n\nE11000 duplicate key error collection: test.messages index: user....\r\n            ",
 "\r\n                I have a list of 5 letters ['A', 'B', 'N', 'M','E'].\nI want to print all the words of length 10 letters that have exactly two letters A.\nOrder is important.\nI have tried with itertools.product as it ...\r\n            ",
 '\r\n                <DateTimePicker\n            label="시작일"\n            value={startValue + 1}\n            type=" date"\n            inputFormat={\'yyyy/MM/dd  HH:mm\'}\n            locale={ko}\n        ...\r\n            ',
 '\r\n                Consider this is 

**Extracting Username**

In [13]:
all_question_username_elements = soup.select('.s-user-card--info > .s-user-card--link > a')
all_question_usernames = [element.text for element in all_question_username_elements]
all_question_usernames[:10]

['vashista',
 'user4554890',
 'Anand_is_goat',
 '김도헌',
 'LakshmanPillai',
 'user19640240',
 'JayRain',
 'kennedyas',
 'ka3ak',
 'epic']

**Extracting Question ID**

In [14]:
id_elements = soup.select('.s-post-summary--content > .s-post-summary--content-title > a')
all_question_ids = [element['href'].split('/')[2] for element in id_elements]
all_question_ids[:10]

['73362668',
 '73362667',
 '73362666',
 '73362665',
 '73362664',
 '73362663',
 '73362662',
 '73362658',
 '73362657',
 '73362656']

**Extracting Post Stats**

In [15]:
stats_element = soup.select('.s-post-summary--stats')
questions_upvotes = []
questions_answers = []
questions_views = []
for el in stats_element:
    children = list(el.findChildren('div'))
    questions_upvotes.append(
        children[0].find('span').text
    )
    questions_answers.append(
        children[1].find('span').text
    )
    questions_views.append(
        children[2].find('span').text
    )
print(questions_upvotes[:10])
print(questions_answers[:10])
print(questions_views[:10])

['0', '0', '0', '0', '0', '0', '0', '0', '0', '0']
['0', '0', '0', '0', '0', '0', '0', '0', '0', '0']
['4', '3', '3', '2', '3', '2', '2', '5', '4', '9']


**Extracting Question Meta Tags**

In [16]:
meta_tags_container = soup.find_all('div', 's-post-summary--meta-tags')
all_question_meta_tags = []
for question_tag_container in meta_tags_container:
    temp = []
    for child in question_tag_container.findChildren('a'):
        temp.append(child.text)
    all_question_meta_tags.append(temp)

all_question_meta_tags[:5]

[['python', 'loops', 'multidimensional-array', 'slice'],
 ['typescript', 'mongodb', 'mongoose'],
 ['python', 'python-3.x', 'string', 'list', 'itertools'],
 ['javascript'],
 ['excel', 'powershell']]

## Creating Our Dataframe

In [17]:
from itertools import chain

questions_info_df = pd.DataFrame({
    'id': all_question_ids,
    'title': all_question_titles,
    'excerpt': all_question_excerpts,
    'username': all_question_usernames,
    'upvotes': questions_upvotes,
    'views': questions_views,
    'answers': questions_answers,
})

question_ids_multiplied = list(chain(
    *[[id]*len(meta_tags) for id, meta_tags in zip(all_question_ids, all_question_meta_tags)]
))      # Don't freak out!!! It's easy!!

meta_tags_df = pd.DataFrame({
    'id': question_ids_multiplied,
    'meta_tag': list(chain(*all_question_meta_tags))
})

In [18]:
questions_info_df.head()

Unnamed: 0,id,title,excerpt,username,upvotes,views,answers
0,73362668,Slice multidimensional array,\r\n I have an array (images_ls...,vashista,0,4,0
1,73362667,E11000 duplicate key error collection when create,\r\n I'm developing a chat syst...,user4554890,0,3,0
2,73362666,Print all possible combination of words of len...,\r\n I have a list of 5 letters...,Anand_is_goat,0,3,0
3,73362665,"I want to use the datetimepicker, but how do I...",\r\n <DateTimePicker\n ...,김도헌,0,2,0
4,73362664,How to get the count of repeated values in Pow...,\r\n Consider this is an Excel ...,LakshmanPillai,0,3,0


In [19]:
meta_tags_df.head(10)

Unnamed: 0,id,meta_tag
0,73362668,python
1,73362668,loops
2,73362668,multidimensional-array
3,73362668,slice
4,73362667,typescript
5,73362667,mongodb
6,73362667,mongoose
7,73362666,python
8,73362666,python-3.x
9,73362666,string


## Putting It All Together

In [20]:
def crawl_stackoverflow(pagenumber):
    html = requests.get(f'https://stackoverflow.com/questions?tab=newest&pagesize=50&page={pagenumber}')
    soup = BeautifulSoup(html.text)
    # ---- Titles ---- #
    all_question_title_elements = soup.select('.s-post-summary--content-title > a')
    all_question_titles = [element.text for element in all_question_title_elements]

    # ---- Excerpts ---- #
    all_question_excerpt_elements = soup.find_all('div', 's-post-summary--content-excerpt')
    all_question_excerpts = [element.text for element in all_question_excerpt_elements]

    # ---- Usernames ---- #
    all_question_username_elements = soup.select('.s-user-card--info > .s-user-card--link > a')
    all_question_usernames = [element.text for element in all_question_username_elements]

    # ---- Question IDs ---- #
    id_elements = soup.select('.s-post-summary--content > .s-post-summary--content-title > a')
    all_question_ids = [element['href'].split('/')[2] for element in id_elements]

    # ---- Question Stats ---- #
    stats_element = soup.select('.s-post-summary--stats')
    questions_upvotes = []
    questions_answers = []
    questions_views = []
    for el in stats_element:
        children = list(el.findChildren('div'))
        questions_upvotes.append(
            children[0].find('span').text
        )
        questions_answers.append(
            children[1].find('span').text
        )
        questions_views.append(
            children[2].find('span').text
        )

    # ---- Question Meta Tags ---- #
    meta_tags_container = soup.find_all('div', 's-post-summary--meta-tags')
    all_question_meta_tags = []
    for question_tag_container in meta_tags_container:
        temp = []
        for child in question_tag_container.findChildren('a'):
            temp.append(child.text)
        all_question_meta_tags.append(temp)


    # ---- Creating Dataframes ---- #
    questions_info_df = pd.DataFrame({
        'id': all_question_ids,
        'title': all_question_titles,
        'excerpt': all_question_excerpts,
        'username': all_question_usernames,
        'upvotes': questions_upvotes,
        'views': questions_views,
        'answers': questions_answers,
    })

    question_ids_multiplied = list(chain(
        *[[id]*len(meta_tags) for id, meta_tags in zip(all_question_ids, all_question_meta_tags)]
    ))      # Don't freak out!!! It's easy!!

    meta_tags_df = pd.DataFrame({
        'id': question_ids_multiplied,
        'meta_tag': list(chain(*all_question_meta_tags))
    })

    return questions_info_df, meta_tags_df
    

In [21]:
from tqdm import tqdm

n_pages = 100
question_info_df = []
meta_tags_df = []

for p in tqdm(range(1, n_pages+1)):
    q_df, mt_df = crawl_stackoverflow(p)
    question_info_df.append(q_df.copy())        # duplicated dataframes!!!
    meta_tags_df.append(mt_df.copy())
    # print(q_df.loc[:10, 'id'])
question_info_df = pd.concat(question_info_df)
meta_tags_df = pd.concat(meta_tags_df)

100%|████████████████████████████████████████████████████████████████████████████████| 100/100 [01:49<00:00,  1.09s/it]


In [22]:
print(question_info_df.shape)
question_info_df.head()

(5000, 7)


Unnamed: 0,id,title,excerpt,username,upvotes,views,answers
0,73362668,Slice multidimensional array,\r\n I have an array (images_ls...,vashista,0,4,0
1,73362667,E11000 duplicate key error collection when create,\r\n I'm developing a chat syst...,user4554890,0,3,0
2,73362666,Print all possible combination of words of len...,\r\n I have a list of 5 letters...,Anand_is_goat,0,3,0
3,73362665,"I want to use the datetimepicker, but how do I...",\r\n <DateTimePicker\n ...,김도헌,0,2,0
4,73362664,How to get the count of repeated values in Pow...,\r\n Consider this is an Excel ...,LakshmanPillai,0,3,0


In [23]:
print(meta_tags_df.shape)
meta_tags_df.head()

(15195, 2)


Unnamed: 0,id,meta_tag
0,73362668,python
1,73362668,loops
2,73362668,multidimensional-array
3,73362668,slice
4,73362667,typescript


In [24]:
print(question_info_df.duplicated().sum())
print(meta_tags_df.duplicated().sum())

50
153


In [25]:
question_info_df = question_info_df.drop_duplicates()
meta_tags_df = meta_tags_df.drop_duplicates()

In [26]:
print(question_info_df.excerpt.values[0])
question_info_df.excerpt = question_info_df.excerpt.replace(r'^\r\n\s+', '', regex=True) \
                                                   .replace(r'\r\n\s+$', '', regex=True)


                I have an array (images_lst) having shape (250,500,500), it is basically a list of 250 images having dimensions 500X500. How do I select only the first dimension of the array to use it in a loop given ...
            


In [27]:
question_info_df.dtypes     # <-- we need to do some cleaning

id          object
title       object
excerpt     object
username    object
upvotes     object
views       object
answers     object
dtype: object

In [28]:
question_info_df[['upvotes', 'views', 'answers']] = question_info_df[['upvotes', 'views', 'answers']].astype('int')
question_info_df.dtypes

id          object
title       object
excerpt     object
username    object
upvotes      int32
views        int32
answers      int32
dtype: object

In [29]:
question_info_df = question_info_df.set_index('id')
meta_tags_df = meta_tags_df.set_index('id')

In [30]:
question_info_df.to_csv('question_info.csv')
meta_tags_df.to_csv('meta_tags.csv')

## Some Data Analysis

In [31]:
popular_tags = meta_tags_df.value_counts().sort_values()[::-1]
popular_tags[:10]

meta_tag  
python        813
javascript    573
reactjs       317
java          260
c#            234
html          207
android       194
node.js       168
flutter       157
              154
dtype: int64