# INTRODUÇÃO AO WEB SCRAPING

## Importar as Bibliotecas

In [28]:
import pandas as pd
import requests
from bs4 import BeautifulSoup

Página da qual vamos fazer o scraping:
https://statisticstimes.com/tech/top-computer-languages.php

## Coletar a Página

In [29]:
# Download the ranking page and parse it into a BeautifulSoup tree.
# The timeout keeps the cell from blocking forever on an unresponsive server,
# and raise_for_status() stops here on an HTTP error (4xx/5xx) instead of
# letting the following cells parse an error page.
resposta = requests.get(
    "https://statisticstimes.com/tech/top-computer-languages.php",
    timeout=30,
)
resposta.raise_for_status()
html = resposta.content
soup = BeautifulSoup(html, "html5lib")

## Com a biblioteca BeautifulSoup vamos encontrar o primeiro parágrafo, ou seja, pegar a primeira tag `<p>`

In [30]:
# Look up the first <p> element of the parsed page.
primeiro_paragrafo = soup.find(name="p")
# Leaving the name as the cell's last expression displays the element.
primeiro_paragrafo

<p>Python is the top programming language in TIOBE and PYPL Index. Python has taken a lead of over 6% from C 
in TIOBE. In PYPL, a gap is much wider as top-ranked Python is ahead close to 12.4% from 2nd ranked Java.</p>

In [31]:
# Show only the text content of the paragraph, without the surrounding tag.
primeiro_paragrafo.get_text()

'Python is the top programming language in TIOBE and PYPL Index. Python has taken a lead of over 6% from C \nin TIOBE. In PYPL, a gap is much wider as top-ranked Python is ahead close to 12.4% from 2nd ranked Java.'

## Agora vamos pegar todos os parágrafos da página (veja a função find_all)

In [32]:
# Collect every <p> element of the page; the result can be indexed by position.
todos_paragrafos = soup.find_all("p")
# Only the last expression of a cell is rendered, so the intermediate bare
# expressions (the whole collection and its first element) were dropped:
# they produced no output. Display the text of the first paragraph.
todos_paragrafos[0].text

'Python is the top programming language in TIOBE and PYPL Index. Python has taken a lead of over 6% from C \nin TIOBE. In PYPL, a gap is much wider as top-ranked Python is ahead close to 12.4% from 2nd ranked Java.'

## Agora vamos pegar todos os links

In [33]:
# Collect every <a> (link) element of the page.
todos_links = soup.find_all("a")
# Only the last expression of a cell is rendered, so the bare `todos_links`
# line that used to precede this one was a no-op and has been removed.
# Display the first link found.
todos_links[0]

<a href="/index.php">
<img alt="statisticstimes" height="90" src="/statistics.png" width="200"/>
</a>

## Vamos inspecionar a página para pegar os dados da tabela "PYPL Index (Worldwide)"

In [34]:
# Locate the table whose id attribute is "table_id1", then keep only its
# <tbody>, which holds the data rows.
tabela = (
    soup
    .find("table", id="table_id1")
    .find("tbody")
)
# Display the table body to inspect its row/cell structure.
tabela

<tbody>
<tr><td class="data1">1</td><td class="data1"></td><td class="name">Python</td><td class="data1"> 28.43 %</td><td class="data1">+0.7 %</td></tr>
<tr><td class="data1">2</td><td class="data1"></td><td class="name">Java</td><td class="data1"> 16.04 %</td><td class="data1">-0.1 %</td></tr>
<tr><td class="data1">3</td><td class="data1"></td><td class="name">JavaScript</td><td class="data1"> 8.72 %</td><td class="data1">-0.8 %</td></tr>
<tr><td class="data1">4</td><td class="data1">↑</td><td class="name">C/C++</td><td class="data1"> 6.65 %</td><td class="data1">+0.2 %</td></tr>
<tr><td class="data1">5</td><td class="data1">↓</td><td class="name">C#</td><td class="data1"> 6.63 %</td><td class="data1">-0.2 %</td></tr>
<tr><td class="data1">6</td><td class="data1">↑</td><td class="name">R</td><td class="data1"> 4.63 %</td><td class="data1">+0.2 %</td></tr>
<tr><td class="data1">7</td><td class="data1">↓</td><td class="name">PHP</td><td class="data1"> 4.45 %</td><td class="data1">-0.7 %

In [35]:
# Every <tr> inside the table body is one row of the ranking.
linhas = tabela.find_all("tr")
for linha in linhas:
    dado = linha.find_all("td")
    # Cells 0, 2 and 3 are printed one per line (rank, language name and
    # share in the output), followed by a separator line.
    print(dado[0].text, dado[2].text, dado[3].text, "-----", sep="\n")

1
Python
 28.43 %
-----
2
Java
 16.04 %
-----
3
JavaScript
 8.72 %
-----
4
C/C++
 6.65 %
-----
5
C#
 6.63 %
-----
6
R
 4.63 %
-----
7
PHP
 4.45 %
-----
8
TypeScript
 2.96 %
-----
9
Swift
 2.71 %
-----
10
Rust
 2.53 %
-----
11
Objective-C
 2.43 %
-----
12
Go
 2.16 %
-----
13
Kotlin
 1.93 %
-----
14
Matlab
 1.54 %
-----
15
Dart
 1.01 %
-----
16
Ada
 0.99 %
-----
17
Ruby
 0.97 %
-----
18
VBA
 0.91 %
-----
19
Powershell
 0.76 %
-----
20
Lua
 0.61 %
-----
21
Abap
 0.6 %
-----
22
Scala
 0.58 %
-----
23
Visual Basic
 0.42 %
-----
24
Groovy
 0.35 %
-----
25
Julia
 0.33 %
-----
26
Perl
 0.22 %
-----
27
Haskell
 0.17 %
-----
28
Cobol
 0.14 %
-----
29
Delphi/Pascal
 0.13 %
-----


## Montar uma lista de linguagens de programação e de porcentagem com os dados, para depois compor o DataFrame

In [36]:
# Gather the <td> cells of each row once, then slice out the two columns
# needed for the DataFrame.
celulas_por_linha = [linha.find_all("td") for linha in linhas]

# Cell 2 holds the language name; cell 3 holds the share exactly as scraped
# (the text keeps its leading space and the trailing " %").
linguagem = [celulas[2].text for celulas in celulas_por_linha]
pontos = [celulas[3].text for celulas in celulas_por_linha]

In [37]:
print(linguagem)

['Python', 'Java', 'JavaScript', 'C/C++', 'C#', 'R', 'PHP', 'TypeScript', 'Swift', 'Rust', 'Objective-C', 'Go', 'Kotlin', 'Matlab', 'Dart', 'Ada', 'Ruby', 'VBA', 'Powershell', 'Lua', 'Abap', 'Scala', 'Visual Basic', 'Groovy', 'Julia', 'Perl', 'Haskell', 'Cobol', 'Delphi/Pascal']


In [38]:
print(pontos)

[' 28.43 %', ' 16.04 %', ' 8.72 %', ' 6.65 %', ' 6.63 %', ' 4.63 %', ' 4.45 %', ' 2.96 %', ' 2.71 %', ' 2.53 %', ' 2.43 %', ' 2.16 %', ' 1.93 %', ' 1.54 %', ' 1.01 %', ' 0.99 %', ' 0.97 %', ' 0.91 %', ' 0.76 %', ' 0.61 %', ' 0.6 %', ' 0.58 %', ' 0.42 %', ' 0.35 %', ' 0.33 %', ' 0.22 %', ' 0.17 %', ' 0.14 %', ' 0.13 %']


In [39]:
# Assemble both scraped lists into a single two-column DataFrame in one step;
# the dict's key order defines the column order.
dados = pd.DataFrame({"Linguagem": linguagem, "Pontos": pontos})
# Display the resulting table.
dados

Unnamed: 0,Linguagem,Pontos
0,Python,28.43 %
1,Java,16.04 %
2,JavaScript,8.72 %
3,C/C++,6.65 %
4,C#,6.63 %
5,R,4.63 %
6,PHP,4.45 %
7,TypeScript,2.96 %
8,Swift,2.71 %
9,Rust,2.53 %
