## Web scraping using Python

#### References
1. [Practical Introduction to Web Scraping in Python](https://realpython.com/python-web-scraping-practical-introduction/)
2. [Web Scraping using Python](https://www.datacamp.com/community/tutorials/web-scraping-using-python)

In [None]:
# $ python3 -m venv venv
# $ . ./venv/bin/activate

In [1]:
#Better
!pip install requests BeautifulSoup4 fire



In [2]:
from requests import get
from requests.exceptions import RequestException
from contextlib import closing
from bs4 import BeautifulSoup
# from selenium import webdriver
# from selenium.webdriver.common.keys import Keys
# from selenium.webdriver.common.by import By
from itertools import groupby 

import pandas as pd
import os, sys
import re
import requests
import csv

import fire

In [3]:
#%%writefile ../pyscrap_url.py

def simple_get(url):
    """
    Fetch `url` with an HTTP GET request and return the raw response body.

    The body (`resp.content`) is returned only when `is_good_response`
    accepts the response; otherwise the result is None. A
    `RequestException` raised while fetching is reported through
    `log_error` and also results in None.
    """
    try:
        # closing() makes sure the streamed response is closed on exit.
        with closing(get(url, stream=True)) as resp:
            return resp.content if is_good_response(resp) else None
    except RequestException as exc:
        log_error('Error during requests to {0} : {1}'.format(url, str(exc)))
        return None


def is_good_response(resp):
    """
    Returns True if the response seems to be HTML, False otherwise.

    A response qualifies when its status code is 200 and its
    Content-Type header contains 'html' (case-insensitive). A response
    with no Content-Type header is reported as not HTML rather than
    raising a KeyError.
    """
    # .get() tolerates a missing header; `or ''` also covers a header
    # whose value is None, so .lower() is always called on a string.
    content_type = (resp.headers.get('Content-Type') or '').lower()
    return resp.status_code == 200 and 'html' in content_type


def log_error(e):
    """
    Report an error.

    The error is simply printed to standard output; replace the body
    with real logging if something more durable is needed.
    """
    print(e)
    
def get_elements(url, tag='', search=None, fname=None):
    """
    Parse a page and collect text lines and/or elements from it.

    Parameters
    ----------
    url : str or page content
        A string is downloaded with `simple_get`; any other value is
        treated as already-loaded page content and parsed directly.
    tag : str, optional
        Selector passed to ``select``. The text of each match is split
        on newlines and every non-empty line is stripped and appended
        to the result.
    search : dict, optional
        May contain the keys 'find' and/or 'find_all', each mapping to
        a dict of keyword arguments for the parser method of the same
        name. 'find' narrows the scope; 'find_all' then searches inside
        that scope (or the whole page when 'find' is absent).
    fname : optional
        Currently unused; kept so existing callers keep working.

    Returns
    -------
    list
        Lines gathered via `tag`, followed by the items gathered via
        `search`.

    Raises
    ------
    Exception
        If no content could be obtained for `url`.
    """
    if isinstance(url, str):
        response = simple_get(url)
    else:
        # Anything that is not a string is taken to be a page that was
        # already loaded.
        response = url

    if response is None:
        # Raise an exception if we failed to get any data from the url
        raise Exception('Error retrieving contents at {}'.format(url))

    html = BeautifulSoup(response, 'html.parser')
    res = []

    if tag:
        for element in html.select(tag):
            for name in element.text.split('\n'):
                if len(name) > 0:
                    res.append(name.strip())

    if search:
        scope = html
        matches = None

        if 'find' in search:
            print('finding', search['find'])
            scope = scope.find(**search['find'])
            matches = scope

        if 'find_all' in search:
            print('finding all of', search['find_all'])
            # A failed 'find' leaves scope as None; there is nothing to
            # search inside, so report no matches instead of crashing.
            if scope is None:
                matches = None
            else:
                matches = scope.find_all(**search['find_all'])

        if matches:
            for x in matches:
                if len(x) > 0:
                    # extend() adds the items obtained by iterating x,
                    # not x itself.
                    res.extend(x)

    return res
    
    
# Expose get_elements as a command-line tool when this code is run as a
# plain script. Inside IPython/Jupyter `__name__` is '__main__' as well,
# so the CLI is explicitly skipped there.
try:
    get_ipython  # only defined when running under IPython
    _in_ipython = True
except NameError:
    _in_ipython = False

if __name__ == '__main__' and not _in_ipython:
    fire.Fire(get_elements)

In [4]:
# Download the africafreak ranking page, then collect its <h2> text lines
# together with whatever get_elements gathers for class "twitter-tweet".
url = 'https://africafreak.com/100-most-influential-twitter-users-in-africa'
response = simple_get(url)
res = get_elements(
    response,
    tag='h2',
    search={'find_all': {'class_': 'twitter-tweet'}},
)

# Render the collected items as one string and strip any markup from it.
str_cells = str(res)
cleantext2 = BeautifulSoup(str_cells, "lxml").get_text()
#print(cleantext2)

finding all of {'class_': 'twitter-tweet'}


In [6]:
# Extract the Twitter handle from every entry of the form "... (@handle)".
# The pattern is compiled once and requires at least one word character,
# so a stray "(@" never produces a bare "@".
handle_pattern = re.compile(r'\(@(\w+)')

dataa = []
for item in res:
    # res may also hold non-text objects (get_elements extends it with
    # parsed elements); only text entries can be searched with a regex.
    if not isinstance(item, str):
        continue
    found = handle_pattern.search(item)
    if found is not None:
        dataa.append({'value': '@' + found.group(1)})

column = "Top 10 African Influencers twitter Handles"
# Declaring the 'value' column up front keeps this working even when no
# handle was found (assigning df.columns on an empty frame would fail).
df = pd.DataFrame(dataa, columns=['value']).rename(columns={'value': column})

# Reverse the row order and keep the first ten, i.e. the last ten
# entries found on the page.
df1 = df.sort_index(ascending=False)
df2 = df1.head(10)
df2.to_csv(r'african_influencers.csv', index=False)
df2

#print(df1.head(10).to_csv(r'african_influencers.csv'))



Unnamed: 0,Top 10 African Influencers twitter Handles
99,@Trevornoah
98,@GarethCliff
97,@SAPresident
96,@News24
95,@Julius_S_Malema
94,@helenzille
93,@mailandguardian
92,@5FM
91,@loyisogola
90,@Computicket


In [7]:
# Second source: the Atlantic Council article on African leaders' tweets.
url2 = 'https://www.atlanticcouncil.org/blogs/africasource/african-leaders-respond-to-coronavirus-on-twitter/#east-africa'
response = simple_get(url2)

In [8]:
res1 = get_elements(response, search={'find_all': {'class_': 'twitter-tweet'}})

# Render the collected items as one string and strip any markup from it.
str_cells2 = str(res1)
cleantext = BeautifulSoup(str_cells2, "lxml").get_text()

# Compiled once; `\w+` avoids recording a bare "@" as a handle.
handle_re = re.compile(r'@(\w+)')

data = []
# NOTE(review): splitting on "20," presumably separates the individual
# tweets via their date text - confirm against the page content.
for item in cleantext.split("20,"):
    found = handle_re.search(item)
    if found is not None:
        data.append({'value': found.group(0)})

column = "Twitter Handles of 10 African leaders"
# Declaring the 'value' column up front keeps this working even when no
# handle was found (assigning df.columns on an empty frame would fail).
df = pd.DataFrame(data, columns=['value']).rename(columns={'value': column})
df1 = df.head(10)
df1.to_csv(r'african_leaders.csv', index=False)
df1

#print(df1.head(10).to_csv(r'african_influencers.csv'))

finding all of {'class_': 'twitter-tweet'}


Unnamed: 0,Twitter Handles of 10 African leaders
0,@EswatiniGovern1
1,@MalawiGovt
2,@hagegeingob
3,@FinanceSC
4,@PresidencyZA
5,@Dora_Siliya
6,@edmnangagwa
7,@MinSantedj
8,@hawelti
9,@StateHouseKenya


## Web scraping using a bash script
If the website has fairly simple HTML, you can use `curl` to perform the request and then extract the needed values with shell commands such as `grep`, `cut`, and `sed`.

This tutorial is adapted from [this Medium article](https://medium.com/@LiliSousa/web-scraping-with-bash-690e4ee7f98d).

In [9]:
# %%bash 

# # curl the page and save content to tmp_file
# #url = "https://www.atlanticcouncil.org/blogs/africasource/african-leaders-respond-to-coronavirus-on-twitter/#east-africa"
# #curl -X GET $url -o tmp_file


# #!/bin/bash

# # write headers to CSV file
# echo "Name, twitter_id" >> extractData.csv
# n="1"
# while [ $n -lt 2 ]
# do
  
#   #get title
#   title=$(cat tmp_file | grep "class=\"twitter-tweet\"" | cut -d ';' -f1 )
#   echo $title
#   #get author
#   #twitter_id=$(cat tmp_file |grep -A1 "class=\"css-901oao css-16my406 r-1qd0xha r-ad9z0x r-bcqeeo r-qvutc0\"" | tail -1)

#   #echo "$title, $twitter_id" >> extractData.csv
#   #echo "$title, $twitter_id"
    
#   n=$[$n+1]

# done