In [1]:
import os
from datetime import date
from urllib.error import HTTPError
from urllib.request import urlopen

import pandas as pd
from bs4 import BeautifulSoup

# Create archive links

In [2]:
def create_archive_links(year_start, year_end, month_start, month_end, day_start, day_end):
    """Build the daily archive URLs of lemonde.fr, grouped by year.

    Every (month, day) pair from the inclusive ranges
    ``month_start..month_end`` and ``day_start..day_end`` is combined with
    every year in ``year_start..year_end``. Combinations that are not real
    calendar dates (for example 30-02 or 31-04) are skipped, so no URL is
    produced for a day that does not exist.

    Args:
        year_start: First year (inclusive).
        year_end: Last year (inclusive).
        month_start: First month number (inclusive).
        month_end: Last month number (inclusive).
        day_start: First day of month (inclusive).
        day_end: Last day of month (inclusive).

    Returns:
        dict mapping each year to a list of URLs of the form
        ``https://www.lemonde.fr/archives-du-monde/DD-MM-YYYY/``, ordered by
        month and then by day.
    """
    archive_links = {}
    for y in range(year_start, year_end + 1):
        year_links = []
        for m in range(month_start, month_end + 1):
            for d in range(day_start, day_end + 1):
                try:
                    # Used only as a validity check: raises ValueError for
                    # impossible dates such as 31 February.
                    date(y, m, d)
                except ValueError:
                    continue
                year_links.append(
                    "https://www.lemonde.fr/archives-du-monde/"
                    + "{:02d}-{:02d}-{}".format(d, m, y) + "/")
        archive_links[y] = year_links
    return archive_links

In [3]:
# Request day-of-month 1-31 for every month of 2000 through 2020.
archive_links = create_archive_links(
    year_start=2000, year_end=2020,
    month_start=1, month_end=12,
    day_start=1, day_end=31,
)

# Scrape

In [None]:
def get_articles_links(archive_links):
    """Collect the links of the free articles listed on archive pages.

    Each archive page is downloaded and parsed. Articles are the elements
    carrying the ``teaser`` class. A teaser is skipped when:

    * it contains a ``<span class="icon__premium">`` (subscriber-only article),
    * it has no ``<a>`` element, or the anchor has no ``href``,
    * its link contains ``en-direct`` (video/live pages according to the
      original author's note).

    Args:
        archive_links: Iterable of archive page URLs.

    Returns:
        list of the ``href`` values of the retained articles, in page order.
        Pages answering with an HTTP error are reported on stdout and skipped.
    """
    links_non_abonne = []
    for link in archive_links:
        try:
            response = urlopen(link)
        except HTTPError:
            print("url not valid", link)
            continue
        # Parse inside the context manager so the connection is always closed.
        with response:
            soup = BeautifulSoup(response, "html.parser")
        for item in soup.find_all(class_="teaser"):
            # Premium marker present: article reserved to subscribers.
            if item.find('span', {'class': 'icon__premium'}):
                continue
            anchor = item.find('a')
            l_article = anchor.get('href') if anchor is not None else None
            if not l_article:
                # Teaser without a usable link: nothing to collect.
                continue
            # en-direct = video
            if 'en-direct' not in l_article:
                links_non_abonne.append(l_article)
    return links_non_abonne

In [None]:
def get_single_page(url):
    """Download one article page and extract its title, body and tag elements.

    Args:
        url: Address of the article page.

    Returns:
        ``None`` when the server answers with an HTTP error (a message is
        printed). Otherwise a three-item list ``[title, body, tags]``:

        * ``title``: the first ``<h1>`` element, or the string ``'empty'``
          when the page has none;
        * ``body``: the ``<p>`` and ``<h2>`` elements that are direct children
          of the first ``<article>`` element, or ``'empty'`` when the page has
          no ``<article>``;
        * ``tags``: every ``<li class="old__nav-content-list-item">`` element
          (an empty result set when there is none).
    """
    try:
        response = urlopen(url)
    except HTTPError:
        print("url not valid", url)
        return None
    # Parse inside the context manager so the connection is always closed.
    with response:
        soup = BeautifulSoup(response, "html.parser")

    # find() reports a missing element by returning None, it does not raise,
    # so the fallback has to be an explicit check.
    text_title = soup.find('h1')
    if text_title is None:
        text_title = 'empty'

    article = soup.article
    if article is None:
        text_body = 'empty'
    else:
        text_body = article.find_all(["p", "h2"], recursive=False)

    tag = soup.find_all('li', attrs={'class': 'old__nav-content-list-item'})
    return [text_title, text_body, tag]

In [None]:
# Empty result table: one column for the archive year, one for the article link.
result_columns = ['Year', 'Html']
df = pd.DataFrame(columns=result_columns)

In [None]:
# Gather one (year, article link) record per scraped link and build the frame
# once at the end. DataFrame.append was removed in pandas 2.0, and growing a
# frame inside a loop copies it on every iteration. Building `df` from scratch
# also makes this cell safe to re-run without duplicating rows.
# If the loop is interrupted, the links gathered so far stay in
# `article_records`.
article_records = []
for year, links in archive_links.items():
    print("processing: ", year)
    article_records.extend(
        (year, article_link) for article_link in get_articles_links(links))
df = pd.DataFrame(article_records, columns=['Year', 'Html'])

processing:  2000
processing:  2001
processing:  2002
processing:  2003
processing:  2004
processing:  2005
processing:  2006
processing:  2007
processing:  2008
processing:  2009
processing:  2010
processing:  2011
processing:  2012
processing:  2013


In [None]:
# Preview the first five collected rows.
df.head(n=5)

In [None]:
# Imported for its side effect: it provides the `.swifter` accessor used below.
import swifter

# Download and parse every article; each result is stored in the 'out' column.
html_column = df['Html']
df['out'] = html_column.swifter.apply(get_single_page)

In [None]:
# Persist the table without the index column.
df.to_csv(path_or_buf='out.csv', index=False)

In [None]:
# Spot-check of the tag selector on one article page.
sample_url = 'https://www.lemonde.fr/ameriques/article/2006/01/01/les-zapatistes-lancent-une-autre-campagne-a-six-mois-de-la-presidentielle-mexicaine_726244_3222.html'
# Parse inside the context manager so the connection is closed afterwards.
with urlopen(sample_url) as html:
    soup = BeautifulSoup(html, "html.parser")
soup.find_all('li', attrs={'class': 'old__nav-content-list-item'})

In [None]:
def keep(x, nbr):
    """Flag whether ``x`` is longer than ``nbr``.

    Args:
        x: Value to measure with ``len``.
        nbr: Length threshold (exclusive).

    Returns:
        1 when ``len(x) > nbr``, otherwise 0. Values that have no length
        (``None``, or the float NaN pandas uses for missing cells) count as
        not kept and return 0 instead of raising ``TypeError``.
    """
    try:
        size = len(x)
    except TypeError:
        # Missing value: there is nothing to keep.
        return 0
    return 1 if size > nbr else 0

In [None]:
# Flag titles and bodies whose length exceeds 10.
# NOTE(review): no visible cell creates the 'Titre' and 'Body' columns -
# confirm where they come from before running this cell on a fresh kernel.
for source_column, flag_column in (('Titre', 'Titre_OK'), ('Body', 'Body_OK')):
    df[flag_column] = df[source_column].apply(keep, args=(10,))

In [None]:
# Percentage of rows whose body passed the length check.
kept_bodies = df['Body_OK'].sum()
total_bodies = len(df['Body'])
kept_bodies / total_bodies * 100