In [4]:
import json
import os
import re
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

In [7]:
# Output locations: downloaded HTML pages and the generated XML annotations.
articles_dir = 'Articles'
annotation_dir = 'Annotations'

# Create both directories if needed. `exist_ok=True` makes the cell safe to
# re-run; the earlier version passed an undefined name (`path`) to makedirs.
for directory in (articles_dir, annotation_dir):
    os.makedirs(directory, exist_ok=True)

## Download all the articles

In [None]:
# Overview pages that are scanned for article links, listed in
# chronological order (2014 through 2022).
overview_pages = [
    'https://www.nytimes.com/interactive/2014/12/29/us/year-in-interactive-storytelling.html',
    'https://www.nytimes.com/interactive/2015/us/year-in-interactive-storytelling.html',
    'https://www.nytimes.com/interactive/2016/12/28/us/year-in-interactive-graphics.html',
    'https://www.nytimes.com/interactive/2017/12/21/us/2017-year-in-graphics.html',
    'https://www.nytimes.com/interactive/2018/us/2018-year-in-graphics.html',
    'https://www.nytimes.com/interactive/2019/12/30/us/2019-year-in-graphics.html',
    'https://www.nytimes.com/interactive/2020/12/30/us/2020-year-in-graphics.html',
    'https://www.nytimes.com/interactive/2021/12/29/us/2021-year-in-graphics.html',
    'https://www.nytimes.com/interactive/2022/12/28/us/2022-year-in-graphics.html',
]

In [None]:
# Restrict the crawl to the two most recent overview pages.
overview_pages = overview_pages[-2:]

# Collect the article URLs linked from each overview page.
all_links = []
for page in overview_pages:
    try:
        # A timeout keeps a stalled connection from hanging the notebook.
        response = requests.get(page, timeout=30)
    except requests.RequestException as exc:
        print(f'Could not fetch {page}: {exc!r}')
        continue

    # A Response is falsy for 4xx/5xx status codes.
    if not response:
        continue

    soup = BeautifulSoup(response.text, "lxml")
    for anchor in soup.find_all('a'):
        url = anchor.get('href')
        # Anchors without an href yield None, which re.search cannot handle.
        if not url:
            continue
        # Keep links that contain a year and are not overview pages themselves.
        if re.search(r'20\d{2}', url) and not re.search('year-in-graphics', url):
            # Resolve relative hrefs against the page they were found on;
            # absolute URLs are returned unchanged by urljoin.
            all_links.append(urljoin(page, url))

# De-duplicate; sorting makes the order reproducible between runs.
all_links = sorted(set(all_links))

In [None]:
# Download every collected article into `articles_dir`, one file per URL.
for article in all_links:
    name = os.path.basename(article)
    if not name:
        # A URL ending in "/" has no file name to save under.
        print(f'Skipping {article}: no file name in URL')
        continue

    try:
        # A timeout keeps a stalled connection from hanging the notebook.
        response = requests.get(article, timeout=30)
    except requests.RequestException as exc:
        print(f'Could not download {article}: {exc!r}')
        continue

    # A Response is falsy for 4xx/5xx status codes.
    if not response:
        continue

    # Open the file only after a successful download, so a failed request
    # neither leaves an empty file behind nor truncates an earlier copy.
    path = os.path.join(articles_dir, name)
    with open(path, 'w', encoding='utf-8') as out:
        out.write(response.text)
    

## Identification of modalities

In [5]:
def get_attribute_value(attribute, node=None):
    """Return the value of `attribute` on a parsed element as a single string.

    Parameters
    ----------
    attribute : str
        Name of the attribute to read, e.g. ``'class'``.
    node : optional
        Object exposing ``.get(name)``. When omitted, the module-level
        variable ``element`` is used, which keeps existing one-argument
        calls working.

    Returns
    -------
    str or None
        List values are joined with single spaces; any other value
        (including None for a missing attribute) is returned unchanged.
    """
    if node is None:
        # Backward-compatible fallback: one-argument callers rely on the
        # loop variable `element` of the calling cell.
        node = element

    value = node.get(attribute)
    if isinstance(value, list):
        return ' '.join(value)
    return value
    
def analyse_transitions(components):
    """Collapse runs of equal consecutive items into a single entry.

    The comparison starts from an empty string, so empty strings at the
    very beginning of `components` are dropped. Returns a new list.
    """
    transitions = []
    for component in components:
        # The last kept item is always the previous input item.
        last_seen = transitions[-1] if transitions else ''
        if component != last_seen:
            transitions.append(component)
    return transitions
    

In [8]:
# Files left out of the dataset. Filtering by membership avoids the
# ValueError that list.remove() raises when one of them is not on disk.
excluded_files = {'wordle-bot.html', 'index.html'}

# os.listdir returns names in arbitrary order; sorting keeps the list (and
# any numbering derived from it) the same on every run.
html_files = sorted(
    html_file
    for html_file in os.listdir(articles_dir)
    if re.search(r'html$', html_file) and html_file not in excluded_files
)

print( f'The dataset contains {len(html_files)} articles.' )

The dataset contains 245 articles.


In [24]:
# Build one XML annotation file per downloaded article.
#
# For every HTML file the cell
#   1. derives a title from the file name and a year from the <meta> tags,
#   2. walks the <body> and records a (mode, text) tuple for each element
#      that matches one of the tag/class rules below,
#   3. falls back to the slide list embedded in a "var props =" script when
#      no element matched,
#   4. writes the recorded components to <annotation_dir>/<name>.xml.
annotations = []

article_id = 0

for file in html_files:

    article_id += 1

    # Title: file name without extension, separators turned into spaces.
    article_title = re.sub( r'\.html' , '' , file )
    article_title = re.sub( r'[_-]' , ' ' , article_title )
    article_title = article_title.title()
    article_publisher = 'New York Times'

    path = os.path.join(articles_dir, file)
    with open(path, encoding='utf-8') as fh:
        html_page = fh.read()

    # Empty files carry nothing to annotate.
    if len(html_page) == 0:
        continue

    try:
        components = []

        # Reset per article so a page without a dated <meta> tag does not
        # inherit the year found for the previous article.
        year_publication = ''

        soup = BeautifulSoup( html_page , "lxml")

        # The last <meta> whose content holds a 20xx number sets the year.
        for m in soup.find_all('meta'):
            years = re.findall( r'20\d{2}' , str(m.get('content')) )
            if len(years) > 0:
                year_publication = years[0]

        body = soup.find('body')
        # Without a <body> there is nothing to walk; the script fallback
        # below still gets a chance to run.
        elements = body.findChildren() if body is not None else []

        # NOTE: the loop variable must stay named `element` because
        # get_attribute_value('class') reads it when called with one argument.
        for element in elements:

            if element.name == 'p':

                class_value = str(get_attribute_value('class'))
                text = element.text.strip()

                if class_value == 'g-body':
                    components.append( ('text-flow', text) )
                elif class_value == 'g-fg':
                    components.append( ('dynamic-text', text) )
                elif re.search( 'g-cue' , class_value ):
                    components.append( ('subtitle', text) )
                elif re.search( 'svelte-1lru8e5' , class_value ):
                    components.append( ('text-flow', text) )

                # Independent of the class rules: paragraphs that start with
                # a scroll prompt are also recorded as scroll cues.
                if re.search( r'^scroll\b' , text , re.IGNORECASE ):
                    components.append( ('scroll', text) )

                if re.search( r'^continue\s+reading\b' , text , re.IGNORECASE ):
                    components.append( ('scroll', text) )

            elif element.name == 'img':

                class_value = str(get_attribute_value('class'))

                if re.search( r'g-overlay-play' , class_value ):
                    components.append( ('dynamic-image-flow', element.text.strip()) )
                else:
                    components.append( ('static-image', element.text.strip()) )

            elif element.name == 'span':

                class_value = str(get_attribute_value('class'))

                if re.search( r'audio-player-icon' , class_value ):
                    components.append( ('button', element.text.strip()) )

            elif element.name == 'button':

                components.append( ('button', element.text.strip()) )

            elif element.name == 'div':

                class_value = str(get_attribute_value('class'))

                if re.search( r'g-scroll-to-continue' , class_value ):
                    components.append( ('scroll', element.text.strip()) )

                elif re.search( r'g-video' , class_value ):
                    components.append( ('dynamic-image-flow', element.text.strip()) )

                elif re.search( r'g-audio' , class_value ):
                    text = re.sub( r'\s+' , ' ' , element.text.strip() )
                    # Containers flagged data-type="audio" are not recorded.
                    if element.get('data-type') != 'audio':
                        components.append( ('audio', text) )

                elif re.search( r'g-graphic' , class_value ):
                    text = re.sub( r'\s+' , ' ' , element.text.strip() )
                    components.append( ('static-image', text) )

                elif re.search( r'g-annotation' , class_value ) and not re.search( r'g-annotation-inner' , class_value ):
                    text = re.sub( r'\s+' , ' ' , element.text.strip() )
                    components.append( ('dynamic-text', text) )

                elif re.search( r'text-unit' , class_value ):
                    components.append( ('text-flow', element.text.strip()) )

                elif re.search( r'g-image' , class_value ):
                    components.append( ('static-image', element.text.strip()) )

                elif re.search( r'g-scrollingparty-container' , class_value ):
                    components.append( ('dynamic-image-flow', element.text.strip()) )

                elif re.search( r'g-edplaybtn' , class_value ):
                    components.append( ('button', element.text.strip()) )

                elif re.search( r'tapbtn' , class_value ):
                    components.append( ('button', element.text.strip()) )

        # Fallback: no element matched, so look for the slide list embedded
        # in a script of the form "var props = {... slides: [...], processedSlides: ...}".
        if len(components) == 0:

            for s in soup.find_all('script'):
                script_source = str(s)
                if not re.search( 'var props =' , script_source ):
                    continue

                json_str = ''
                for line in re.split( r'\n' , script_source ):
                    if re.search( 'var props =' , line ):
                        # Cut the line down to the value of "slides:" ...
                        json_str = re.sub( 'var props =' , '' , line ).strip()
                        index_slide = json_str.index('slides:') + len('slides:')
                        json_str = json_str[index_slide:]
                        json_str = re.sub( r'}$' , '' , json_str.strip() ).strip()
                        json_str = re.sub( r'[\t\n]+' , '' , json_str )

                        # ... and drop everything from "processedSlides:" on.
                        index_slide = json_str.index('processedSlides:')
                        json_str = json_str[:index_slide]
                        json_str = re.sub( ',$' , '' , json_str.strip() )

                if len(json_str) > 0:
                    # Dump of the most recently extracted slide list.
                    with open('article.json', 'w', encoding='utf-8') as json_out:
                        json_out.write(json_str)

                    json_data = json.loads(json_str)
                    for item in json_data:
                        # Record (mode, text) tuples like the rules above do;
                        # bare strings would make modality[0] a single
                        # character in the writer below.
                        for key, mode in ( ('video', 'video'),
                                           ('image', 'static-image'),
                                           ('text', 'text-flow') ):
                            if key in item:
                                value = item.get(key, '') if isinstance(item, dict) else ''
                                components.append( (mode, str(value)) )
                                break

        if len(components) > 0:

            tr_in = 'none'
            tr_out = 'scroll'

            xml_file = re.sub( r'\.html' , '.xml' , file )
            out_path = os.path.join(annotation_dir, xml_file)

            # The context manager closes the file even if a write fails.
            with open( out_path , 'w' , encoding = 'utf-8') as out:

                out.write(f'<article title="{article_title}" publisher="{article_publisher}" year="{year_publication}" article_identifier="{(article_id):04d}">')

                current = ''
                for position, modality in enumerate(components, start=1):
                    mode = modality[0]

                    # Moving-image modes are dynamic screens, all others static.
                    if mode in ('dynamic-image-flow', 'video'):
                        screen_type = 'dynamic'
                    else:
                        screen_type = 'static'

                    if mode == 'scroll':
                        out.write( f'\n<scroll identifier="{position:04d}"/>')

                    # A new screen starts when the mode changes; buttons
                    # always start their own screen.
                    if mode != current or mode == 'button':

                        out.write( f'\n<screen identifier="{position:04d}" mode="{mode}" type="{screen_type}" transition_in="{tr_in}" transition_out="{tr_out}"/>')

                        # "--" is not allowed inside an XML comment.
                        component_contents = re.sub( r'--' , '' , str(modality[1]) )
                        out.write( f'\n<!--{component_contents} -->' )

                        tr_in = tr_out
                        current = mode

                out.write('\n</article>')

    except Exception as exc:
        # Keep processing the remaining articles, but report what went wrong.
        print(f'Problem with {file}: {exc!r}')


print('Done!')

Done!


In [26]:
# Print the name of every article file whose markup contains "g-beforeafter".
for file in html_files:
    with open(os.path.join(articles_dir, file), encoding='utf-8') as fh:
        html_page = fh.read()

    if 'g-beforeafter' in html_page:
        print(file)

blm-george-floyd-mural.html


In [None]:
# american-sign-language-changes.html
# blm-george-floyd-mural.html