In [None]:
from zipfile import ZipFile 
from io import BytesIO
import re
from bs4 import BeautifulSoup as bs

import pandas as pd
import csv

In [None]:
# Mount Google Drive at /drive; every input and output path used below lives under /drive/MyDrive.
from google.colab import drive
drive.mount('/drive')

Drive already mounted at /drive; to attempt to forcibly remount, call drive.mount("/drive", force_remount=True).


In [None]:
# Loading ProQuest Data
# The archives are numbered 1 through 11 and their file names carry that number
# zero-padded to three digits (001 ... 011), so a single format spec builds them all.
files = [
    f'/drive/MyDrive/LDW 2021/Input/ProQuest-Datathon2021-Challenge-20210205T233945Z-{i:03d}.zip'
    for i in range(1, 12)
]

print(files)

['/drive/MyDrive/LDW 2021/ProQuest-Datathon2021-Challenge-20210205T233945Z-001.zip', '/drive/MyDrive/LDW 2021/ProQuest-Datathon2021-Challenge-20210205T233945Z-002.zip', '/drive/MyDrive/LDW 2021/ProQuest-Datathon2021-Challenge-20210205T233945Z-003.zip', '/drive/MyDrive/LDW 2021/ProQuest-Datathon2021-Challenge-20210205T233945Z-004.zip', '/drive/MyDrive/LDW 2021/ProQuest-Datathon2021-Challenge-20210205T233945Z-005.zip', '/drive/MyDrive/LDW 2021/ProQuest-Datathon2021-Challenge-20210205T233945Z-006.zip', '/drive/MyDrive/LDW 2021/ProQuest-Datathon2021-Challenge-20210205T233945Z-007.zip', '/drive/MyDrive/LDW 2021/ProQuest-Datathon2021-Challenge-20210205T233945Z-008.zip', '/drive/MyDrive/LDW 2021/ProQuest-Datathon2021-Challenge-20210205T233945Z-009.zip', '/drive/MyDrive/LDW 2021/ProQuest-Datathon2021-Challenge-20210205T233945Z-010.zip', '/drive/MyDrive/LDW 2021/ProQuest-Datathon2021-Challenge-20210205T233945Z-011.zip']


In [None]:
# Creating Hash Table to Extract only Advertisement Data
# dictionary with structure: { RecordID <string> : True <bool> }
# A RecordID that is not a key of the dictionary is not in the ad index.
# NOTE: the last 10 lines of the downloaded file were deleted by hand because they hold unrelated (meta?)data
def _load_record_ids(csv_path):
    """Return {first-column value: True} for every non-empty row of the CSV at csv_path.

    Blank lines (which csv.reader yields as empty lists) are skipped so that a
    stray empty line left over from manual trimming does not raise IndexError.
    The header row, if present, is included like any other row; the caller removes it.
    """
    # newline='' is what the csv module documents for files handed to csv.reader.
    with open(csv_path, mode='r', newline='') as csv_file:
        return {row[0]: True for row in csv.reader(csv_file) if row}


nytFilePath = '/drive/MyDrive/LDW 2021/Input/NewYorkTimes-Advertisement-List.csv'
adwFilePath = '/drive/MyDrive/LDW 2021/Input/AtlantaDailyWorld-Advertisement-List.csv'

nytDict = _load_record_ids(nytFilePath)
adwDict = _load_record_ids(adwFilePath)

# Merge both newspapers into one lookup table.
indexDict = {**nytDict, **adwDict}
# Drop the header cell that was read as if it were a record; pop() with a default
# does not fail when the header row is absent.
indexDict.pop('RecordID', None)

print('Number of Total Advertisements:', len(indexDict))

Number of Total Advertisements: 2133468


In [None]:
def _tag_text(soup, tag_name):
  """Return the text of the first `tag_name` element in `soup`, or None when the tag is absent."""
  tag = soup.find(tag_name)
  return tag.get_text() if tag is not None else None


def _extract_ad_record(content):
  """Parse one XML document and return the fields kept for each advertisement as a flat dict."""
  soup = bs(content, "xml")
  return {
      'RecordID': _tag_text(soup, 'RecordID'),
      'text': _tag_text(soup, 'FullText'),
      'pubDate': _tag_text(soup, 'NumericPubDate'),
      'publisher': _tag_text(soup, 'Publisher'),
      'type': _tag_text(soup, 'ObjectType'),
  }


def parse_zip(zip_file, ad_index=None):
  """Collect advertisement records from the nested archives inside `zip_file`.

  Every member of `zip_file` whose name ends in '.zip' is opened in memory as a
  second-level archive. Each member of that inner archive is counted, and it is
  parsed only if the part of its name before the first '.' is found in `ad_index`.
  NOTE(review): this lookup key assumes inner members are named '<RecordID>.xml'
  with no dotted directory prefix - confirm against the archive layout.

  Args:
    zip_file: Path (or file-like object) accepted by `zipfile.ZipFile`.
    ad_index: Container supporting `in` that holds the advertisement record IDs.
      Defaults to the notebook-level `indexDict`.

  Returns:
    A pandas DataFrame with one row per advertisement found and the columns
    RecordID, text, pubDate, publisher and type. A field whose XML tag is missing
    is stored as None. The frame is empty when no advertisement is found.
  """
  if ad_index is None:
    ad_index = indexDict

  ad_list = []
  counter = 0
  with ZipFile(zip_file, 'r') as zfile:
    for name in zfile.namelist():
      # Only nested archives are of interest (we have a zip within a zip).
      if not name.endswith('.zip'):
        continue
      print(name)

      with ZipFile(BytesIO(zfile.read(name))) as zfile2:
        for xml_name in zfile2.namelist():
          counter += 1

          # Progress report; counter runs across all inner archives of this file.
          if counter % 10000 == 0:
            print(f'{len(ad_list)} ads out of {counter} files')

          # if counter > 1000: break -> Uncomment for smaller output

          # Skip members that are not listed in the advertisement index.
          if xml_name.split('.')[0] not in ad_index:
            continue

          # read() returns the bytes directly, so no member handle is left open.
          ad_list.append(_extract_ad_record(zfile2.read(xml_name)))

  return pd.json_normalize(ad_list)

In [None]:
# Parse each top-level archive in turn, then write all advertisement rows to a single CSV.
data_frames = []
fileCount = 0
for fileCount, file in enumerate(files, start=1):
  print('File Number:', fileCount)
  frame = parse_zip(file)
  data_frames.append(frame)

combined = pd.concat(data_frames)
combined.to_csv('/drive/MyDrive/LDW 2021/Output/AdData.csv', index=False, encoding='utf-8')

File Number: 1
ProQuest-Datathon2021-Challenge/NewYorkTimes/NYT_20171009184408_00001.zip
ProQuest-Datathon2021-Challenge/AtlantaDailyWorld/ADW_20171104201106_00034.zip
2106 ads out of 10000 files
ProQuest-Datathon2021-Challenge/AtlantaDailyWorld/ADW_20171104200534_00013.zip
4176 ads out of 20000 files
6253 ads out of 30000 files
8302 ads out of 40000 files
ProQuest-Datathon2021-Challenge/AtlantaDailyWorld/ADW_20171104200641_00017.zip
10110 ads out of 50000 files
12141 ads out of 60000 files
ProQuest-Datathon2021-Challenge/NewYorkTimes/NYT_20171006222642_00032.zip
14695 ads out of 70000 files
17551 ads out of 80000 files
20740 ads out of 90000 files
ProQuest-Datathon2021-Challenge/AtlantaDailyWorld/ADW_20171104201000_00030.zip
22794 ads out of 100000 files
24895 ads out of 110000 files
ProQuest-Datathon2021-Challenge/AtlantaDailyWorld/ADW_20171104200433_00012.zip
26979 ads out of 120000 files
29113 ads out of 130000 files
31158 ads out of 140000 files
ProQuest-Datathon2021-Challenge/Atl