# **Código principal**

In [None]:
# -- Importação de pacotes e bibliotecas
import pandas as pd
import json
from urllib.request import urlopen
from io import StringIO

In [None]:
# -- Read the CSV file into a DataFrame
capitais_df = pd.read_csv('capitais.csv')

In [None]:
# -- Show the first rows
capitais_df.head()

Unnamed: 0,country,city,URL
0,Albania,Tirana,https://www.tirana.al/
1,Algeria,Algiers,https://www.apc-algercentre.dz/
2,Andorra,Andorra la Vella,https://www.andorra.ad/en/parroquia/andorra-la...
3,Antigua and Barbuda,Saint John's,https://www.cityofjohnstownpa.net/
4,Armenia,Yerevan,https://www.yerevan.am/


In [None]:
# -- DataFrame summary
capitais_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 132 entries, 0 to 131
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   country  132 non-null    object
 1   city     132 non-null    object
 2   URL      132 non-null    object
dtypes: object(3)
memory usage: 3.2+ KB


In [None]:
# -- Duplicate the URL column into a new 'domain' column
capitais_df['domain'] = capitais_df['URL']

In [None]:
# -- Domain extraction


def _extract_domain(url):
    """Return *url* without its scheme, leading ``www.`` and trailing slash.

    Only the beginning and the end of the string are modified, so slashes
    inside a path are preserved (``https://www.vi.gov/a-b/`` becomes
    ``vi.gov/a-b``). Values that do not start with ``http://`` or
    ``https://`` (for example the "Not known" placeholders) keep their
    content; only a trailing slash, if any, is removed.
    """
    # Longest prefixes first so that 'https://www.' wins over 'https://'.
    for prefix in ('https://www.', 'https://', 'http://www.', 'http://'):
        if url.startswith(prefix):
            url = url[len(prefix):]
            break
    # Drop a single trailing '/' without touching the slashes inside the path.
    return url[:-1] if url.endswith('/') else url


capitais_df['domain'] = capitais_df['domain'].map(_extract_domain)

In [None]:
# -- Show the first rows
capitais_df.head()

Unnamed: 0,country,city,URL,domain
0,Albania,Tirana,https://www.tirana.al/,tirana.al
1,Algeria,Algiers,https://www.apc-algercentre.dz/,apc-algercentre.dz
2,Andorra,Andorra la Vella,https://www.andorra.ad/en/parroquia/andorra-la...,andorra.ad/en/parroquia/andorra-la-vella
3,Antigua and Barbuda,Saint John's,https://www.cityofjohnstownpa.net/,cityofjohnstownpa.net
4,Armenia,Yerevan,https://www.yerevan.am/,yerevan.am


In [None]:
# -- Show the last rows
capitais_df.tail()

Unnamed: 0,country,city,URL,domain
127,Vanuatu,Port-Vila,https://www.portvilamunicipality.vu/,portvilamunicipality.vu
128,Virgin Islands,Charlotte Amalie,https://www.vi.gov/st-thomas-st-john-district/,vi.govst-thomas-st-john-district
129,Western Sahara,El Aaiún,Not Known,Not Known
130,Yemen,Sanaa,Not known,Not known
131,Zambia,Lusaka,https://www.lcc.gov.zm/,lcc.gov.zm


In [None]:
# -- Build the pdf_filter_URL column: the request link, with the required filters, for each city

IA_URL_prefix = 'https://web.archive.org/cdx/search/cdx?url='
IA_URL_suffix = '&matchType=domain&filter=mimetype:application/pdf&fl=timestamp,urlkey,original&output=json'

# Rows whose domain is one of the "unknown" placeholders are left out, so the
# new column stays empty (NaN) for them after index alignment.
has_known_domain = ~capitais_df['domain'].isin(['Not known', 'Not Known'])
capitais_df['pdf_filter_URL'] = capitais_df.loc[has_known_domain].apply(
    lambda city: ''.join((IA_URL_prefix, city['domain'], IA_URL_suffix)),
    axis=1,
)

In [None]:
# -- Show the first rows
capitais_df.head()

Unnamed: 0,country,city,URL,domain,pdf_filter_URL
0,Albania,Tirana,https://www.tirana.al/,tirana.al,https://web.archive.org/cdx/search/cdx?url=tir...
1,Algeria,Algiers,https://www.apc-algercentre.dz/,apc-algercentre.dz,https://web.archive.org/cdx/search/cdx?url=apc...
2,Andorra,Andorra la Vella,https://www.andorra.ad/en/parroquia/andorra-la...,andorra.ad/en/parroquia/andorra-la-vella,https://web.archive.org/cdx/search/cdx?url=and...
3,Antigua and Barbuda,Saint John's,https://www.cityofjohnstownpa.net/,cityofjohnstownpa.net,https://web.archive.org/cdx/search/cdx?url=cit...
4,Armenia,Yerevan,https://www.yerevan.am/,yerevan.am,https://web.archive.org/cdx/search/cdx?url=yer...


In [None]:
# -- Show the last rows
capitais_df.tail()

Unnamed: 0,country,city,URL,domain,pdf_filter_URL
127,Vanuatu,Port-Vila,https://www.portvilamunicipality.vu/,portvilamunicipality.vu,https://web.archive.org/cdx/search/cdx?url=por...
128,Virgin Islands,Charlotte Amalie,https://www.vi.gov/st-thomas-st-john-district/,vi.govst-thomas-st-john-district,https://web.archive.org/cdx/search/cdx?url=vi....
129,Western Sahara,El Aaiún,Not Known,Not Known,
130,Yemen,Sanaa,Not known,Not known,
131,Zambia,Lusaka,https://www.lcc.gov.zm/,lcc.gov.zm,https://web.archive.org/cdx/search/cdx?url=lcc...


In [None]:
# -- Save the DataFrame as CSV (the index is not written)
capitais_df.to_csv('capitais_pdf_filter_URL.csv', index=False)

In [None]:
# -- Collect the PDF files archived by the Internet Archive for
# -- every city that has a valid link

IA_prefix = 'https://web.archive.org/web/'


def _fetch_cdx_rows(cdx_url):
    """Run one CDX request and return its captures as a list of dicts.

    The JSON response is an array of arrays: the first element is the header
    (the fields requested through ``fl=`` in the request link) and each
    remaining element is one capture. An empty response yields an empty list.

    Raises whatever ``urlopen`` / ``json.loads`` raise (network errors,
    HTTP errors, malformed JSON); the caller decides how to report them.
    """
    # The context manager closes the HTTP response even if reading fails.
    with urlopen(cdx_url) as response:
        payload = json.loads(response.read())
    if not payload:
        return []
    header, *captures = payload
    return [dict(zip(header, capture)) for capture in captures]


# -- One dictionary per archived PDF; converted into a DataFrame at the end
final_dataframe = []

# -- Loop over the capitais_df DataFrame
for index, row in capitais_df.iterrows():
    if pd.isna(row['pdf_filter_URL']):
        continue  # cities whose link is not known have no request link

    try:
        # The Internet Archive link is "<prefix><timestamp>if_/<original URL>".
        # Using the capture's own original URL keeps the right host
        # (sub-domains are returned by matchType=domain) and the right path,
        # instead of gluing the lowercase urlkey path onto the city's base URL.
        # "if_" requests direct access to the PDF file.
        city_pdfs = [
            {
                'country': row['country'],
                'city': row['city'],
                'original_pdf_URL': capture['original'],
                'IA_pdf_URL': f"{IA_prefix}{capture['timestamp']}if_/{capture['original']}",
            }
            for capture in _fetch_cdx_rows(row['pdf_filter_URL'])
        ]
    except Exception as exc:
        # Best-effort harvesting: a failure for one city (network error, bad
        # JSON, missing field) is reported and the loop moves on.
        print(f"Index: {index} ### Country: {row['country']} ### Error: {exc}")
        continue

    if not city_pdfs:
        # An empty result is not an error; report it explicitly.
        print(f"Index: {index} ### Country: {row['country']} ### No archived PDF found")
    final_dataframe.extend(city_pdfs)

# -- Build the DataFrame from the list of dictionaries; the explicit column
# -- list keeps the expected columns even when nothing was collected
capitais_final_df = pd.DataFrame(
    final_dataframe,
    columns=['country', 'city', 'original_pdf_URL', 'IA_pdf_URL'],
)

Index: 5 ### Country: Aruba ### Error: single positional indexer is out-of-bounds
Index: 6 ### Country: Azerbaijan ### Error: single positional indexer is out-of-bounds
Index: 7 ### Country: Bahamas ### Error: single positional indexer is out-of-bounds
Index: 8 ### Country: Bahrain ### Error: single positional indexer is out-of-bounds
Index: 11 ### Country: Benin ### Error: single positional indexer is out-of-bounds
Index: 15 ### Country: Botswana ### Error: single positional indexer is out-of-bounds
Index: 16 ### Country: Brunei ### Error: single positional indexer is out-of-bounds
Index: 21 ### Country: Cape Verde ### Error: single positional indexer is out-of-bounds
Index: 22 ### Country: Cayman Islands ### Error: single positional indexer is out-of-bounds
Index: 24 ### Country: Chad ### Error: single positional indexer is out-of-bounds
Index: 25 ### Country: Comoros ### Error: single positional indexer is out-of-bounds
Index: 26 ### Country: Cuba ### Error: single positional indexe

In [None]:
# -- Show the first rows
capitais_final_df.head()

Unnamed: 0,country,city,original_pdf_URL,IA_pdf_URL
0,Albania,Tirana,https://www.tirana.al/en/uploads/2020/12/20201...,https://web.archive.org/web/20210423052515if_/...
1,Albania,Tirana,https://www.tirana.al/en/uploads/2020/12/20201...,https://web.archive.org/web/20210503020353if_/...
2,Albania,Tirana,https://tirana.al/en/uploads/2020/12/202012101...,https://web.archive.org/web/20220302193349if_/...
3,Albania,Tirana,https://www.tirana.al/en/uploads/2020/12/20201...,https://web.archive.org/web/20210423054047if_/...
4,Albania,Tirana,https://www.tirana.al/en/uploads/2020/12/20201...,https://web.archive.org/web/20210503020444if_/...


In [None]:
# -- DataFrame summary
capitais_final_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 329917 entries, 0 to 329916
Data columns (total 4 columns):
 #   Column            Non-Null Count   Dtype 
---  ------            --------------   ----- 
 0   country           329917 non-null  object
 1   city              329917 non-null  object
 2   original_pdf_URL  329917 non-null  object
 3   IA_pdf_URL        329917 non-null  object
dtypes: object(4)
memory usage: 10.1+ MB


In [None]:
# -- Number of distinct cities
capitais_final_df['city'].nunique()

64

In [None]:
# -- Save the DataFrame as CSV (the index is not written)
capitais_final_df.to_csv('capitais_final.csv', index=False)

# **Testes**

In [None]:
# -- Inspect one generated Internet Archive link (row labelled 1)
capitais_final_df['IA_pdf_URL'][1]

'https://web.archive.org/web/20210503020353if_/https://www.tirana.al/en/uploads/2020/12/20201210161510_sump_tirana-volume-i_status_analysis_200724.pdf'

In [None]:
from urllib.request import urlopen
import json

In [None]:
# -- Example request link (saipanmayor.net) used to inspect the response format
URL_example = 'https://web.archive.org/cdx/search/cdx?url=saipanmayor.net&matchType=domain&filter=mimetype:application/pdf&fl=timestamp,urlkey,original&output=json'

In [None]:
# -- Send the example request
response = urlopen(URL_example)

In [None]:
# -- Parse the response body as JSON
data_json = json.loads(response.read())

In [None]:
# -- Print the parsed response (a list whose first item is the header row)
print(data_json)

[['timestamp', 'urlkey', 'original'], ['20200927153039', 'net,saipanmayor)/resources/files/2019%20concession%20application-pg1docx%20new.pdf', 'https://saipanmayor.net/resources/files/2019%20CONCESSION%20APPLICATION-PG1docx%20NEW.pdf'], ['20200927164629', 'net,saipanmayor)/resources/files/2019%20parade%20and%20float%20application%20packet%20(1).pdf', 'https://saipanmayor.net/resources/files/2019%20Parade%20and%20Float%20Application%20Packet%20(1).pdf'], ['20200927170811', 'net,saipanmayor)/resources/files/application%20for%20employment%20final.pdf', 'https://saipanmayor.net/resources/files/Application%20for%20Employment%20Final.pdf'], ['20220625233129', 'net,saipanmayor)/resources/files/citizen%20centric%20reports/2019%20citizen%20centric%20report.pdf', 'https://www.saipanmayor.net/resources/files/Citizen%20Centric%20Reports/2019%20Citizen%20Centric%20Report.pdf'], ['20220630131922', 'net,saipanmayor)/resources/files/citizen%20centric%20reports/2019%20citizen%20centric%20report.pdf', '

In [None]:
# -- Serialize the parsed response back into a JSON string
jsonString = json.dumps(data_json)

In [None]:
# -- Check the type of the serialized value
type(jsonString)

str

In [None]:
# -- Print the JSON string
print(jsonString)

[["timestamp", "urlkey", "original"], ["20200927153039", "net,saipanmayor)/resources/files/2019%20concession%20application-pg1docx%20new.pdf", "https://saipanmayor.net/resources/files/2019%20CONCESSION%20APPLICATION-PG1docx%20NEW.pdf"], ["20200927164629", "net,saipanmayor)/resources/files/2019%20parade%20and%20float%20application%20packet%20(1).pdf", "https://saipanmayor.net/resources/files/2019%20Parade%20and%20Float%20Application%20Packet%20(1).pdf"], ["20200927170811", "net,saipanmayor)/resources/files/application%20for%20employment%20final.pdf", "https://saipanmayor.net/resources/files/Application%20for%20Employment%20Final.pdf"], ["20220625233129", "net,saipanmayor)/resources/files/citizen%20centric%20reports/2019%20citizen%20centric%20report.pdf", "https://www.saipanmayor.net/resources/files/Citizen%20Centric%20Reports/2019%20Citizen%20Centric%20Report.pdf"], ["20220630131922", "net,saipanmayor)/resources/files/citizen%20centric%20reports/2019%20citizen%20centric%20report.pdf", "

In [None]:
# -- Load the JSON string into a DataFrame
df_example = pd.read_json(StringIO(jsonString))

In [None]:
# -- Show the first rows (the header is still stored as row 0)
df_example.head()

Unnamed: 0,0,1,2
0,timestamp,urlkey,original
1,20200927153039,"net,saipanmayor)/resources/files/2019%20conces...",https://saipanmayor.net/resources/files/2019%2...
2,20200927164629,"net,saipanmayor)/resources/files/2019%20parade...",https://saipanmayor.net/resources/files/2019%2...
3,20200927170811,"net,saipanmayor)/resources/files/application%2...",https://saipanmayor.net/resources/files/Applic...
4,20220625233129,"net,saipanmayor)/resources/files/citizen%20cen...",https://www.saipanmayor.net/resources/files/Ci...


In [None]:
# -- Promote the first row to column names and drop it from the data
header = df_example.iloc[0]
df_example = df_example[1:]
df_example.columns = header

In [None]:
# -- Show the first rows
df_example.head()

Unnamed: 0,timestamp,urlkey,original
1,20200927153039,"net,saipanmayor)/resources/files/2019%20conces...",https://saipanmayor.net/resources/files/2019%2...
2,20200927164629,"net,saipanmayor)/resources/files/2019%20parade...",https://saipanmayor.net/resources/files/2019%2...
3,20200927170811,"net,saipanmayor)/resources/files/application%2...",https://saipanmayor.net/resources/files/Applic...
4,20220625233129,"net,saipanmayor)/resources/files/citizen%20cen...",https://www.saipanmayor.net/resources/files/Ci...
5,20220630131922,"net,saipanmayor)/resources/files/citizen%20cen...",https://saipanmayor.net/resources/files/Citize...


In [None]:
# -- Copy the urlkey column into a new IA_urlkey_pdf column
df_example['IA_urlkey_pdf'] = df_example['urlkey']

In [None]:
# -- Show the first rows
df_example.head()

Unnamed: 0,timestamp,urlkey,original,IA_urlkey_pdf
1,20200927153039,"net,saipanmayor)/resources/files/2019%20conces...",https://saipanmayor.net/resources/files/2019%2...,"net,saipanmayor)/resources/files/2019%20conces..."
2,20200927164629,"net,saipanmayor)/resources/files/2019%20parade...",https://saipanmayor.net/resources/files/2019%2...,"net,saipanmayor)/resources/files/2019%20parade..."
3,20200927170811,"net,saipanmayor)/resources/files/application%2...",https://saipanmayor.net/resources/files/Applic...,"net,saipanmayor)/resources/files/application%2..."
4,20220625233129,"net,saipanmayor)/resources/files/citizen%20cen...",https://www.saipanmayor.net/resources/files/Ci...,"net,saipanmayor)/resources/files/citizen%20cen..."
5,20220630131922,"net,saipanmayor)/resources/files/citizen%20cen...",https://saipanmayor.net/resources/files/Citize...,"net,saipanmayor)/resources/files/citizen%20cen..."


In [None]:
# -- DataFrame summary
df_example.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28 entries, 1 to 28
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   timestamp      28 non-null     object
 1   urlkey         28 non-null     object
 2   original       28 non-null     object
 3   IA_urlkey_pdf  28 non-null     object
dtypes: object(4)
memory usage: 1.0+ KB


In [None]:
# -- Prototype: build the Internet Archive link of one PDF from its urlkey
s = "net,saipanmayor)/resources/files/2019%20parade%20and%20float%20application%20packet%20(1).pdf"
new_prefix = "https://web.archive.org/web/"

# Split once at the first slash: [host part of the urlkey, path of the file]
split_s = s.split('/', maxsplit=1)
print(split_s)

# prefix + timestamp + "if_/" + site URL + path of the file
new_s = ''.join([new_prefix, '20220625233129', 'if_/https://www.saipanmayor.net/', split_s[1]])
print(new_s)

['net,saipanmayor)', 'resources/files/2019%20parade%20and%20float%20application%20packet%20(1).pdf']
https://web.archive.org/web/20220625233129if_/https://www.saipanmayor.net/resources/files/2019%20parade%20and%20float%20application%20packet%20(1).pdf


In [None]:
# -- Template 1: request link listing the PDFs archived for a domain
# NOTE(review): clean_url is not defined in the visible cells; evaluating this
# f-string raises NameError unless it is defined elsewhere — confirm.
f'https://web.archive.org/cdx/search/cdx?url={clean_url}&matchType=domain&filter=mimetype:application/pdf&fl=timestamp,urlkey&output=json'

In [None]:
# -- Template 2: direct Internet Archive link to one archived PDF
# NOTE(review): timestamp and coluna_URL are not defined in the visible cells;
# evaluating this f-string raises NameError unless they are defined elsewhere — confirm.
f'https://web.archive.org/web/{timestamp}if_/{coluna_URL}/areas/ciudad/historico/especiales/9julio/acta_independencia.pdf'

In [None]:
import requests
from requests.exceptions import HTTPError

In [None]:
# -- Fetch the raw response for buenosaires.gob.ar with requests
content = None
URL = 'https://web.archive.org/cdx/search/cdx?url=buenosaires.gob.ar&matchType=domain&filter=mimetype:application/pdf&fl=timestamp,urlkey&output=json'

try:
  # (connect, read) timeouts: without them the call can block forever if the
  # server stops answering. The read timeout is generous because large
  # domain queries can take a while to start streaming.
  response = requests.get(URL, timeout=(10, 300))
  response.raise_for_status()
except requests.exceptions.RequestException as exc:
  # RequestException is the base class of HTTPError and also covers
  # connection errors and timeouts, which would otherwise crash the cell.
  print(exc)
else:
  content = response.text

# Prints <class 'str'> on success and <class 'NoneType'> when the request failed.
print(type(content))

<class 'str'>


In [None]:
# -- Load the Buenos Aires result from a local JSON file
# NOTE(review): presumably the saved response of the request above — confirm.
buenos_aires_df = pd.read_json('/content/cdx_buenos_aires.json')

In [None]:
# -- Show the first rows (the header is still stored as row 0)
buenos_aires_df.head()

Unnamed: 0,0,1
0,timestamp,urlkey
1,20130810075538,"ar,gob,buenosaires)/aplicaciones/guiaba2/virtu..."
2,20221221063018,"ar,gob,buenosaires)/apps/contenido/archivos/10..."
3,20210309144807,"ar,gob,buenosaires)/apps/contenido/archivos/10..."
4,20220119110328,"ar,gob,buenosaires)/apps/contenido/archivos/11..."


In [None]:
# -- Show the last rows
buenos_aires_df.tail()

Unnamed: 0,0,1
28159,20220809174200,"ar,gob,buenosaires,vivienda)/guiainquilinospro..."
28160,20221201204246,"ar,gob,buenosaires,vivienda)/organigrama-novie..."
28161,20220930065626,"ar,gob,buenosaires,vivienda)/organigrama-septi..."
28162,20220809160405,"ar,gob,buenosaires,vivienda)/participacionmesa..."
28163,20220809155251,"ar,gob,buenosaires,vivienda)/participacionplan..."


In [None]:
# -- Promote the first row to column names and drop it from the data
header = buenos_aires_df.iloc[0]
buenos_aires_df = buenos_aires_df[1:]
buenos_aires_df.columns = header

In [None]:
# -- Show the first rows
buenos_aires_df.head()

Unnamed: 0,timestamp,urlkey
1,20130810075538,"ar,gob,buenosaires)/aplicaciones/guiaba2/virtu..."
2,20221221063018,"ar,gob,buenosaires)/apps/contenido/archivos/10..."
3,20210309144807,"ar,gob,buenosaires)/apps/contenido/archivos/10..."
4,20220119110328,"ar,gob,buenosaires)/apps/contenido/archivos/11..."
5,20220119114714,"ar,gob,buenosaires)/apps/contenido/archivos/11..."
