# TLHOP - SHODAN'S DATASET API

In [None]:
from pyspark.sql import SparkSession

# Note: Set the "TLHOP_DATASETS_PATH" environment variable before running this
# notebook; it defines where TLHOP's crawlers store their collected data.

# Local Spark session: master "local[4]" with spark.driver.memory set to "10g".
# getOrCreate() reuses an already-running session if one exists.
spark = (
    SparkSession.builder
    .master("local[4]")
    .config("spark.driver.memory", "10g")
    .getOrCreate()
)

In [2]:
from tlhop.datasets import DataSets

# Instantiate TLHOP's DataSets helper. Every later cell goes through `ds` to
# list the available datasets and to load them by code name.
ds = DataSets()

In [3]:
# Show the dataset catalogue. The "Code name" column holds the identifiers
# that are passed to ds.read_dataset() in the cells below.
ds.list_datasets()

Unnamed: 0,Code name,Description,Type,Downloaded,Size (MB),Last timestamp
0,ACCENTED_WORDS_PT_BR_FILE,List of accented pt-br words,internal,True,4.14,"12/11/2023, 19:42:54"
1,AS_RANK_FILE,CAIDA's AS Rank,external,True,12.56,"09/02/2023, 15:30:32"
2,AS_TYPE_FILE,CAIDA's AS Classification,external,True,2.18,"09/02/2023, 15:31:16"
3,BRAZILIAN_CITIES,Brazil's cities information dataset,external,True,3.07,"09/02/2023, 15:31:16"
4,BRAZILIAN_IPS,Lisf of Range of IPs available in Brazil's Internet,external,True,5.08,"09/02/2023, 15:30:36"
5,BRAZILIAN_RF,Brazilian National Register of Legal Entities - CNPJ,external,True,6125.92,"02/16/2024, 13:58:42"
6,CISA_EXPLOITS,CISA's Known Exploited Vulnerabilities Catalog,external,True,0.34,"09/02/2023, 15:30:39"
7,COLUMNS_MODULE_FILE,Shodan's Module list and their columns frequency,external,True,0.09,"09/02/2023, 15:30:40"
8,ENDOFLIFE,Keep track of various End of Life dates and support lifecycles for various products,external,True,1.96,"09/02/2023, 15:09:39"
9,FIRST_EPSS,FIRST's Exploit Prediction Scoring system (EPSS),external,True,946.08,"03/22/2024, 14:20:59"


## NVD's CVE Library

In [4]:
# Load the NVD CVE library and display its row count.
# NOTE(review): check_update=False presumably skips refreshing the local copy
# before reading -- confirm against tlhop's DataSets.read_dataset.
ds1 = ds.read_dataset("NVD_CVE_LIB", check_update=False)
ds1.count()

                                                                                

242524

In [5]:
# Print the column names and types of the NVD CVE dataset.
ds1.printSchema()

root
 |-- cve_id: string (nullable = true)
 |-- cvss_score: double (nullable = true)
 |-- cvss_version: string (nullable = true)
 |-- description: string (nullable = true)
 |-- publishedDate: date (nullable = true)
 |-- lastModifiedDate: string (nullable = true)
 |-- cvss_v2: struct (nullable = true)
 |    |-- acInsufInfo: boolean (nullable = true)
 |    |-- exploitabilityScore: double (nullable = true)
 |    |-- impactScore: double (nullable = true)
 |    |-- obtainAllPrivilege: boolean (nullable = true)
 |    |-- obtainOtherPrivilege: boolean (nullable = true)
 |    |-- obtainUserPrivilege: boolean (nullable = true)
 |    |-- severity: string (nullable = true)
 |    |-- userInteractionRequired: boolean (nullable = true)
 |    |-- accessComplexity: string (nullable = true)
 |    |-- accessVector: string (nullable = true)
 |    |-- authentication: string (nullable = true)
 |    |-- availabilityImpact: string (nullable = true)
 |    |-- score: double (nullable = true)
 |    |-- confiden

## CISA's Known Exploited Vulnerabilities Catalog

In [6]:
# Load CISA's Known Exploited Vulnerabilities catalog and display its row count.
ds2 = ds.read_dataset("CISA_EXPLOITS")
ds2.count()

973

In [7]:
# Print the column names and types of the CISA exploits dataset.
ds2.printSchema()

root
 |-- cve_id: string (nullable = true)
 |-- vendorProject: string (nullable = true)
 |-- product: string (nullable = true)
 |-- vulnerabilityName: string (nullable = true)
 |-- dateAdded: string (nullable = true)
 |-- shortDescription: string (nullable = true)
 |-- requiredAction: string (nullable = true)
 |-- dueDate: string (nullable = true)
 |-- notes: string (nullable = true)



## HTTP Status Code List

In [8]:
ds2 = ds.read_dataset("HTTP_STATUS_FILE")
ds2.count()

99

In [9]:
ds2.printSchema()

root
 |-- code: string (nullable = true)
 |-- name: string (nullable = true)
 |-- description: string (nullable = true)
 |-- group: string (nullable = true)
 |-- reference: string (nullable = true)



## UTF-8 Code symbols 

In [10]:
# Load the UTF-8 symbol mapping table and display its row count.
ds3 = ds.read_dataset("UTF8_MAPPING_FILE")
ds3.count()

196

In [11]:
# Print the column names and types of the UTF-8 mapping dataset.
ds3.printSchema()

root
 |-- Expected: string (nullable = true)
 |-- Actual: string (nullable = true)



## AS Rank DataSet

In [12]:
# Load the AS Rank dataset (listed in the catalogue as "CAIDA's AS Rank") and
# display its row count.
ds4 = ds.read_dataset("AS_RANK_FILE")
ds4.count()

                                                                                

112490

In [13]:
# Print the column names and types of the AS Rank dataset.
ds4.printSchema()

root
 |-- asn: string (nullable = true)
 |-- asnName: string (nullable = true)
 |-- rank: integer (nullable = true)
 |-- cliqueMember: string (nullable = true)
 |-- seen: string (nullable = true)
 |-- longitude: string (nullable = true)
 |-- latitude: string (nullable = true)
 |-- asnDegree_provider: string (nullable = true)
 |-- asnDegree_peer: string (nullable = true)
 |-- asnDegree_customer: string (nullable = true)
 |-- asnDegree_total: string (nullable = true)
 |-- asnDegree_transit: string (nullable = true)
 |-- asnDegree_sibling: string (nullable = true)
 |-- organization_orgId: string (nullable = true)
 |-- organization_orgName: string (nullable = true)
 |-- country_iso: string (nullable = true)
 |-- country_name: string (nullable = true)
 |-- announcing_numberPrefixes: string (nullable = true)
 |-- announcing_numberAddresses: string (nullable = true)



## AS Type DataSet

In [14]:
# Load the AS Type dataset (listed in the catalogue as "CAIDA's AS
# Classification") and display its row count.
ds5 = ds.read_dataset("AS_TYPE_FILE")
ds5.count()

                                                                                

71665

In [15]:
# Print the column names and types of the AS Type dataset.
ds5.printSchema()

root
 |-- asn: string (nullable = true)
 |-- class: string (nullable = true)
 |-- type: string (nullable = true)



## Brazil Cities DataSet

In [16]:
# Load the Brazilian cities information dataset and display its row count.
ds6 = ds.read_dataset("BRAZILIAN_CITIES")
ds6.count()

24/04/08 17:44:52 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
                                                                                

5578

In [17]:
# Print the column names and types of the Brazilian cities dataset.
ds6.printSchema()

root
 |-- CITY: string (nullable = true)
 |-- STATE: string (nullable = true)
 |-- CAPITAL: string (nullable = true)
 |-- IBGE_RES_POP: string (nullable = true)
 |-- IBGE_RES_POP_BRAS: string (nullable = true)
 |-- IBGE_RES_POP_ESTR: string (nullable = true)
 |-- IBGE_DU: string (nullable = true)
 |-- IBGE_DU_URBAN: string (nullable = true)
 |-- IBGE_DU_RURAL: string (nullable = true)
 |-- IBGE_POP: string (nullable = true)
 |-- IBGE_1: string (nullable = true)
 |-- IBGE_1-4: string (nullable = true)
 |-- IBGE_5-9: string (nullable = true)
 |-- IBGE_10-14: string (nullable = true)
 |-- IBGE_15-59: string (nullable = true)
 |-- IBGE_60+: string (nullable = true)
 |-- IBGE_PLANTED_AREA: string (nullable = true)
 |-- IBGE_CROP_PRODUCTION_$: string (nullable = true)
 |-- IDHM Ranking 2010: string (nullable = true)
 |-- IDHM: string (nullable = true)
 |-- IDHM_Renda: string (nullable = true)
 |-- IDHM_Longevidade: string (nullable = true)
 |-- IDHM_Educacao: string (nullable = true)
 |-- LO

## Mikrotik OS Release Version

In [18]:
# Load the Mikrotik OS release version list and display its row count.
ds8 = ds.read_dataset("MIKROTIK_OS")
ds8.count()

396

In [19]:
# Print the column names and types of the Mikrotik OS release dataset.
ds8.printSchema()

root
 |-- deployment: string (nullable = true)
 |-- release: string (nullable = true)
 |-- date: string (nullable = true)



## Language code (iso 639) mapping list

In [20]:
# Load the ISO 639 language-code mapping list and display its row count.
ds9 = ds.read_dataset("ISO_639_LANGUAGE")
ds9.count()

184

In [21]:
# Print the column names and types of the ISO 639 language mapping dataset.
ds9.printSchema()

root
 |-- iso: string (nullable = true)
 |-- language: string (nullable = true)



## PT-BR Accented words

In [22]:
# Load the list of accented pt-br words and display its row count.
ds10 = ds.read_dataset("ACCENTED_WORDS_PT_BR_FILE")
ds10.count()

                                                                                

287883

In [23]:
# Print the column names and types of the accented pt-br words dataset.
ds10.printSchema()

root
 |-- word: string (nullable = true)



## PT-BR Dictionary

In [24]:
# Load the pt-br dictionary word list and display its row count.
ds11 = ds.read_dataset("PT_BR_DICTIONARY")
ds11.count()

                                                                                

980279

In [25]:
# Print the column names and types of the pt-br dictionary dataset.
ds11.printSchema()

root
 |-- word: string (nullable = true)



## Federal Revenue of Brazil

In [26]:
# Load the Federal Revenue of Brazil dataset (listed in the catalogue as the
# National Register of Legal Entities - CNPJ) and display its row count.
# The catalogue reports about 6 GB on disk for it, so this count can be slow.
ds12 = ds.read_dataset("BRAZILIAN_RF")
ds12.count()

                                                                                

71002229

In [27]:
# Print the column names and types of the Federal Revenue (CNPJ) dataset.
ds12.printSchema()

root
 |-- cnpj_basico: string (nullable = true)
 |-- identificador_matriz: string (nullable = true)
 |-- nome_fantasia: string (nullable = true)
 |-- situacao_cadastral: string (nullable = true)
 |-- situacao_cadastral_data: date (nullable = true)
 |-- nome_cidade_exterior: string (nullable = true)
 |-- data_inicio_atividade: date (nullable = true)
 |-- cnae_fiscal_principal_cod: string (nullable = true)
 |-- cnae_fiscal_secundaria_cod: string (nullable = true)
 |-- endereço_tipo_logradouro: string (nullable = true)
 |-- endereço_logradouro: string (nullable = true)
 |-- endereço_numero: integer (nullable = true)
 |-- endereço_complemento: string (nullable = true)
 |-- endereço_bairro: string (nullable = true)
 |-- endereço_cep: integer (nullable = true)
 |-- endereço_uf: string (nullable = true)
 |-- contato_ddd1: integer (nullable = true)
 |-- contato_telone1: integer (nullable = true)
 |-- contato_email: string (nullable = true)
 |-- situacao_especial: string (nullable = true)
 |-- 

## FIRST EPSS

In [28]:
# Load FIRST's Exploit Prediction Scoring System (EPSS) dataset and display
# its row count.
# NOTE(review): check_update=False presumably skips refreshing the local copy
# before reading -- confirm against tlhop's DataSets.read_dataset.
ds13 = ds.read_dataset("FIRST_EPSS", check_update=False)
ds13.count()

                                                                                

175851655

In [29]:
# Print the column names and types of the EPSS dataset.
ds13.printSchema()

root
 |-- cve_id: string (nullable = true)
 |-- score_date: date (nullable = true)
 |-- year: integer (nullable = true)
 |-- model_version: string (nullable = true)
 |-- epss: double (nullable = true)
 |-- percentile: double (nullable = true)



## LACNIC RIR Statistics

In [30]:
# Load the LACNIC RIR statistics dataset and display its row count.
# NOTE(review): check_update=False presumably skips refreshing the local copy
# before reading -- confirm against tlhop's DataSets.read_dataset.
ds14 = ds.read_dataset("LACNIC_STATISTICS", check_update=False)
ds14.count()

                                                                                

49110601

In [31]:
# Print the column names and types of the LACNIC statistics dataset.
ds14.printSchema()

root
 |-- registry: string (nullable = true)
 |-- country: string (nullable = true)
 |-- type: string (nullable = true)
 |-- mask: string (nullable = true)
 |-- n_ips: long (nullable = true)
 |-- date: long (nullable = true)
 |-- status: string (nullable = true)
 |-- start_ip_int: double (nullable = true)
 |-- end_ip_int: double (nullable = true)
 |-- crawler_date: date (nullable = true)
 |-- crawler_year: long (nullable = true)



In [32]:
# Stop the SparkSession created at the top of the notebook.
spark.stop()