In [0]:
!pip install kafka-python
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType,StructField, StringType, IntegerType
from pyspark.sql.functions import col, from_json
spark = SparkSession.builder.appName("Genomic").getOrCreate()

You should consider upgrading via the '/local_disk0/.ephemeral_nfs/envs/pythonEnv-b65f2ffe-8941-4517-aea6-af1b7167e284/bin/python -m pip install --upgrade pip' command.[0m


In [0]:
df_genomic = (
    spark.readStream.format("kafka")
    .option("kafka.bootstrap.servers","localhost:9092,localhost:9095,localhost:9093")
    .option("subscribe", "genomic")
    .option("startingOffsets", "earliest"))
df_genomic = df_genomic.load()

In [0]:
schema = StructType([
        StructField('source', StructType([
             StructField('id', StringType(), True),
             StructField('name', StringType(), True)
             ])),
         StructField('author', StringType(), True),
         StructField('title', StringType(), True),
         StructField('description', StringType(), True),
         StructField('url', StringType(), True),
         StructField('urlToImage', StringType(), True),
         StructField('publishedAt', StringType(), True),
         StructField('content', IntegerType(), True)
         ])

df_genomic_out = df_genomic.selectExpr("CAST(value AS STRING)") \
.select(from_json(col("value"), schema).alias("news")) \
.select("news.*")

In [0]:
(
    df_genomic_out.writeStream
        .format("console")
        .start()
)

Out[11]: <pyspark.sql.streaming.query.StreamingQuery at 0x7f764082a970>

In [0]:
df_genomic_out.writeStream.format("parquet") \
.option("path", "dbfs:/tmp/output/genomic/") \
.option("checkpointLocation", "dbfs:/tmp/checkpoints/projeto") \
.start()

Out[20]: <pyspark.sql.streaming.query.StreamingQuery at 0x7f7623eadf40>

In [0]:
file_location = "dbfs:/tmp/output/genomic/"
file_type = "parquet"
df_spark = spark.read.format(file_type).load(file_location)

In [0]:
df_temp=df_spark.select(col("source.id").alias("id"),
                col("source.name").alias("name"),
                col("author"),
                col("title"),
                col("description"),
                col("url"),
                col("urlToImage"),
                col("publishedAt"),
                col("content"))

In [0]:
file_final = "dbfs:/final/output/genomic/"
file_type = "parquet"
df_comp = spark.read.format(file_type).load(file_location)

df_comp=df_comp.select(col("source.id").alias("id"),
                col("source.name").alias("name"),
                col("author"),
                col("title"),
                col("description"),
                col("url"),
                col("urlToImage"),
                col("publishedAt"),
                col("content"))

df_final=df_comp.union(df_temp).dropDuplicates(["url"])
df_final.write.mode("overwrite").parquet("dbfs:/final/output/genomic/")



In [0]:
df_final.display()

id,name,author,title,description,url,urlToImage,publishedAt,content
,,,,,,,,
,The Intercept,Liliana Segura,DNA Evidence Sent Anthony Sanchez to Death Row. But Did It Actually Solve the Crime?,A college ballerina was raped and murdered in Oklahoma. DNA put Anthony Sanchez at the scene. But it did not tell the whole story. The post DNA Evidence Sent Anthony Sanchez to Death Row. But Did It Actually Solve the Crime? appeared first on The Intercept.,http://theintercept.com/2023/09/18/oklahoma-execution-dna-anthony-sanchez/,https://theintercept.com/wp-content/uploads/2023/09/anthonysanchez_lead_final_the-intercept.jpg?fit=2160%2C1080&w=1200&h=800,2023-09-18T11:00:00Z,
abc-news,ABC News,GERALD IMRAY Associated Press,South Africa announces official inquiry into deadly Johannesburg building fire,The South African government says a retired judge will lead an official inquiry into last week's deadly fire at a derelict building in downtown Johannesburg,https://abcnews.go.com/International/wireStory/south-african-government-announces-official-inquiry-deadly-johannesburg-102924141,https://s.abcnews.com/images/International/wirestory_8f16805b89fccaa1291454e86db2f670_16x9_992.jpg,2023-09-05T12:33:02Z,
abc-news,ABC News,SEAN MURPHY Associated Press,Man executed for the 1996 killing of a University of Oklahoma dance student,A man convicted of the 1996 killing of a University of Oklahoma dance student has been executed,https://abcnews.go.com/US/wireStory/man-set-executed-1996-slaying-university-oklahoma-dance-103369263,https://s.abcnews.com/images/US/abc_news_default_2000x2000_update_16x9_992.jpg,2023-09-21T16:21:55Z,
abc-news,ABC News,ANDREW SELSKY Associated Press,Oregon man who was sentenced to death is free 2 years after murder conviction was reversed,"A man sentenced to death for a 1998 murder is now free, two years after the Oregon Court of Appeals reversed the conviction",https://abcnews.go.com/US/wireStory/oregon-man-sentenced-death-free-2-years-after-102982452,https://i.abcnewsfe.com/a/3f18ab0e-36b3-4251-ae5c-965228ce27b2/wirestory_e2b065ae1d19b8ca3698a86a9a9f61e5_16x9.jpg?w=992,2023-09-07T01:33:37Z,
abc-news,ABC News,The Associated Press,Remains identified of airman who died in crash following WWII bombing raid on Japan,Military scientists have identified the remains of a U.S. Army airman from Michigan who died along with 10 other crew members when a bomber crashed in India following a World War II bombing raid on Japan,https://abcnews.go.com/US/wireStory/remains-identified-michigan-airman-died-crash-wwii-bombing-103031035,https://s.abcnews.com/images/US/abc_news_default_2000x2000_update_16x9_992.jpg,2023-09-08T18:04:34Z,
abc-news,ABC News,The Associated Press,Woman charged with abandoning newborn girl in New Jersey park nearly 40 years ago,The mother of a newborn girl whose body was found in a New Jersey park on Christmas Eve nearly 40 years ago has been identified and is now charged in the death,https://abcnews.go.com/US/wireStory/woman-charged-abandoning-newborn-girl-new-jersey-park-103011000,https://s.abcnews.com/images/US/abc_news_default_2000x2000_update_16x9_992.jpg,2023-09-07T21:17:43Z,
,Abduzeedo.com,abduzeedo,Los Kalakos Tequila Blanco: Minimalist Packaging Design,Los Kalakos Tequila Blanco: Minimalist Packaging Design  abduzeedo0913—23  Los Kalakos' Tequila Blanco 2023 isn’t just another beverage in the spirits aisle – it's a masterclass in brandjng and packaging design presented on a bottle. Commissioned by the il…,https://abduzeedo.com/node/88049,,2023-09-13T21:09:25Z,
ars-technica,Ars Technica,Beth Mole,"After being demoted and forced to retire, mRNA researcher wins Nobel",Katalin Karikó and Drew Weissman awarded Nobel Prize in Physiology or Medicine.,https://arstechnica.com/health/2023/10/after-being-demoted-and-forced-to-retire-mrna-researcher-wins-nobel/,https://cdn.arstechnica.net/wp-content/uploads/2023/10/GettyImages-1701994937-760x380.jpeg,2023-10-02T19:06:09Z,
ars-technica,Ars Technica,Elizabeth Rayne,"Genomes could help enigmatic, endangered nocturnal parrot make a comeback","Variations linked to fertility, survival still present in a small population.",https://arstechnica.com/science/2023/09/genomes-could-help-enigmatic-endangered-nocturnal-parrot-make-a-comeback/,https://cdn.arstechnica.net/wp-content/uploads/2023/09/GettyImages-1287134021-760x380.jpg,2023-09-12T16:57:48Z,
