# Creating a big table for ShareCare
This notebook describes how to create a managed table from Spark. 
The table is created in the Synapse warehouse folder in your primary storage account. The table will be synchronized and available in Synapse SQL Pools. 


## 1. Load the data

In [11]:
import pandas

StatementMeta(SampleSpark, 7, 11, Finished, Available)

### Loading user records from DL

In [12]:
# Read the raw user records from the data lake into a pandas DataFrame and preview them.
USERS_PARQUET_URL = 'abfss://sharecare-fs@sharecaredatalake.dfs.core.windows.net/raw/users/users.parquet'
df_users = pandas.read_parquet(USERS_PARQUET_URL)
display(df_users)

StatementMeta(SampleSpark, 7, 12, Finished, Available)

SynapseWidget(Synapse.DataFrame, 244bd8a4-6347-4e01-9c67-9a1994a2e2a1)

### Loading post records from DL

Renaming column 'description' to 'text' and column 'id' to 'postId'. Deleting the redundant columns 'charityName' and 'userName'.

In [13]:
# Read the raw post records and normalise their schema in one chained expression:
#   * 'description' -> 'text' and 'id' -> 'postId'
#   * drop the 'charityName' and 'userName' columns
POSTS_PARQUET_URL = 'abfss://sharecare-fs@sharecaredatalake.dfs.core.windows.net/raw/posts/posts.parquet'
df_posts = (
    pandas.read_parquet(POSTS_PARQUET_URL)
    .rename(columns={'description': 'text', 'id': 'postId'})
    .drop(columns=['charityName', 'userName'])
)
display(df_posts)

StatementMeta(SampleSpark, 7, 13, Finished, Available)

SynapseWidget(Synapse.DataFrame, 1da446d7-4aee-4587-b7f7-974bc86688c8)

In [14]:
# Read the raw media records and expose the 'url' column under the name 'imageUrl'.
MEDIA_PARQUET_URL = 'abfss://sharecare-fs@sharecaredatalake.dfs.core.windows.net/raw/media/media.parquet'
df_media = pandas.read_parquet(MEDIA_PARQUET_URL).rename(columns={'url': 'imageUrl'})
display(df_media)

StatementMeta(SampleSpark, 7, 14, Finished, Available)

SynapseWidget(Synapse.DataFrame, 907e7ac0-c7b6-48a9-a92c-4509aacb72a2)

In [None]:
# Read the raw charity records, rename 'createdByUser' to 'charityOwner'
# and drop the 'funds' column.
CHARITIES_PARQUET_URL = 'abfss://sharecare-fs@sharecaredatalake.dfs.core.windows.net/raw/charities/charities.parquet'
df_charities = (
    pandas.read_parquet(CHARITIES_PARQUET_URL)
    .rename(columns={'createdByUser': 'charityOwner'})
    .drop(columns='funds')
)
display(df_charities)

## 2. Transform the data

In [15]:
# Join posts to media on posts.mediaId == media.id (pandas' default inner join),
# then drop the right-hand key column 'id', which only repeats 'mediaId'.
df_posts_media = (
    df_posts
    .merge(df_media, left_on='mediaId', right_on='id')
    .drop(columns='id')
)
display(df_posts_media)

StatementMeta(SampleSpark, 7, 15, Finished, Available)

SynapseWidget(Synapse.DataFrame, 162338a5-17c0-49cd-b69c-28c0b32434e1)

In [16]:
# Join the posts+media frame to charities on charityId == charities.id
# (pandas' default inner join), then drop the right-hand key column 'id'.
df_posts_media_charities = (
    df_posts_media
    .merge(df_charities, left_on='charityId', right_on='id')
    .drop(columns='id')
)
display(df_posts_media_charities)

StatementMeta(SampleSpark, 7, 16, Finished, Available)

SynapseWidget(Synapse.DataFrame, 025fb146-66b3-4b8d-b872-c1b4bdc6e68e)

In [17]:
# Join the posts+media+charities frame to users on userId == users.id
# (pandas' default inner join), then remove the join keys that are no longer needed.
JOIN_KEY_COLUMNS = ['id', 'charityId', 'userId', 'mediaId']
df_posts_media_charities_users = (
    df_posts_media_charities
    .merge(df_users, left_on='userId', right_on='id')
    .drop(columns=JOIN_KEY_COLUMNS)
)
display(df_posts_media_charities_users)

StatementMeta(SampleSpark, 7, 17, Finished, Available)

SynapseWidget(Synapse.DataFrame, a373cc53-b3ae-4865-a88b-1f1b8c4732e3)

In [18]:
# Show the fully merged posts/media/charities/users frame once more before it is saved.
display(df_posts_media_charities_users)

StatementMeta(SampleSpark, 7, 18, Finished, Available)

SynapseWidget(Synapse.DataFrame, dcfd2e2b-2f9b-41dc-a129-6ab0615284bb)

## 3. Save processed data back to DL

In [19]:
# Make sure the target database exists before writing into it.
spark.sql("CREATE DATABASE IF NOT EXISTS ShareCareDL")

# Convert the merged pandas frame to a Spark DataFrame and replace the contents
# of the ShareCareDL.PostsMerged table with it.
spark_df = spark.createDataFrame(df_posts_media_charities_users)
posts_merged_writer = spark_df.write.mode("overwrite")
posts_merged_writer.saveAsTable("ShareCareDL.PostsMerged")

StatementMeta(SampleSpark, 7, 19, Finished, Available)

## 4. Create PDF documents

In [59]:
%%sh
# Delete any earlier copy so the download below is saved as a fresh template.pug
# rather than alongside a stale one (-f: no error if the file does not exist).
rm -f template.pug
# Fetch the Pug template that createDocument renders into PDF reports.
wget https://raw.githubusercontent.com/nephoseu/sharecare-app-six/main/dataops/template.pug
# List the working directory to confirm the template is present.
ls

StatementMeta(SampleSpark, 7, 59, Finished, Available)

--2022-11-20 21:27:07--  https://raw.githubusercontent.com/nephoseu/sharecare-app-six/main/dataops/template.pug
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.111.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 266 [text/plain]
Saving to: ‘template.pug’

     0K                                                       100% 9.22M=0s

2022-11-20 21:27:07 (9.22 MB/s) - ‘template.pug’ saved [266/266]



In [36]:
# SECURITY: a storage account key was previously pasted here inside a commented-out
# spark.conf.set(...) call. Secrets must not live in notebooks, even in comments:
# the exposed key should be rotated, and if key-based access is ever needed it
# should be read from a secret store at run time instead of being hardcoded.

# Base URI of the "documents" container that receives the generated PDFs.
# The ADLS Gen2 endpoint host is <account>.dfs.core.windows.net, matching the
# abfss:// URLs used for the raw data reads in this notebook.
blob_path = "abfss://documents@sharecaredatalake.dfs.core.windows.net/"

StatementMeta(SampleSpark, 7, 36, Finished, Available)

In [45]:
from pdf_reports import pug_to_html, write_report
import fsspec
import os

def createDocument(postId, text, imageUrl, name, firstName, lastName, eMail, country):
    """Render a single post as a PDF report and upload it to ``blob_path``.

    The report is rendered from ``template.pug`` (looked up relative to the
    current working directory), written locally as ``post<postId>.pdf`` and
    uploaded under the same file name below ``blob_path``. The local copy is
    removed afterwards so generated PDFs do not accumulate in the working
    directory.

    Args:
        postId: Identifier of the post; used in the title and the file name.
        text: Post body (string); its first 20 characters are used in the title
            and the full value is passed to the template as ``content``.
        imageUrl: Passed to the template as ``image_url``.
        name: Appended to the title (presumably the charity name - confirm
            against the merged frame's ``name`` column).
        firstName: Author's first name.
        lastName: Author's last name.
        eMail: Passed to the template as ``email``.
        country: Passed to the template as ``country``.

    Returns:
        int: Always ``1`` once the document has been written and uploaded.
    """
    title = str(postId) + ". " + text[0:20] + " - " + name
    author = firstName + " " + lastName
    html = pug_to_html(
        "template.pug",
        image_url=imageUrl,
        title=title,
        country=country,
        author=author,
        email=eMail,
        content=text,
    )
    local_file_name = "post" + str(postId) + ".pdf"
    # Build the destination so it is correct whether or not blob_path ends in "/".
    remote_path = blob_path.rstrip("/") + "/" + local_file_name

    try:
        write_report(html, local_file_name)
        fs = fsspec.filesystem('abfss')
        fs.upload(local_file_name, remote_path)
    finally:
        # The uploaded copy is the deliverable; remove the local PDF even if the
        # upload failed. The existence check keeps a failure in write_report
        # from being masked by a FileNotFoundError here.
        if os.path.exists(local_file_name):
            os.remove(local_file_name)

    return 1

StatementMeta(SampleSpark, 7, 45, Finished, Available)

In [60]:
%%sh
# Debug aid: list the files in the current working directory.
ls

StatementMeta(SampleSpark, 7, 60, Finished, Available)

__spark_conf__
container_tokens
default_container_executor.sh
default_container_executor_session.sh
launch_container.sh
post1.pdf
sparkr
template.pug
tmp


In [54]:
%%sh
# Debug aid: list the files in the current working directory.
ls

StatementMeta(SampleSpark, 7, 54, Finished, Available)

__spark_conf__
container_tokens
default_container_executor.sh
default_container_executor_session.sh
launch_container.sh
sparkr
template.pug
tmp


In [61]:
# Command for testing the createDocument function
# createDocument(1, "Neki tekst posta...", "https://storage.googleapis.com/sharecaremedia/images/charity1.png", "Moj charity", "Tomislav", "Tipuric", "totipu@nephos.eu", "Croatia")

StatementMeta(SampleSpark, 7, 61, Finished, Available)

Ignored `background-color: lighten(--primary-color, 33%)` at 32:3, invalid value.


1

In [67]:
# Generate and upload one PDF per merged row. The column order below mirrors
# createDocument's positional parameters; the Series of return values produced
# by apply is the cell's output.
DOCUMENT_COLUMNS = ["postId", "text", "imageUrl", "name", "firstName", "lastName", "eMail", "country"]
df_posts_media_charities_users.apply(
    lambda row: createDocument(*(row[column] for column in DOCUMENT_COLUMNS)),
    axis=1,
)

StatementMeta(SampleSpark, 7, 67, Finished, Available)

Ignored `background-color: lighten(--primary-color, 33%)` at 32:3, invalid value.
Ignored `background-color: lighten(--primary-color, 33%)` at 32:3, invalid value.
Ignored `background-color: lighten(--primary-color, 33%)` at 32:3, invalid value.
Ignored `background-color: lighten(--primary-color, 33%)` at 32:3, invalid value.
Ignored `background-color: lighten(--primary-color, 33%)` at 32:3, invalid value.
Ignored `background-color: lighten(--primary-color, 33%)` at 32:3, invalid value.
Ignored `background-color: lighten(--primary-color, 33%)` at 32:3, invalid value.
Ignored `background-color: lighten(--primary-color, 33%)` at 32:3, invalid value.
Ignored `background-color: lighten(--primary-color, 33%)` at 32:3, invalid value.
Ignored `background-color: lighten(--primary-color, 33%)` at 32:3, invalid value.
Ignored `background-color: lighten(--primary-color, 33%)` at 32:3, invalid value.
Ignored `background-color: lighten(--primary-color, 33%)` at 32:3, invalid value.
Ignored `backgro

0      1
1      1
2      1
3      1
4      1
      ..
97     1
98     1
99     1
100    1
101    1
Length: 102, dtype: int64