Copyright (c) Microsoft Corporation. All rights reserved.

Licensed under the MIT License.

## This cell configures the Spark session - do not change

In [None]:
%%configure -f
{
"conf": {
     "spark.rpc.message.maxSize": 1024,
     "spark.kryoserializer.buffer.max": "256m"
   }
}

StatementMeta(, 45, -1, Finished, Available)

## These are the parameters that need to be changed to your values

In [None]:
# --- Input / output locations ---

# Directory that holds the input text files
input_directory = "abfss://share@datadiscoverypipeline.dfs.core.windows.net/bbcsports"
# Directory the output is written to
output_directory = "abfss://share@datadiscoverypipeline.dfs.core.windows.net/videos_outputs/"
# Name of the output file
output_filename = "bbc_text_summarisation.csv"
# When True the output is written without coalescing, so the Coalesce notebook
# has to be run afterwards to merge the partition files into a single file
LOW_MEMORY_MODE = True

# --- AML experiment tracking ---

# Azure subscription id
subscription_id = ""
# Resource group of the AzureML workspace
resource_group = ""
# Name of the AzureML workspace
workspace_name = ""

StatementMeta(DataDiscovery, 45, 1, Finished, Available)

## Track the Experiment in Azure ML

In [None]:
from azureml.core import Workspace, Experiment, Run
import mlflow

# Connect to the Azure ML workspace and point MLflow at its tracking server.
ws = Workspace(subscription_id=subscription_id, resource_group=resource_group, workspace_name=workspace_name)
mlflow.set_tracking_uri(ws.get_mlflow_tracking_uri())

# The experiment is named "(<notebook name>_<job id>)"; the same name is
# reused later in the notebook as the Azure Search index name.
experiment_name = f"({mssparkutils.runtime.context['notebookname']}_{str(mssparkutils.env.getJobId())})"
mlflow.set_experiment(experiment_name)

# Log the user-supplied notebook parameters.
# The input parameter of this notebook is `input_directory`; the name
# `input_filename` is not defined anywhere and raised a NameError here.
mlflow.log_param("input_directory", input_directory)
mlflow.log_param("output_directory", output_directory)
mlflow.log_param("output_filename", output_filename)
mlflow.log_param("LOW_MEMORY_MODE", LOW_MEMORY_MODE)

# Log the Synapse runtime context of this run.
params = {
    "sparkpool": mssparkutils.runtime.context['sparkpool'],
    "workspace": mssparkutils.runtime.context['workspace'],
    "notebookname": mssparkutils.runtime.context['notebookname'],
    "isForPipeline": mssparkutils.runtime.context['isForPipeline'],
    "pipelinejobid": mssparkutils.runtime.context['pipelinejobid'],
}

mlflow.log_params(params)
mlflow.pyspark.ml.autolog()

In [None]:
from  pyspark.sql.functions import input_file_name
from pyspark.sql.types import StructType, StructField, StringType, ArrayType, FloatType, IntegerType
from pyspark.sql.functions import spark_partition_id
from pyspark.ml.linalg import Vectors, VectorUDT
from transformers import PegasusTokenizer, PegasusForConditionalGeneration
import os
from graphframes import *

# Read every file under the input directory as a single row (wholetext=True)
# and tag each row with the path of the file it came from.
df = (
    spark.read
    .text(input_directory, wholetext=True)
    .withColumn("filename", input_file_name())
)

StatementMeta(DataDiscovery, 45, 7, Finished, Available)

# Run the abstractive summarisation process

In [None]:
import numpy as np
import torch


def pegasus_summarise_text(df):
    """Summarise every text in ``df`` with the ``google/pegasus-xsum`` model.

    Args:
        df: pandas DataFrame with a ``value`` column (the text to summarise)
            and a ``filename`` column.

    Returns:
        A DataFrame with the ``filename`` column of ``df`` and a ``value``
        column that holds the generated summary of each row.
    """
    checkpoint = "google/pegasus-xsum"
    # The model and tokenizer are loaded on every call of this function.
    model = PegasusForConditionalGeneration.from_pretrained(checkpoint)
    tokenizer = PegasusTokenizer.from_pretrained(checkpoint)

    def _summarise(document):
        # Inputs longer than 512 tokens are truncated before generation.
        encoded = tokenizer(document, max_length=512, return_tensors="pt", truncation=True)
        generated_ids = model.generate(encoded["input_ids"])
        decoded = tokenizer.batch_decode(
            generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
        )
        # One input text per call, so only the first decoded sequence is used.
        return decoded[0]

    summaries = [_summarise(document) for document in df.value.values.tolist()]

    return df[["filename"]].assign(value=summaries)

StatementMeta(DataDiscovery, 45, 8, Finished, Available)

# Define the schema

In [None]:
# Output schema of pegasus_summarise_text: two nullable string columns.
summary_schema = (
    StructType()
    .add(StructField("filename", StringType(), True))
    .add(StructField("value", StringType(), True))
)

# Apply the summariser to each group of rows that share a filename.
grouped_by_file = df.groupBy("filename")
summary_df = grouped_by_file.applyInPandas(pegasus_summarise_text, summary_schema)


StatementMeta(DataDiscovery, 45, 9, Finished, Available)

## Save the outcome with two operations (usual or coalesce)

In [None]:
# Keep a handle on the summaries for the optional concept-graph section.
df_graph = summary_df

summary_path = os.path.join(output_directory, output_filename)

# In low-memory mode the data is written as-is (one file per partition);
# otherwise it is coalesced into a single partition before writing.
frame_to_write = summary_df if LOW_MEMORY_MODE else summary_df.coalesce(1)
frame_to_write.write.mode('overwrite').options(header='true').csv(summary_path)

# Close the MLflow run.
mlflow.pyspark.ml.mlflow.end_run()

StatementMeta(DataDiscovery, 45, 10, Finished, Available)

# Optional: Add Azure Cognitive Search

## Add Search Parameters

In [None]:
# Azure Search Admin Key
search_admin_key = ""
# The name of the search service
search_service_name = ""
# The Azure Search Query Key
search_query_key = ""

In [None]:
from synapse.ml.cognitive import *
from pyspark.sql.functions import monotonically_increasing_id, lit

# Remove the "_c0" column before indexing.
df = df.drop("_c0")

# Every document needs a string key and an action column telling the
# search service what to do with the row.
search_ready_df = (
    df
    .withColumn("key", monotonically_increasing_id().cast("string"))
    .withColumn("SearchAction", lit("upload"))
)

# NOTE(review): the index is named after `experiment_name`, which contains
# parentheses and an underscore — confirm the search service accepts this
# as an index name.
search_ready_df.writeToAzureSearch(
    subscriptionKey=search_admin_key,
    actionCol="SearchAction",
    serviceName=search_service_name,
    indexName=experiment_name,
    keyCol="key",
)

## Search the generated Azure Search Index

In [None]:
import requests
from IPython.display import display, Markdown

# Term to look up in the index written by the previous cell.
term_to_search_for = "covid"

url = "https://{}.search.windows.net/indexes/{}/docs/search?api-version=2019-05-06".format(
    search_service_name, experiment_name
)
response = requests.post(url, json={"search": term_to_search_for}, headers={"api-key": search_query_key})
# Surface HTTP failures (bad key, missing index, ...) as an HTTP error instead
# of an unrelated KeyError on 'value' below.
response.raise_for_status()
jdata = response.json()

for doc in jdata['value']:
    display(Markdown(f'**Search Score {doc["@search.score"]}** Document {doc["filename"]}'))
    # The indexed DataFrame keeps the document text in its "value" column;
    # it has no "text" column.
    display(Markdown(f'{doc["value"]}'))


## Implement Semantic Search

1) [Enable Semantic Search](https://docs.microsoft.com/en-us/azure/search/semantic-search-overview#enable-semantic-search) on your search instance

2) [Configure Semantic Search](https://docs.microsoft.com/en-us/azure/search/semantic-how-to-query-request?tabs=semanticConfiguration%2Cportal#create-a-semantic-configuration)

In [None]:
# Natural-language question for the semantic query.
term_to_search_for = "Whose thumb was fractured?"

url = "https://{}.search.windows.net/indexes/{}/docs/search?api-version=2021-04-30-Preview".format(
    search_service_name, experiment_name
)

# Semantic query requesting extractive answers and highlighted captions.
# "config" is the name of the semantic configuration on the index.
semantic_query = {
    "search": term_to_search_for,
    "queryType": "semantic",
    "semanticConfiguration": "config",
    "queryLanguage": "en-us",
    "answers": "extractive|count-3",
    "captions": "extractive|highlight-true",
    "highlightPreTag": "<mark>",
    "highlightPostTag": "</mark>",
}
response = requests.post(url, json=semantic_query, headers={"api-key": search_query_key})
# Surface HTTP failures (semantic search not enabled, bad key, ...) as an HTTP
# error instead of an unrelated KeyError on 'value' below.
response.raise_for_status()
jdata = response.json()

for doc in jdata['value']:
    display(Markdown(f'**Search Score {doc["@search.score"]}** **Search rerankerScore Score {doc["@search.rerankerScore"]}** Document {doc["filename"]}'))
    display(Markdown(f'@search.captions {doc["@search.captions"]}'))


# Optional: Build the Concept Graph using GraphFrames

## Optional: Amend this section to build your concept graph

In [None]:
from itertools import combinations
from operator import itemgetter  # was used below without ever being imported

# Number of highest-weight edges that are coloured red. The name was used
# below without being defined; a value set in an earlier cell is kept,
# otherwise a default is used.
if "number_of_connected_nodes" not in globals():
    number_of_connected_nodes = 10

# One entry per row of the `value` column.
# NOTE(review): `value` is a string column, so `combinations(row, 2)` below
# pairs the individual characters of each string — presumably each row should
# be turned into a list of concepts first; amend for your data.
lst_text = df_graph.select('value').rdd.flatMap(lambda x: x).collect()

lst_source_node = []
lst_source_node_weight = []
lst_source_node_label = []
lst_target_node = []
lst_target_node_weight = []
lst_target_node_label = []
lst_source_url = []
lst_target_url = []
lst_edge_weight = []
lst_edge_colour_weight = []

lst_g_nodes = []
lst_g_edges = []

# Maps "<source>_<target>" to the number of times the pair has been seen.
# A pair is stored under the orientation in which it was first seen.
dict_nodes = {}

for row in lst_text:

    combos = list(combinations(row, 2))

    # First update the edge weights for every pair in this row
    for source, target in combos:
        forward_key = source + "_" + target
        reverse_key = target + "_" + source
        if forward_key in dict_nodes:
            dict_nodes[forward_key] += 1
        elif reverse_key in dict_nodes:
            dict_nodes[reverse_key] += 1
        else:
            dict_nodes[forward_key] = 1  # initialise and create first combo

    # Then record nodes, edges and the weights counted so far
    for source, target in combos:
        lst_source_node.append(source)
        lst_g_nodes.append((source, source))
        lst_target_node.append(target)
        lst_g_nodes.append((target, target))

        forward_key = source + "_" + target
        if forward_key in dict_nodes:
            weight = dict_nodes[forward_key]
            lst_g_edges.append((source, target, "related"))
        else:
            # Pair is stored in the opposite orientation; no graph edge is
            # added for this occurrence.
            weight = dict_nodes[target + "_" + source]

        lst_edge_weight.append(weight)
        lst_source_node_weight.append(weight)
        lst_target_node_weight.append(weight)

# The `number_of_connected_nodes` pairs with the highest weight
topn = dict(sorted(dict_nodes.items(), key=itemgetter(1), reverse=True)[:number_of_connected_nodes])

# Assign edge weight colour: red for the top pairs, black for the rest
lst_edge_colour_weight.extend(
    "red" if (source + "_" + target in topn or target + "_" + source in topn) else "black"
    for source, target in zip(lst_source_node, lst_target_node)
)

# Create the Graph RDD
columns = ['source', 'target', 'source_node_weight', 'target_node_weight', 'edge_weight', 'edge_colour']
df_concept_graph = spark.createDataFrame(zip(lst_source_node, lst_target_node, lst_source_node_weight, lst_target_node_weight, lst_edge_weight, lst_edge_colour_weight), columns)

# Create a Vertex DataFrame with unique ID column "id".
# lst_g_nodes holds one entry per occurrence, so duplicates are removed
# (keeping first-seen order) to make the ids unique.
unique_nodes = list(dict.fromkeys(lst_g_nodes))
v = sqlContext.createDataFrame(unique_nodes, ["id", "name"])
# Create an Edge DataFrame with "src" and "dst" columns
e = sqlContext.createDataFrame(lst_g_edges, ["src", "dst", "relationship"])

## Show degree connectivity

In [None]:
from graphframes import GraphFrame
# Build the graph from the vertex and edge DataFrames.
g = GraphFrame(v, e)

# In-degree of each vertex, highest first.
df_degree = g.inDegrees
df_degree.sort('inDegree', ascending=False).show()

## Run PageRank 

In [None]:
# Query: Count the number of "related" connections in the graph.
# Edges are created with the relationship label "related", so filtering on
# 'relationship' always returned 0; the count is printed because it was
# otherwise discarded.
related_edge_count = g.edges.filter("relationship = 'related'").count()
print(f"Number of 'related' edges: {related_edge_count}")

# Run PageRank algorithm, and show results.
results = g.pageRank(resetProbability=0.01, maxIter=20)
results.vertices.select("id", "pagerank").show()

## Save the concept graph

In [None]:
# The last four characters of the output filename are dropped before
# "concept_graph.csv" is appended.
concept_graph_name = output_filename[:-4] + "concept_graph.csv"
concept_graph_path = os.path.join(output_directory, concept_graph_name)

df_concept_graph.write.mode('overwrite').options(header='true').csv(concept_graph_path)