Copyright (c) Microsoft Corporation. All rights reserved.

Licensed under the MIT License.

# OpenAI and Clustering Demo in Azure Synapse/Trident using the BBC Sports Dataset

# To run this notebook, ensure you have run the deployment script in the Data Discovery Toolkit - [here](https://github.com/microsoft/Data-Discovery-Toolkit#if-you-do-not-have-a-synapse-workspace)

## This notebook will showcase the following:

- Extracting OpenAI embeddings against a dataframe
- Clustering the dataset using KMEANS against OpenAI vectors
- PCA Dimensionality Reduction for PowerBI Scatterplot visualisation against OpenAI vectors
- Using OpenAI to automatically label the clusters in a dataset based on most common themes
- OpenAI Retrieval Augmented Generation Pattern using Azure Cognitive Search and standard lucene keyword search
- OpenAI Retrieval Augmented Generation Pattern using Azure Cognitive Search and Semantic Search
- OpenAI Retrieval Augmented Generation Pattern using Azure Cognitive Search and Azure Synapse/Trident SQL
- Search Result evaluation using Jaccard distance measure
- Search Result evaluation using Manhattan distance measure
- Search Result evaluation using Jaccard distance measure
- Search Result evaluation using Euclidean distance measure
- Search Result evaluation using Chebysev distance measure
- Search Result evaluation using Cosine distance measure using spaCy vectors
- Search Result evaluation using spaCy similarity
- Search Result evaluation using roBERTa natural language entailment
- Search Result evaluation using Jaccard distance measure
- Search Result evaluation using the Spearman Ranking Correlation Coeeficient against spaCy vectors
- Search Result evaluation using the Weighted Pearson Correlation Coefficient against spaCy vectors
- Search Result evaluation using Levenshtein distance distance measure
- Find the most similar content to any input data in the dataset using OpenAI vectors
- Build a Knowledge Graph from the text in the dataset for PowerBI Network Navigator visualisation
- Show Knowledge Graph Connectivity Degrees
- Show Knowledge Graph Page Rank

- TODO - Generate SQL using Codex
- TODO - Run evaluation against OpenAI vectors




## This cell configures the spark session - Do not change (not needed for Trident)

In [45]:
%%configure -f
{
"conf": {
     "spark.rpc.message.maxSize": 1024,
     "spark.kryoserializer.buffer.max": "1024m"
   }
}

StatementMeta(, 30, -1, Finished, Available)


## These are the parameters that need to be changed to your values

In [None]:
from pyspark.ml.feature import HashingTF, IDF, Tokenizer, CountVectorizer, StopWordsRemover, PCA, RegexTokenizer
from pyspark.ml.clustering import LDA, KMeans, BisectingKMeans
from pyspark.ml import Pipeline
from pyspark import SparkContext, SparkConf
import sys
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType
from pyspark.sql import SparkSession
import ntpath
import os
import numpy as np
import openai
from math import sqrt
from scipy.stats import spearmanr
import torch
import torch.nn.functional as F


from pyspark.ml.linalg import Vectors, VectorUDT
from pyspark.sql.functions import udf,col

from itertools import combinations
from operator import itemgetter

from graphframes import *
from pyspark.sql.functions import monotonically_increasing_id, lit


import os
import openai
from azure.identity import DefaultAzureCredential
from azure.search.documents import SearchClient
from azure.search.documents.models import QueryType
from azure.core.credentials import AzureKeyCredential

import spacy
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from fuzzywuzzy import fuzz
from IPython.display import display, Markdown

# This will need to have been installed via the deployment script in the Data Discovery Toolkit
nlp = spacy.load('en_core_web_lg')

nli_model = AutoModelForSequenceClassification.from_pretrained('facebook/bart-large-mnli')
tokenizer = AutoTokenizer.from_pretrained('facebook/bart-large-mnli')

nli_weights = sc.broadcast(nli_model)


# OpenAI deployment values - Add your values
openai_chatgpt_deployment = "chat" # Currently a gpt-35-turbo model
openai_gpt_deployment = "davinci" # Currently a text-davinci-003 model
openai_service = ""
openai_api_key = ""
openai_api_type = "azure"
openai_api_base = f"https://{openai_service}.openai.azure.com"
openai_api_version = "2022-12-01"
openai_embeddings_deployment = "" # Currently a text-embedding-ada-002 model
openai_code_deployment = "" # Currently a code-davinci-002 model

# If you have a RPM rate limit on your OpenAI account - sleep time in seconds
openai_sleep_time = 80

# The input file name - change to your BBC CSV file
input_filename = 'abfss://share@datadiscoverypipeline2.dfs.core.windows.net/bbcsports/csv/sport_articles.csv'

# The number of clusters - this can be automated or start with a guesstimate - set to 5 for BBC dataset
number_of_clusters = 5

# Concept Graph - # Get top N most connected nodes   
number_of_connected_nodes = 5


SCATTER_PLOT_3D = False


## Distance Measures for Evaluation

In [47]:
# Jaccard Distance
def get_jaccard_sim(str1, str2): 
    a = set(str1.split()) 
    b = set(str2.split())
    c = a.intersection(b)
    return float(len(c)) / (len(a) + len(b) - len(c))

# Euclidean Distance
def euclidean(x, y):
    distance = 0
    for a, b in zip(x, y):
        distance += (sum([(pow((a-b),2))]))
    return sqrt(distance)

# Manhattan distance
def manhattan(x, y):
    distance=0
    for a,b in zip(x,y):
        distance += sum([abs(a-b)])
    return distance

#chebyshev distance
def chebyshev(x,y):
    distance = []
    for a,b in zip(x,y):
        distance.append(abs(a-b))
    return max(distance)

# Weighted Pearson Correlation Coefficient
def m(x, w):
    """Weighted Mean - Weighted Pearson Correlation Coefficient"""
    return np.sum(x * w) / np.sum(w)

def cov(x, y, w):
    """Weighted Covariance - Weighted Pearson Correlation Coefficient"""
    return np.sum(w * (x - m(x, w)) * (y - m(y, w))) / np.sum(w)

def corr(x, y, w):
    """Weighted Correlation - Weighted Pearson Correlation Coefficient"""
    return cov(x, y, w) / np.sqrt(cov(x, x, w) * cov(y, y, w))

def spacy_tokenizer(sentence):
    return [word.lemma_ for word in nlp(sentence) if not (word.like_num or word.is_stop or word.is_punct or word.is_space)] # remove numbers 

StatementMeta(DataDiscovery, 30, 3, Finished, Available)

## Optional: Clustering - Apply Data Preprocessing, OpenAI Embeddings, Kmeans clustering and PCA dimensionality reduction

In [36]:
df = spark.read.load(input_filename, header=True, format='csv')

def openai_embeddings(text):

    from time import sleep
    sleep(openai_sleep_time) # Avoid rate limit if needed
    openai.api_key = openai_api_key
    openai.api_type = openai_api_type
    openai.api_base = openai_api_base
    openai.api_version = openai_api_version
    deployment_id = openai_embeddings_deployment

    embeddings = openai.Embedding.create(deployment_id=deployment_id,
                                     input=text[:2000]) # Take first 2000 characters or calculate tokens
    return Vectors.dense(embeddings['data'][0]['embedding'])

# Vectorise processed text to OpenAI features
udf_openai_vectoriser = udf(openai_embeddings, VectorUDT()) 
df = df.withColumn("OpenAI_features", udf_openai_vectoriser(col("text")))

# Apply PCA and Kmeans in a pipeline
pca = PCA(k=20, inputCol="OpenAI_features")
pca.setOutputCol("features")

if SCATTER_PLOT_3D:
  pca_2 = PCA(k=3, inputCol="features")
else:  
  pca_2 = PCA(k=2, inputCol="features")
  
pca_2.setOutputCol("pca_scatterplot_features")

kmeans = KMeans(k=number_of_clusters, seed=42, initMode="k-means||", distanceMeasure="euclidean")

pipeline = Pipeline(stages=[ pca, kmeans, pca_2])
model = pipeline.fit(df)
df_coords = model.transform(df)
# Let's write the data to a SQL Table
df_coords.write.saveAsTable("openai_bbc_coords_vectors")

StatementMeta(DataDiscovery, 26, 3, Finished, Available)

## If clustering is not run with vectors then load the data directly to a SQL table

In [48]:
df = spark.read.load(input_filename, header=True, format='csv')
try:
    df.write.saveAsTable("openai_bbc_coords2")
except Exception as TableAlreadyExists:
    print(TableAlreadyExists)
df = spark.sql("SELECT * FROM openai_bbc_coords2 LIMIT 1000")
display(df)

StatementMeta(DataDiscovery, 30, 4, Finished, Available)

Table `openai_bbc_coords2` already exists.


DataFrame[_c0: string, filename: string, text: string]

## Run a test SQL query

In [49]:
term_to_search_for = "thumb"
df = spark.sql(f"SELECT * FROM openai_bbc_coords2 where text like '%{term_to_search_for}%'")
display(df.show())

StatementMeta(DataDiscovery, 30, 5, Finished, Available)

+---+--------------+--------------------+
|_c0|      filename|                text|
+---+--------------+--------------------+
|215|cricket093.txt|vaughan endures d...|
|303|  rugby059.txt|"robinson out of ...|
|543|  rugby130.txt|"celts savour gra...|
|605|  rugby008.txt|"thomas out of si...|
+---+--------------+--------------------+



None

# Optional: Populate these values if using Azure Cognitive Search

## Add Search Parameters

In [50]:
# Azure Search Admin Key
search_admin_key = ""
# The name of the search service
search_service_name = ""
# The Azure Search Query Key
search_query_key = ""
search_index = "bbc-index"
# This is the name of the semantic configuration on the search index
semantic_configuration_name = "config"

search_client = SearchClient(
    endpoint=f"https://{search_service_name}.search.windows.net",
    index_name=search_index,
    credential=AzureKeyCredential(search_admin_key))

StatementMeta(DataDiscovery, 30, 6, Finished, Available)

## This code creates an Azure Cognitive Search Index 

In [None]:
from synapse.ml.cognitive import *
from pyspark.sql.functions import monotonically_increasing_id, lit

df = df.drop("_c0")

(
    df.withColumn("key", monotonically_increasing_id().cast("string"))
    .withColumn("SearchAction", lit("upload"))
    .writeToAzureSearch(
        subscriptionKey=search_admin_key,
        actionCol="SearchAction",
        serviceName=search_service_name,
        indexName=search_index,  # Defaults to the notebook name
        keyCol="key",
    )
)

## Optional: Test the Azure Cognitive Search Query

In [12]:
r = search_client.search("Whose thumb was fractured?", top=5)

scores = {}

results = [doc["content"].replace("\n", "").replace("\r", "") for doc in r]
for i, doc in enumerate(results):
    print(doc)

StatementMeta(DataDiscovery, 27, 13, Finished, Available)

Thomas out of Six NationsWales captain Gareth Thomas has been ruled out of the rest of the Six Nations with a broken thumb.The full-back will have surgery on Monday after fracturing his thumb in the 24-18 win over France on Saturday. But Welsh legend Phil Bennett insisted Wales can cope without Thomas as they chase a first Grand Slam in 27 years. Bennett told BBC Sport: "Such is the spirit in the camp, they'll put Kevin Morgan at 15, Rhys Williams at wing and just carry on." Thomas will miss the match against Scotland on 13 March, and what promises to be a huge encounter against the Irish six days later. Bennett added: "It's a setback. He's a great captain, he leads from the front and the boys love him." Thomas was replaced at half-time by Williams as his side turned around a 15-6 deficit in Paris."With Gareth missing I would think Michael Owen will be our captain," said Wales coach Mike Ruddock. "He did a great job in the second half in France. He has been vice-captain all along throu

## Search the generated Azure Search Index via standard search and run evaluation

In [13]:
term_to_search_for = "Whose thumb was fractured?"

r = search_client.search(term_to_search_for, top=5)

scores = {}

results = [doc for doc in r]
for i, doc in enumerate(results):

    hyp = doc["content"].replace("\n", " ")
    display(Markdown(f'*Candidate answer: {hyp}*'))
    scores[i] = 0

    # roBERTA Entailment
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    # run through model pre-trained on MNLI
    x = tokenizer.encode(term_to_search_for, hyp, return_tensors='pt',
                            truncation_strategy='only_first')
    logits = nli_model(x.to(device))[0]

    # we throw away "neutral" (dim 1) and take the probability of
    # "entailment" (2) as the probability of the label being true
    entail_contradiction_logits = logits[:, [0, 2]]
    probs = entail_contradiction_logits.softmax(dim=1)
    prob_label_is_true = probs[:, 1]
    print(f"Entailment score {prob_label_is_true[0]}")
    scores[i] += prob_label_is_true[0]

    # Fuzz ratio (Levenschein Distance)
    score = fuzz.ratio(term_to_search_for, hyp)
    print(f"Fuzz ratio {score}")
    scores[i] += score

    # spaCy cosine similarity
    doc1 = nlp(term_to_search_for)
    doc2 = nlp(hyp)
    score = doc1.similarity(doc2)
    print(f"spaCy {score}")
    scores[i] += score

    # Jaccard Distance
    score = get_jaccard_sim(term_to_search_for, hyp)
    print(f"Jaccard distance {score}")
    scores[i] += score

    # Euclidean Distance
    score = (100 - euclidean(nlp(term_to_search_for).vector, nlp(hyp).vector)) / 100
    print(f"Euclidean distance spaCy {score}")
    scores[i] += score

    # Manhattan Distance
    score = (100 - manhattan(nlp(term_to_search_for).vector, nlp(hyp).vector)) / 100
    print(f"Manhattan distance spaCy {score}")
    scores[i] += score

    # Chebysev Distance
    score = (100 - chebyshev(nlp(term_to_search_for).vector, nlp(hyp).vector)) / 100
    print(f"Chebyshev distance spaCy {score}")
    scores[i] += score

    # Spearman Ranking Correlation Coeeficient spaCy
    score, _ = spearmanr(nlp(term_to_search_for).vector, nlp(hyp).vector)
    print(f"Spearman Ranking Correlation Coeeficient spaCy {score}")
    scores[i] += score

    # Weighted Pearson Correlation Coefficient spaCy
    weights = nlp(hyp).vector.shape[0]
    score = corr(nlp(term_to_search_for).vector, nlp(hyp).vector, weights)
    print(f"Weighted Pearson Correlation Coefficient spaCy {score}")
    scores[i] += score

    print(f'**Total Custom Distance Measure Score {scores[i]}**')
    print(f'**Azure Cognitive Search Score {doc["@search.score"]}**')


StatementMeta(DataDiscovery, 27, 14, Finished, Available)

*Candidate answer: Thomas out of Six Nations  Wales captain Gareth Thomas has been ruled out of the rest of the Six Nations with a broken thumb.  The full-back will have surgery on Monday after fracturing his thumb in the 24-18 win over France on Saturday. But Welsh legend Phil Bennett insisted Wales can cope without Thomas as they chase a first Grand Slam in 27 years. Bennett told BBC Sport: "Such is the spirit in the camp, they'll put Kevin Morgan at 15, Rhys Williams at wing and just carry on." Thomas will miss the match against Scotland on 13 March, and what promises to be a huge encounter against the Irish six days later. Bennett added: "It's a setback. He's a great captain, he leads from the front and the boys love him." Thomas was replaced at half-time by Williams as his side turned around a 15-6 deficit in Paris.  "With Gareth missing I would think Michael Owen will be our captain," said Wales coach Mike Ruddock. "He did a great job in the second half in France. He has been vice-captain all along throughout the championship." Wales travel to Edinburgh to take on Scotland in a fortnight and then host Ireland in Cardiff in the final round of matches in what could be the Grand Slam and championship decider. Bennett, an inspirational fly-half for Llanelli and Wales in the 1970s, insisted the national team were entering a new golden period. "It was a great game and a magnificent result for Wales," Bennett told BBC Radio Five Live's Sportsweek programme.  "The way this young team are blending, the glory days are on their way back. "We couldn't get possession early on and France dominated and scored two tries. "Had they been ruthless, Wales could have gone into the interval 30 points down. But they didn't take their chances. "Wales defended fairly well but you cannot give that sort of quality ball to good sides. "The All Blacks would have been ruthless and buried us in the first half. But the character we showed in the second half was quite outstanding."  *



Entailment score 0.9160066246986389
Fuzz ratio 0
spaCy 0.4943371490776689
Jaccard distance 0.008771929824561403
Euclidean distance spaCy 0.7367202900069381
Manhattan distance spaCy -2.6262820645794274
Chebyshev distance spaCy 0.958489465713501
Spearman Ranking Correlation Coeeficient spaCy 0.4607033411482349
Weighted Pearson Correlation Coefficient spaCy 0.8178629731922643
**Total Custom Distance Measure Score 1.7666096687316895**
**Azure Cognitive Search Score 5.5757585**


*Candidate answer: Kenyon denies Robben Barca return  Chelsea chief executive Peter Kenyon has played down reports that Arjen Robben will return for the Champions League match against Barcelona.  "He's been responding well to treatment and started running on Friday, but we'll have to wait and see," he told BBC Five Live's Sportsweek. "We're looking to getting him back as soon as possible, but he'll be back when it's right for him and for us. "There's no plans at the moment around the Barcelona game." His comments contradict those of chiropractor Jean Pierre Meersseman who treated the Dutchman after he fractured his foot at the start of February. Robben had been expected to be out for six weeks, but Meersseman hinted that the winger could be fit for the vital Stamford Bridge game on 8 March. "I hope he can be back and I will try to help him make that happen," Meersseman told the Mail on Sunday. "I put everything right with Arjen's foot the last time I saw him 12 days ago. It was an obvious correction and easy to perform. "I know he was pleased with what I did and now that he is running again. I am due to see him one more time again in the next few days." Meersseman is the medical co-ordinator at Italian side AC Milan.  *

Entailment score 0.02319514751434326
Fuzz ratio 0
spaCy 0.5441085140968871
Jaccard distance 0.006535947712418301
Euclidean distance spaCy 0.7441788759878958
Manhattan distance spaCy -2.579768744409084
Chebyshev distance spaCy 0.959799485206604
Spearman Ranking Correlation Coeeficient spaCy 0.525475171946355
Weighted Pearson Correlation Coefficient spaCy 0.7852365235676927
**Total Custom Distance Measure Score 1.008760929107666**
**Azure Cognitive Search Score 4.4981585**


*Candidate answer: Vickery upbeat about arm injury  England prop Phil Vickery is staying positive despite a broken arm ruling him out of the RBS Six Nations.  The 28-year-old fractured the radius in his right forearm during Gloucester's 17-16 win over Bath on Saturday. He will undergo an operation on Monday and is expected to be out for at least six weeks. He said: "This isn't an injury that will stop me from working hard on the fitness elements and being around the lads." He added: "I've got the operation this afternoon and I could be back doing fitness work after a week." "As frustrating as it is, I've got to be positive."  After the game, Vickery spoke with Bath prop David Barnes, who also broke his arm recently. "I had a chat with David Barnes and it looks like a similar injury to him," he said. "He said he had the operation and he was back running after a week. "There's no doubt that I'm going to get involved and be around this place as soon as I can after the operation." Gloucester director of rugby Nigel Melville said: "Phil has broken his radius, which is the large bone in his forearm. "I don't really know how it happened, but Phil will definitely be out of action for at least six weeks. "I feel very sorry for him, as he has been in great shape. He really needed 80 minutes of rugby this weekend, and then this happened. Mentally, it must be very hard for him."  *

Entailment score 0.7979953289031982
Fuzz ratio 0
spaCy 0.5652431677987757
Jaccard distance 0.006211180124223602
Euclidean distance spaCy 0.7491913911921283
Manhattan distance spaCy -2.51705733448267
Chebyshev distance spaCy 0.9626625299453735
Spearman Ranking Correlation Coeeficient spaCy 0.541580239780442
Weighted Pearson Correlation Coefficient spaCy 0.7972035895995842
**Total Custom Distance Measure Score 1.9030301570892334**
**Azure Cognitive Search Score 4.2438555**


*Candidate answer: Cole refuses to blame van Persie  Ashley Cole has refused to blame Robin van Persie for leaving Arsenal with no fully-fit strikers for the FA Cup fifth round replay at Sheffield United.  Van Persie is suspended alongside Dennis Bergkamp and Jose Antonio Reyes after being sent off at Southampton when Arsenal had a numerical advantage. Thierry Henry is ruled out with an Achilles tendon injury but Cole said: "No-one is putting the blame on Robin. "It's just something that happens on the spur of the moment." Cole added: "I've done it before and I hope they didn't blame me for anything. "Of course he'll learn. I've been sent off a couple of times now and it's just one of those things when you go a bit crazy for one or two seconds. Freddie Ljungberg is likely to be used in an emergency striking role and will be partnered by either Arturo Lupoli, Quincy Owusu-Abeyie or Jeremie Aliadiere. Gunners boss Arsene Wenger said: "Freddie is an option but we need a second striker. "I have to decide whether it will be Aliadiere, Quincy or Lupoli who will start with him up front. Those three will be involved." Arsenal are also without winger Robert Pires, who sustained an ankle injury at St Mary's. Wenger added: "It doesn't look like anything is fractured, but it is a good ankle sprain. "It does not look like Pires will be ready for two to three weeks."  *

Entailment score 0.286815345287323
Fuzz ratio 0
spaCy 0.5782458348190921
Jaccard distance 0.0
Euclidean distance spaCy 0.7683413514889219
Manhattan distance spaCy -2.2697082729265095
Chebyshev distance spaCy 0.9633776330947876
Spearman Ranking Correlation Coeeficient spaCy 0.5522430249224991
Weighted Pearson Correlation Coefficient spaCy 0.8017443486362155
**Total Custom Distance Measure Score 1.68105947971344**
**Azure Cognitive Search Score 4.167914**


*Candidate answer: Wales get Williams fitness boost  Wales are hopeful that openside flanker Martyn Williams could be fit for Saturday's RBS 6 Nations championship opener against England in Cardiff.  Williams was expected to miss the match with a disc problem in his neck, but has been making a speedy recovery. "He will have tests in the next 48 hours and we are pretty optimistic he is getting there," Wales' team physiotherapist Mark Davies said. "It has been frustrating but he is on the mend, he has made good progress." Last week Williams, along with fellow flanker Colin Charvis - who is unlikely to play for at least a month while he recovers from a foot injury - was all but ruled out of the Millennium Stadium clash. With Williams initially thought to be struggling, the signs pointed towards Wales coach Mike Ruddock handing a first cap to former Wales Under-21 skipper Richie Pugh.  Cardiff Blues flanker Williams, 29, offers considerable experience and if he is declared fit then Ruddock might be tempted to include him in the back row. Charvis will be reviewed by the Wales medical staff next Monday, but Davies admitted that there was only an "outside chance" of him being fit to face France in Wales' third championship game on 26 February. Wales' other injury concern is Pugh's fellow Neath-Swansea Ospreys player Sonny Parker, as the centre has a trapped nerve in his neck. "Sonny's injury is still an issue," Davies said. "It is still painful and irritable. We will run the rule of thumb over him in the next couple of days." Ruddock will name his starting line-up for the England game at 1830 GMT on Tuesday evening, as Wales target their first victory in Cardiff over the world champions since 1993.  *

Entailment score 0.522243857383728
Fuzz ratio 0
spaCy 0.5226353966638094
Jaccard distance 0.010582010582010581
Euclidean distance spaCy 0.7444192930929813
Manhattan distance spaCy -2.5673284966498615
Chebyshev distance spaCy 0.9611926651000977
Spearman Ranking Correlation Coeeficient spaCy 0.48937921532461465
Weighted Pearson Correlation Coefficient spaCy 0.62481859178453
**Total Custom Distance Measure Score 1.307942509651184**
**Azure Cognitive Search Score 4.147021**


## Search the generated Azure Search Index with Semantic Search and run evaluation

1) [Enable Semantic Search](https://docs.microsoft.com/en-us/azure/search/semantic-search-overview#enable-semantic-search) on your search instance

2) [Configure Semantic Search](https://docs.microsoft.com/en-us/azure/search/semantic-how-to-query-request?tabs=semanticConfiguration%2Cportal#create-a-semantic-configuration)

In [30]:
term_to_search_for = "Whose thumb was fractured?"

r = search_client.search(term_to_search_for,
                                 #filter=filter,
                                 query_type="semantic",
                                 query_language="en-us",
                                 query_speller="lexicon",
                                 semantic_configuration_name=semantic_configuration_name,
                                 top=5)


scores = {}

results = [doc for doc in r]
for i, doc in enumerate(results):

    try:

        hyp = doc["content"].replace("\n", " ")
        display(Markdown(f'*Candidate answer: {hyp}*'))
        scores[i] = 0

        # roBERTA Entailment
        device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        # run through model pre-trained on MNLI
        x = tokenizer.encode(term_to_search_for, hyp, return_tensors='pt',
                                truncation_strategy='only_first')
        logits = nli_model(x.to(device))[0]

        # we throw away "neutral" (dim 1) and take the probability of
        # "entailment" (2) as the probability of the label being true
        entail_contradiction_logits = logits[:, [0, 2]]
        probs = entail_contradiction_logits.softmax(dim=1)
        prob_label_is_true = probs[:, 1]
        print(f"Entailment score {prob_label_is_true[0]}")
        scores[i] += prob_label_is_true[0]

        # Fuzz ratio (Levenschein Distance)
        score = fuzz.ratio(term_to_search_for, hyp)
        print(f"Fuzz ratio {score}")
        scores[i] += score

        # spaCy cosine similarity
        doc1 = nlp(term_to_search_for)
        doc2 = nlp(hyp)
        score = doc1.similarity(doc2)
        print(f"spaCy {score}")
        scores[i] += score

        # Jaccard Distance
        score = get_jaccard_sim(term_to_search_for, hyp)
        print(f"Jaccard distance {score}")
        scores[i] += score

        # Euclidean Distance
        score = (100 - euclidean(nlp(term_to_search_for).vector, nlp(hyp).vector)) / 100
        print(f"Euclidean distance spaCy {score}")
        scores[i] += score

        # Manhattan Distance
        score = (100 - manhattan(nlp(term_to_search_for).vector, nlp(hyp).vector)) / 100
        print(f"Manhattan distance spaCy {score}")
        scores[i] += score

        # Chebysev Distance
        score = (100 - chebyshev(nlp(term_to_search_for).vector, nlp(hyp).vector)) / 100
        print(f"Chebyshev distance spaCy {score}")
        scores[i] += score

        # Spearman Ranking Correlation Coeeficient spaCy
        score, _ = spearmanr(nlp(term_to_search_for).vector, nlp(hyp).vector)
        print(f"Spearman Ranking Correlation Coeeficient spaCy {score}")
        scores[i] += score

        # Weighted Pearson Correlation Coefficient spaCy
        weights = nlp(hyp).vector.shape[0]
        score = corr(nlp(term_to_search_for).vector, nlp(hyp).vector, weights)
        print(f"Weighted Pearson Correlation Coefficient spaCy {score}")
        scores[i] += score

        print(f'**Total Custom Distance Measure Score {scores[i]}**')
        print(f'**Azure Cognitive Search Score {doc["@search.score"]}**')

    except Exception as swallowed:
        print(f"An error occured on record {i} {swallowed}")
        continue

StatementMeta(DataDiscovery, 29, 9, Finished, Available)

*Candidate answer: Thomas out of Six Nations  Wales captain Gareth Thomas has been ruled out of the rest of the Six Nations with a broken thumb.  The full-back will have surgery on Monday after fracturing his thumb in the 24-18 win over France on Saturday. But Welsh legend Phil Bennett insisted Wales can cope without Thomas as they chase a first Grand Slam in 27 years. Bennett told BBC Sport: "Such is the spirit in the camp, they'll put Kevin Morgan at 15, Rhys Williams at wing and just carry on." Thomas will miss the match against Scotland on 13 March, and what promises to be a huge encounter against the Irish six days later. Bennett added: "It's a setback. He's a great captain, he leads from the front and the boys love him." Thomas was replaced at half-time by Williams as his side turned around a 15-6 deficit in Paris.  "With Gareth missing I would think Michael Owen will be our captain," said Wales coach Mike Ruddock. "He did a great job in the second half in France. He has been vice-captain all along throughout the championship." Wales travel to Edinburgh to take on Scotland in a fortnight and then host Ireland in Cardiff in the final round of matches in what could be the Grand Slam and championship decider. Bennett, an inspirational fly-half for Llanelli and Wales in the 1970s, insisted the national team were entering a new golden period. "It was a great game and a magnificent result for Wales," Bennett told BBC Radio Five Live's Sportsweek programme.  "The way this young team are blending, the glory days are on their way back. "We couldn't get possession early on and France dominated and scored two tries. "Had they been ruthless, Wales could have gone into the interval 30 points down. But they didn't take their chances. "Wales defended fairly well but you cannot give that sort of quality ball to good sides. "The All Blacks would have been ruthless and buried us in the first half. But the character we showed in the second half was quite outstanding."  *



Entailment score 0.9160066246986389
Fuzz ratio 0
spaCy 0.4943371490776689
Jaccard distance 0.008771929824561403
Euclidean distance spaCy 0.7367202900069381
Manhattan distance spaCy -2.6262820645794274
Chebyshev distance spaCy 0.958489465713501
Spearman Ranking Correlation Coeeficient spaCy 0.4607033411482349
Weighted Pearson Correlation Coefficient spaCy 0.8178629731922643
**Total Custom Distance Measure Score 1.7666096687316895**
**Azure Cognitive Search Score 5.5757585**


*Candidate answer: Vickery upbeat about arm injury  England prop Phil Vickery is staying positive despite a broken arm ruling him out of the RBS Six Nations.  The 28-year-old fractured the radius in his right forearm during Gloucester's 17-16 win over Bath on Saturday. He will undergo an operation on Monday and is expected to be out for at least six weeks. He said: "This isn't an injury that will stop me from working hard on the fitness elements and being around the lads." He added: "I've got the operation this afternoon and I could be back doing fitness work after a week." "As frustrating as it is, I've got to be positive."  After the game, Vickery spoke with Bath prop David Barnes, who also broke his arm recently. "I had a chat with David Barnes and it looks like a similar injury to him," he said. "He said he had the operation and he was back running after a week. "There's no doubt that I'm going to get involved and be around this place as soon as I can after the operation." Gloucester director of rugby Nigel Melville said: "Phil has broken his radius, which is the large bone in his forearm. "I don't really know how it happened, but Phil will definitely be out of action for at least six weeks. "I feel very sorry for him, as he has been in great shape. He really needed 80 minutes of rugby this weekend, and then this happened. Mentally, it must be very hard for him."  *

Entailment score 0.7979953289031982
Fuzz ratio 0
spaCy 0.5652431677987757
Jaccard distance 0.006211180124223602
Euclidean distance spaCy 0.7491913911921283
Manhattan distance spaCy -2.51705733448267
Chebyshev distance spaCy 0.9626625299453735
Spearman Ranking Correlation Coeeficient spaCy 0.541580239780442
Weighted Pearson Correlation Coefficient spaCy 0.7972035895995842
**Total Custom Distance Measure Score 1.9030301570892334**
**Azure Cognitive Search Score 4.2438555**


*Candidate answer: Costin aims for comeback in 2006  Jamie Costin should be paralysed.  He says so himself in a matter-of-fact way as he recalls the car accident which occurred nine days before he was scheduled to step out into the Olympic Stadium in Athens for the 50K Walk. There is an ironic chuckle as he talks of his immediate thoughts after a lorry, driving on the wrong side of the road, had ploughed into his rental car. "I was in a lot of pain and I guessed that one of my toes was broken," says the Waterford man. "But I was thinking maybe with a cortisone injection you never know. "In my back, it felt as though all the muscles had been ripped off my pelvis but I was thinking maybe we could do something with laser therapy and ultra sound and hopefully I'd be able to race." It took over 10 hours before Jamie knew with certainty that he would not be competing in his second Olympics. "My back had been broken in two places and with one of my vertebrae, the bottom part had exploded so I'm fierce lucky not be paralysed. "I'd fractured my big toe as well which was on the brake." Jamie didn't finally arrive at hospital in Athens until some nine and a half hours after the accident.  "For the first nine hours, I had no pain killers which was ridiculous in 35 degrees heat. "But once I got the scans and saw them it was a case of moving on and thinking:'OK, I've got a different set of circumstances now'." Within three days he was arriving back in Ireland by air ambulance. Doctors in Athens had wanted to operate on Jamie's back immediately but he insisted on delaying any surgery until he arrived back home - something he is now very relieved about. "The Greek doctors were going to put three or four inch titanium rods either side of my spinal cord up through my vertebrae. "That would have fused all my lower back and I would never have been able to race again. They were really putting a lot of pressure on me to agree to the surgery. "But when I got to the Mater in Dublin they said it was possible for it to heal totally naturally which is giving me the chance to get back into competition which is very important to me. The people at the Mater have been absolutely fantastic." Jamie had to wear a body cast for three and a half months after the accident and spent most of that time flat on his back.  He then progressed to crutches for six weeks until he was finally able to walk unaided on 10 January. "Walking without the crutches seemed like something finally really measurable in terms of my recovery." Physio sessions with Johnston McEvoy in Limerick have been a vital part of his recovery. "Johnston uses an advanced type of acupuncture and it's very effective. "Needles get put right close up to my spine. A two and a half inch needle went in yesterday and I'm fairly incapacitated today as a result." Jamie has also travelled to receive treatment at the Polish training centre in Spala where he has trained with triple Olympic champion Robert Korzeniowski over the past five years. "I was there for over a fortnight earlier this month and underwent a fair extreme treatment called cryotherapy. "Basically, there's a small room which is cooled by liquid nitrogen to minus 160 degrees centigrade and it promotes deep healing."  Jamie heads to Poland again on Sunday where he will be having daily cryotherapy in addition to twice-daily physio sessions and pool-work. All these sessions are small steps on the way to what Jamie hopes will be a return to racing in 2006. "It's all about trying to get mobility in my back. Lying down for three and a half months didn't really help with the strength. "There's a lot of work involved in my recovery. I'm doing about six hours a day between physio and pool work. "I'm also going to the gym to lift very light weights to try and build up my muscles. I'm fairly full on with everything I do. "I'd hope to be training regularly by March. But training is just part of the process of getting back. "At the moment, every time I go and do a big bit of movement, my whole pelvic area all down my lower back just tightens up. "It's a case of waiting and seeing how it reacts. Hopefully, after four or five months my back won't tighten up as much."  *

Entailment score 0.796183168888092
Fuzz ratio 0
spaCy 0.5705528714398947
Jaccard distance 0.002457002457002457
Euclidean distance spaCy 0.743952954405338
Manhattan distance spaCy -2.4962075752019883
Chebyshev distance spaCy 0.9516753149032593
Spearman Ranking Correlation Coeeficient spaCy 0.5492176579739774
Weighted Pearson Correlation Coefficient spaCy 0.7938019818444078
**Total Custom Distance Measure Score 1.9116334915161133**
**Azure Cognitive Search Score 2.8234682**


*Candidate answer: Owen set for skipper role  Wales number eight Michael Owen says replacing Gareth Thomas as Wales' captain will be straightforward because of the leadership quality in the squad.  "You dream about playing for Wales, but it never enters your head that you will one day captain your country," he said. "It's an easy job really because there are so many experienced players in the side, but it will be a massive honour." Coach Mike Ruddock says the 24-year-old should take the armband from broken thumb victim Thomas against Scotland. Ruddock praised Owen's contribution as vice-captain and pack leader in this year's wins over England, Italy and France, notably his role in the second-half in Paris following Thomas' injury. The 22-cap Dragons man says that Thomas will be badly missed, though. "Gareth is going to be a massive loss because he is a brilliant player and he has done a fantastic job since he has been captain," said Owen. "He has put his own unique stamp on things. We are going to miss him but the strength of the squad is shown when you have someone like Kevin Morgan to step in."  Owen admitted the Grand Slam had been mentioned within the squad but that no-one was getting carried away with the three victories over England, Italy and France. "We can't get ahead of ourselves," he said. "We have a massive game next up and we are just looking forward to preparing for that. "Scotland are improving all the time. Like us they did well in France and were unlucky not to get a win. "Against Ireland they played fantastically well in the first 15 minutes before falling away. "They are probably going on a similar journey to the one we have been on in the last couple of years."  *

Entailment score 0.8652717471122742
Fuzz ratio 0
spaCy 0.5380841791349585
Jaccard distance 0.010101010101010102
Euclidean distance spaCy 0.7410224853019125
Manhattan distance spaCy -2.5730651529179887
Chebyshev distance spaCy 0.9595381164550781
Spearman Ranking Correlation Coeeficient spaCy 0.5125963621818019
Weighted Pearson Correlation Coefficient spaCy 0.8215269542644464
**Total Custom Distance Measure Score 1.8750755786895752**
**Azure Cognitive Search Score 3.8952117**


*Candidate answer: Aussies unhappy with pitch  Australian skipper Ricky Ponting was still able to raise a smile despite his side's 13-run defeat by India in the fourth Test at Mumbai.  They had already done enough to win the Border-Gavaskar Trophy with victories at Bangalore and Nagpur, their first series success in India since 1969-70. "I said I thought it would be one of the all-time great series and the first three Tests turned out that way. "This was nowhere near a Test pitch. It was terribly disappointing." He was less than thrilled, however, with a pitch in Mumbai which saw 38 wickets fall on the second and third days to bring the match to an early resolution.  Australia only needed 107 in the final innings but were bowled out for 93 in just 30.5 overs as the Indian spinners made the most of conditions. "It was definitely gettable, but I don't think we played at our best with the bat today. "Full credit to India though. They batted well in tough conditions and when the game was there to be won with the ball, they did what they had to do." Groundsman Polly Umrigar defended his pitch, saying: "My verdict is that the batsmen did not apply themselves, "The wicket was not as difficult as it is being made out to be. Batsmen willing to play shots got ample opportunities." Ponting also paid tribute to vice-captain Adam Gilchrist, who led the side in the first three Tests while he was recuperating from a broken thumb. "When you look back through the series, everyone has contributed in one way or another. That's a pleasing thing - it's what you need in a good side," he added.  "It's been a great tour. I've only been here a short time but we've certainly enjoyed ourselves - I think all of the players really enjoy touring India. "It's a place we've been to quite a lot over the last few years. Every time we come back we enjoy it more and more and it certainly helps when you win as well." Michael Clarke took 6-9 on the final day and also contributed 400 runs in his first Test series. But the Man of the Series award went to team-mate Damien Martyn, who made 444 runs at an average of 55, although he ended it with a duck. "I'd have liked 10 more runs in the second innings today," Martyn joked. "We came over here to win and fight hard. It was a bit disappointing to finish off that way but we've won [the series] and the guiys have made a fantastic effort in the last month. "We always know it's going to be hard over here. It's always a true test for a batsman."  *

Entailment score 0.957051694393158
Fuzz ratio 0
spaCy 0.5333328745669992
Jaccard distance 0.0035460992907801418
Euclidean distance spaCy 0.7369176001588964
Manhattan distance spaCy -2.6169634030759337
Chebyshev distance spaCy 0.957062726020813
Spearman Ranking Correlation Coeeficient spaCy 0.5078425315836842
Weighted Pearson Correlation Coefficient spaCy 0.8025071997095297
**Total Custom Distance Measure Score 1.8812973499298096**
**Azure Cognitive Search Score 2.9979866**


# Now we implement the OpenAI Retrieval Augmented Generation Pattern using Azure Cognitive Search standard search and evaluate

In [32]:
openai.api_key = openai_api_key
openai.api_type = openai_api_type
openai.api_base = f"https://{openai_service}.openai.azure.com"
openai.api_version = openai_api_version

def query(user_input):
   
    prompt_prefix = """<|im_start|>system
    Let's work this out it a step by step to be sure we have the right answer

    Sources:
    {sources}
    

    <|im_end|>"""

    turn_prefix = """
    <|im_start|>user
    """

    turn_suffix = """
    <|im_end|>
    <|im_start|>assistant
    """

    prompt_history = turn_prefix

    history = []

    summary_prompt_template = """Below is a summary of the conversation so far, and a new question asked by the user that needs to be answered by searching in a knowledge base. Generate a search query based on the conversation and the new question. Source names are not good search terms to include in the search query.

    Summary:
    {summary}

    Question:
    {question}

    Search query:
    """

    content = ""

    # Exclude category, to simulate scenarios where there's a set of docs you can't see
    exclude_category = None

    if len(history) > 0:
        completion = openai.Completion.create(
            engine=openai_gpt_deployment,
            prompt=summary_prompt_template.format(summary="\n".join(history), question=user_input),
            temperature=0.9,
            max_tokens=320,
            stop=["\n"])
        search = completion.choices[0].text
        print(f"search {search}")
    else:
        search = user_input

        # Alternatively simply use search_client.search(q, top=3) if not using semantic search
        print("Searching:", search)
        print("-------------------")
        filter = "category ne '{}'".format(exclude_category.replace("'", "''")) if exclude_category else None
        r = search_client.search(search,
                                 filter=filter,
                                 top=5)

        content = [doc for doc in r]

    prompt = prompt_prefix.format(sources=content) + prompt_history + user_input + turn_suffix

    completion = openai.Completion.create(
        engine=openai_chatgpt_deployment,
        prompt=prompt,
        temperature=0.7,
        max_tokens=1024,
        stop=["<|im_end|>", "<|im_start|>"])

    prompt_history += user_input + turn_suffix + completion.choices[0].text + "\n<|im_end|>" + turn_prefix
    history.append("user: " + user_input)
    history.append("assistant: " + completion.choices[0].text)

    print("\n-------------------\n".join(history))
    print("\n-------------------\nPrompt:\n" + prompt)

    # Now let's evaluate the results

    scores = {}


    for i, doc in enumerate(content):

        try:

            hyp = doc["content"].replace("\n", " ")
            display(Markdown(f'*Candidate answer: {hyp}*'))
            scores[i] = 0

            # roBERTA Entailment
            device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
            # run through model pre-trained on MNLI
            x = tokenizer.encode(term_to_search_for, hyp, return_tensors='pt',
                                    truncation_strategy='only_first')
            logits = nli_model(x.to(device))[0]

            # we throw away "neutral" (dim 1) and take the probability of
            # "entailment" (2) as the probability of the label being true
            entail_contradiction_logits = logits[:, [0, 2]]
            probs = entail_contradiction_logits.softmax(dim=1)
            prob_label_is_true = probs[:, 1]
            print(f"Entailment score {prob_label_is_true[0]}")
            scores[i] += prob_label_is_true[0]

            # Fuzz ratio (Levenschein Distance)
            score = fuzz.ratio(term_to_search_for, hyp)
            print(f"Fuzz ratio {score}")
            scores[i] += score

            # spaCy cosine similarity
            doc1 = nlp(term_to_search_for)
            doc2 = nlp(hyp)
            score = doc1.similarity(doc2)
            print(f"spaCy {score}")
            scores[i] += score

            # Jaccard Distance
            score = get_jaccard_sim(term_to_search_for, hyp)
            print(f"Jaccard distance {score}")
            scores[i] += score

            # Euclidean Distance
            score = (100 - euclidean(nlp(term_to_search_for).vector, nlp(hyp).vector)) / 100
            print(f"Euclidean distance spaCy {score}")
            scores[i] += score

            # Manhattan Distance
            score = (100 - manhattan(nlp(term_to_search_for).vector, nlp(hyp).vector)) / 100
            print(f"Manhattan distance spaCy {score}")
            scores[i] += score

            # Chebysev Distance
            score = (100 - chebyshev(nlp(term_to_search_for).vector, nlp(hyp).vector)) / 100
            print(f"Chebyshev distance spaCy {score}")
            scores[i] += score

            # Spearman Ranking Correlation Coeeficient spaCy
            score, _ = spearmanr(nlp(term_to_search_for).vector, nlp(hyp).vector)
            print(f"Spearman Ranking Correlation Coeeficient spaCy {score}")
            scores[i] += score

            # Weighted Pearson Correlation Coefficient spaCy
            weights = nlp(hyp).vector.shape[0]
            score = corr(nlp(term_to_search_for).vector, nlp(hyp).vector, weights)
            print(f"Weighted Pearson Correlation Coefficient spaCy {score}")
            scores[i] += score

            print(f'**Total Custom Distance Measure Score {scores[i]}**')
            print(f'**Azure Cognitive Search Score {doc["@search.score"]}**')

        except Exception as swallowed:
            print(f"An error occured on record {i} {swallowed}")
            continue


StatementMeta(DataDiscovery, 29, 11, Finished, Available)

## Run the actual query

In [33]:
query("Whose thumb was fractured?")

StatementMeta(DataDiscovery, 29, 12, Finished, Available)

Searching: Whose thumb was fractured?
-------------------
user: Whose thumb was fractured?
-------------------
assistant:  Gareth Thomas' thumb was fractured.

-------------------
Prompt:
<|im_start|>system
    Let's work this out it a step by step to be sure we have the right answer

    Sources:
    [{'content': 'Thomas out of Six Nations\n\nWales captain Gareth Thomas has been ruled out of the rest of the Six Nations with a broken thumb.\n\nThe full-back will have surgery on Monday after fracturing his thumb in the 24-18 win over France on Saturday. But Welsh legend Phil Bennett insisted Wales can cope without Thomas as they chase a first Grand Slam in 27 years. Bennett told BBC Sport: "Such is the spirit in the camp, they\'ll put Kevin Morgan at 15, Rhys Williams at wing and just carry on." Thomas will miss the match against Scotland on 13 March, and what promises to be a huge encounter against the Irish six days later. Bennett added: "It\'s a setback. He\'s a great captain, he lea

*Candidate answer: Thomas out of Six Nations  Wales captain Gareth Thomas has been ruled out of the rest of the Six Nations with a broken thumb.  The full-back will have surgery on Monday after fracturing his thumb in the 24-18 win over France on Saturday. But Welsh legend Phil Bennett insisted Wales can cope without Thomas as they chase a first Grand Slam in 27 years. Bennett told BBC Sport: "Such is the spirit in the camp, they'll put Kevin Morgan at 15, Rhys Williams at wing and just carry on." Thomas will miss the match against Scotland on 13 March, and what promises to be a huge encounter against the Irish six days later. Bennett added: "It's a setback. He's a great captain, he leads from the front and the boys love him." Thomas was replaced at half-time by Williams as his side turned around a 15-6 deficit in Paris.  "With Gareth missing I would think Michael Owen will be our captain," said Wales coach Mike Ruddock. "He did a great job in the second half in France. He has been vice-captain all along throughout the championship." Wales travel to Edinburgh to take on Scotland in a fortnight and then host Ireland in Cardiff in the final round of matches in what could be the Grand Slam and championship decider. Bennett, an inspirational fly-half for Llanelli and Wales in the 1970s, insisted the national team were entering a new golden period. "It was a great game and a magnificent result for Wales," Bennett told BBC Radio Five Live's Sportsweek programme.  "The way this young team are blending, the glory days are on their way back. "We couldn't get possession early on and France dominated and scored two tries. "Had they been ruthless, Wales could have gone into the interval 30 points down. But they didn't take their chances. "Wales defended fairly well but you cannot give that sort of quality ball to good sides. "The All Blacks would have been ruthless and buried us in the first half. But the character we showed in the second half was quite outstanding."  *



Entailment score 0.9160066246986389
Fuzz ratio 0
spaCy 0.4943371490776689
Jaccard distance 0.008771929824561403
Euclidean distance spaCy 0.7367202900069381
Manhattan distance spaCy -2.6262820645794274
Chebyshev distance spaCy 0.958489465713501
Spearman Ranking Correlation Coeeficient spaCy 0.4607033411482349
Weighted Pearson Correlation Coefficient spaCy 0.8178629731922643
**Total Custom Distance Measure Score 1.7666096687316895**
**Azure Cognitive Search Score 5.5757585**


*Candidate answer: Kenyon denies Robben Barca return  Chelsea chief executive Peter Kenyon has played down reports that Arjen Robben will return for the Champions League match against Barcelona.  "He's been responding well to treatment and started running on Friday, but we'll have to wait and see," he told BBC Five Live's Sportsweek. "We're looking to getting him back as soon as possible, but he'll be back when it's right for him and for us. "There's no plans at the moment around the Barcelona game." His comments contradict those of chiropractor Jean Pierre Meersseman who treated the Dutchman after he fractured his foot at the start of February. Robben had been expected to be out for six weeks, but Meersseman hinted that the winger could be fit for the vital Stamford Bridge game on 8 March. "I hope he can be back and I will try to help him make that happen," Meersseman told the Mail on Sunday. "I put everything right with Arjen's foot the last time I saw him 12 days ago. It was an obvious correction and easy to perform. "I know he was pleased with what I did and now that he is running again. I am due to see him one more time again in the next few days." Meersseman is the medical co-ordinator at Italian side AC Milan.  *

Entailment score 0.02319514751434326
Fuzz ratio 0
spaCy 0.5441085140968871
Jaccard distance 0.006535947712418301
Euclidean distance spaCy 0.7441788759878958
Manhattan distance spaCy -2.579768744409084
Chebyshev distance spaCy 0.959799485206604
Spearman Ranking Correlation Coeeficient spaCy 0.525475171946355
Weighted Pearson Correlation Coefficient spaCy 0.7852365235676927
**Total Custom Distance Measure Score 1.008760929107666**
**Azure Cognitive Search Score 4.4981585**


*Candidate answer: Vickery upbeat about arm injury  England prop Phil Vickery is staying positive despite a broken arm ruling him out of the RBS Six Nations.  The 28-year-old fractured the radius in his right forearm during Gloucester's 17-16 win over Bath on Saturday. He will undergo an operation on Monday and is expected to be out for at least six weeks. He said: "This isn't an injury that will stop me from working hard on the fitness elements and being around the lads." He added: "I've got the operation this afternoon and I could be back doing fitness work after a week." "As frustrating as it is, I've got to be positive."  After the game, Vickery spoke with Bath prop David Barnes, who also broke his arm recently. "I had a chat with David Barnes and it looks like a similar injury to him," he said. "He said he had the operation and he was back running after a week. "There's no doubt that I'm going to get involved and be around this place as soon as I can after the operation." Gloucester director of rugby Nigel Melville said: "Phil has broken his radius, which is the large bone in his forearm. "I don't really know how it happened, but Phil will definitely be out of action for at least six weeks. "I feel very sorry for him, as he has been in great shape. He really needed 80 minutes of rugby this weekend, and then this happened. Mentally, it must be very hard for him."  *

Entailment score 0.7979953289031982
Fuzz ratio 0
spaCy 0.5652431677987757
Jaccard distance 0.006211180124223602
Euclidean distance spaCy 0.7491913911921283
Manhattan distance spaCy -2.51705733448267
Chebyshev distance spaCy 0.9626625299453735
Spearman Ranking Correlation Coeeficient spaCy 0.541580239780442
Weighted Pearson Correlation Coefficient spaCy 0.7972035895995842
**Total Custom Distance Measure Score 1.9030301570892334**
**Azure Cognitive Search Score 4.2438555**


*Candidate answer: Cole refuses to blame van Persie  Ashley Cole has refused to blame Robin van Persie for leaving Arsenal with no fully-fit strikers for the FA Cup fifth round replay at Sheffield United.  Van Persie is suspended alongside Dennis Bergkamp and Jose Antonio Reyes after being sent off at Southampton when Arsenal had a numerical advantage. Thierry Henry is ruled out with an Achilles tendon injury but Cole said: "No-one is putting the blame on Robin. "It's just something that happens on the spur of the moment." Cole added: "I've done it before and I hope they didn't blame me for anything. "Of course he'll learn. I've been sent off a couple of times now and it's just one of those things when you go a bit crazy for one or two seconds. Freddie Ljungberg is likely to be used in an emergency striking role and will be partnered by either Arturo Lupoli, Quincy Owusu-Abeyie or Jeremie Aliadiere. Gunners boss Arsene Wenger said: "Freddie is an option but we need a second striker. "I have to decide whether it will be Aliadiere, Quincy or Lupoli who will start with him up front. Those three will be involved." Arsenal are also without winger Robert Pires, who sustained an ankle injury at St Mary's. Wenger added: "It doesn't look like anything is fractured, but it is a good ankle sprain. "It does not look like Pires will be ready for two to three weeks."  *

Entailment score 0.286815345287323
Fuzz ratio 0
spaCy 0.5782458348190921
Jaccard distance 0.0
Euclidean distance spaCy 0.7683413514889219
Manhattan distance spaCy -2.2697082729265095
Chebyshev distance spaCy 0.9633776330947876
Spearman Ranking Correlation Coeeficient spaCy 0.5522430249224991
Weighted Pearson Correlation Coefficient spaCy 0.8017443486362155
**Total Custom Distance Measure Score 1.68105947971344**
**Azure Cognitive Search Score 4.167914**


*Candidate answer: Wales get Williams fitness boost  Wales are hopeful that openside flanker Martyn Williams could be fit for Saturday's RBS 6 Nations championship opener against England in Cardiff.  Williams was expected to miss the match with a disc problem in his neck, but has been making a speedy recovery. "He will have tests in the next 48 hours and we are pretty optimistic he is getting there," Wales' team physiotherapist Mark Davies said. "It has been frustrating but he is on the mend, he has made good progress." Last week Williams, along with fellow flanker Colin Charvis - who is unlikely to play for at least a month while he recovers from a foot injury - was all but ruled out of the Millennium Stadium clash. With Williams initially thought to be struggling, the signs pointed towards Wales coach Mike Ruddock handing a first cap to former Wales Under-21 skipper Richie Pugh.  Cardiff Blues flanker Williams, 29, offers considerable experience and if he is declared fit then Ruddock might be tempted to include him in the back row. Charvis will be reviewed by the Wales medical staff next Monday, but Davies admitted that there was only an "outside chance" of him being fit to face France in Wales' third championship game on 26 February. Wales' other injury concern is Pugh's fellow Neath-Swansea Ospreys player Sonny Parker, as the centre has a trapped nerve in his neck. "Sonny's injury is still an issue," Davies said. "It is still painful and irritable. We will run the rule of thumb over him in the next couple of days." Ruddock will name his starting line-up for the England game at 1830 GMT on Tuesday evening, as Wales target their first victory in Cardiff over the world champions since 1993.  *

Entailment score 0.522243857383728
Fuzz ratio 0
spaCy 0.5226353966638094
Jaccard distance 0.010582010582010581
Euclidean distance spaCy 0.7444192930929813
Manhattan distance spaCy -2.5673284966498615
Chebyshev distance spaCy 0.9611926651000977
Spearman Ranking Correlation Coeeficient spaCy 0.48937921532461465
Weighted Pearson Correlation Coefficient spaCy 0.62481859178453
**Total Custom Distance Measure Score 1.307942509651184**
**Azure Cognitive Search Score 4.147021**


## Now we implement the OpenAI Retrieval Augmented Generation Pattern using Azure Cognitive Search Semantic search and evaluate

In [35]:
openai.api_key = openai_api_key
openai.api_type = openai_api_type
openai.api_base = f"https://{openai_service}.openai.azure.com"
openai.api_version = openai_api_version

def query_semantic(user_input):
   
    prompt_prefix = """<|im_start|>system
    Let's work this out it a step by step to be sure we have the right answer

    Sources:
    {sources}
    

    <|im_end|>"""

    turn_prefix = """
    <|im_start|>user
    """

    turn_suffix = """
    <|im_end|>
    <|im_start|>assistant
    """

    prompt_history = turn_prefix

    history = []

    summary_prompt_template = """Below is a summary of the conversation so far, and a new question asked by the user that needs to be answered by searching in a knowledge base. Generate a search query based on the conversation and the new question. Source names are not good search terms to include in the search query.

    Summary:
    {summary}

    Question:
    {question}

    Search query:
    """

    content = ""

    # Exclude category, to simulate scenarios where there's a set of docs you can't see
    exclude_category = None

    if len(history) > 0:
        completion = openai.Completion.create(
            engine=openai_gpt_deployment,
            prompt=summary_prompt_template.format(summary="\n".join(history), question=user_input),
            temperature=0.9,
            max_tokens=320,
            stop=["\n"])
        search = completion.choices[0].text
        print(f"search {search}")
    else:
        search = user_input

        # Alternatively simply use search_client.search(q, top=3) if not using semantic search
        print("Searching:", search)
        print("-------------------")
        filter = "category ne '{}'".format(exclude_category.replace("'", "''")) if exclude_category else None
        r = search_client.search(search,
                                 filter=filter,
                                 query_type="semantic",
                                 query_language="en-us",
                                 query_speller="lexicon",
                                 semantic_configuration_name=semantic_configuration_name,
                                 top=5)

        content = [doc for doc in r]

    prompt = prompt_prefix.format(sources=content) + prompt_history + user_input + turn_suffix

    completion = openai.Completion.create(
        engine=openai_chatgpt_deployment,
        prompt=prompt,
        temperature=0.7,
        max_tokens=1024,
        stop=["<|im_end|>", "<|im_start|>"])

    prompt_history += user_input + turn_suffix + completion.choices[0].text + "\n<|im_end|>" + turn_prefix
    history.append("user: " + user_input)
    history.append("assistant: " + completion.choices[0].text)

    print("\n-------------------\n".join(history))
    print("\n-------------------\nPrompt:\n" + prompt)

    # Now let's evaluate the results

    scores = {}


    for i, doc in enumerate(content):

        try:

            hyp = doc["content"].replace("\n", " ")
            display(Markdown(f'*Candidate answer: {hyp}*'))
            scores[i] = 0

            # roBERTA Entailment
            device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
            # run through model pre-trained on MNLI
            x = tokenizer.encode(term_to_search_for, hyp, return_tensors='pt',
                                    truncation_strategy='only_first')
            logits = nli_model(x.to(device))[0]

            # we throw away "neutral" (dim 1) and take the probability of
            # "entailment" (2) as the probability of the label being true
            entail_contradiction_logits = logits[:, [0, 2]]
            probs = entail_contradiction_logits.softmax(dim=1)
            prob_label_is_true = probs[:, 1]
            print(f"Entailment score {prob_label_is_true[0]}")
            scores[i] += prob_label_is_true[0]

            # Fuzz ratio (Levenschein Distance)
            score = fuzz.ratio(term_to_search_for, hyp)
            print(f"Fuzz ratio {score}")
            scores[i] += score

            # spaCy cosine similarity
            doc1 = nlp(term_to_search_for)
            doc2 = nlp(hyp)
            score = doc1.similarity(doc2)
            print(f"spaCy {score}")
            scores[i] += score

            # Jaccard Distance
            score = get_jaccard_sim(term_to_search_for, hyp)
            print(f"Jaccard distance {score}")
            scores[i] += score

            # Euclidean Distance
            score = (100 - euclidean(nlp(term_to_search_for).vector, nlp(hyp).vector)) / 100
            print(f"Euclidean distance spaCy {score}")
            scores[i] += score

            # Manhattan Distance
            score = (100 - manhattan(nlp(term_to_search_for).vector, nlp(hyp).vector)) / 100
            print(f"Manhattan distance spaCy {score}")
            scores[i] += score

            # Chebysev Distance
            score = (100 - chebyshev(nlp(term_to_search_for).vector, nlp(hyp).vector)) / 100
            print(f"Chebyshev distance spaCy {score}")
            scores[i] += score

            # Spearman Ranking Correlation Coeeficient spaCy
            score, _ = spearmanr(nlp(term_to_search_for).vector, nlp(hyp).vector)
            print(f"Spearman Ranking Correlation Coeeficient spaCy {score}")
            scores[i] += score

            # Weighted Pearson Correlation Coefficient spaCy
            weights = nlp(hyp).vector.shape[0]
            score = corr(nlp(term_to_search_for).vector, nlp(hyp).vector, weights)
            print(f"Weighted Pearson Correlation Coefficient spaCy {score}")
            scores[i] += score

            print(f'**Total Custom Distance Measure Score {scores[i]}**')
            print(f'**Azure Cognitive Search Score {doc["@search.score"]}**')

        except Exception as swallowed:
            print(f"An error occured on record {i} {swallowed}")
            continue


StatementMeta(DataDiscovery, 29, 14, Finished, Available)

## Run the actual query

In [36]:
query_semantic("Whose thumb was fractured?")

StatementMeta(DataDiscovery, 29, 15, Finished, Available)

Searching: Whose thumb was fractured?
-------------------
user: Whose thumb was fractured?
-------------------
assistant:  Gareth Thomas' thumb was fractured.

-------------------
Prompt:
<|im_start|>system
    Let's work this out it a step by step to be sure we have the right answer

    Sources:
    [{'content': 'Thomas out of Six Nations\n\nWales captain Gareth Thomas has been ruled out of the rest of the Six Nations with a broken thumb.\n\nThe full-back will have surgery on Monday after fracturing his thumb in the 24-18 win over France on Saturday. But Welsh legend Phil Bennett insisted Wales can cope without Thomas as they chase a first Grand Slam in 27 years. Bennett told BBC Sport: "Such is the spirit in the camp, they\'ll put Kevin Morgan at 15, Rhys Williams at wing and just carry on." Thomas will miss the match against Scotland on 13 March, and what promises to be a huge encounter against the Irish six days later. Bennett added: "It\'s a setback. He\'s a great captain, he lea

*Candidate answer: Thomas out of Six Nations  Wales captain Gareth Thomas has been ruled out of the rest of the Six Nations with a broken thumb.  The full-back will have surgery on Monday after fracturing his thumb in the 24-18 win over France on Saturday. But Welsh legend Phil Bennett insisted Wales can cope without Thomas as they chase a first Grand Slam in 27 years. Bennett told BBC Sport: "Such is the spirit in the camp, they'll put Kevin Morgan at 15, Rhys Williams at wing and just carry on." Thomas will miss the match against Scotland on 13 March, and what promises to be a huge encounter against the Irish six days later. Bennett added: "It's a setback. He's a great captain, he leads from the front and the boys love him." Thomas was replaced at half-time by Williams as his side turned around a 15-6 deficit in Paris.  "With Gareth missing I would think Michael Owen will be our captain," said Wales coach Mike Ruddock. "He did a great job in the second half in France. He has been vice-captain all along throughout the championship." Wales travel to Edinburgh to take on Scotland in a fortnight and then host Ireland in Cardiff in the final round of matches in what could be the Grand Slam and championship decider. Bennett, an inspirational fly-half for Llanelli and Wales in the 1970s, insisted the national team were entering a new golden period. "It was a great game and a magnificent result for Wales," Bennett told BBC Radio Five Live's Sportsweek programme.  "The way this young team are blending, the glory days are on their way back. "We couldn't get possession early on and France dominated and scored two tries. "Had they been ruthless, Wales could have gone into the interval 30 points down. But they didn't take their chances. "Wales defended fairly well but you cannot give that sort of quality ball to good sides. "The All Blacks would have been ruthless and buried us in the first half. But the character we showed in the second half was quite outstanding."  *



Entailment score 0.9160066246986389
Fuzz ratio 0
spaCy 0.4943371490776689
Jaccard distance 0.008771929824561403
Euclidean distance spaCy 0.7367202900069381
Manhattan distance spaCy -2.6262820645794274
Chebyshev distance spaCy 0.958489465713501
Spearman Ranking Correlation Coeeficient spaCy 0.4607033411482349
Weighted Pearson Correlation Coefficient spaCy 0.8178629731922643
**Total Custom Distance Measure Score 1.7666096687316895**
**Azure Cognitive Search Score 5.5757585**


*Candidate answer: Vickery upbeat about arm injury  England prop Phil Vickery is staying positive despite a broken arm ruling him out of the RBS Six Nations.  The 28-year-old fractured the radius in his right forearm during Gloucester's 17-16 win over Bath on Saturday. He will undergo an operation on Monday and is expected to be out for at least six weeks. He said: "This isn't an injury that will stop me from working hard on the fitness elements and being around the lads." He added: "I've got the operation this afternoon and I could be back doing fitness work after a week." "As frustrating as it is, I've got to be positive."  After the game, Vickery spoke with Bath prop David Barnes, who also broke his arm recently. "I had a chat with David Barnes and it looks like a similar injury to him," he said. "He said he had the operation and he was back running after a week. "There's no doubt that I'm going to get involved and be around this place as soon as I can after the operation." Gloucester director of rugby Nigel Melville said: "Phil has broken his radius, which is the large bone in his forearm. "I don't really know how it happened, but Phil will definitely be out of action for at least six weeks. "I feel very sorry for him, as he has been in great shape. He really needed 80 minutes of rugby this weekend, and then this happened. Mentally, it must be very hard for him."  *

Entailment score 0.7979953289031982
Fuzz ratio 0
spaCy 0.5652431677987757
Jaccard distance 0.006211180124223602
Euclidean distance spaCy 0.7491913911921283
Manhattan distance spaCy -2.51705733448267
Chebyshev distance spaCy 0.9626625299453735
Spearman Ranking Correlation Coeeficient spaCy 0.541580239780442
Weighted Pearson Correlation Coefficient spaCy 0.7972035895995842
**Total Custom Distance Measure Score 1.9030301570892334**
**Azure Cognitive Search Score 4.2438555**


*Candidate answer: Costin aims for comeback in 2006  Jamie Costin should be paralysed.  He says so himself in a matter-of-fact way as he recalls the car accident which occurred nine days before he was scheduled to step out into the Olympic Stadium in Athens for the 50K Walk. There is an ironic chuckle as he talks of his immediate thoughts after a lorry, driving on the wrong side of the road, had ploughed into his rental car. "I was in a lot of pain and I guessed that one of my toes was broken," says the Waterford man. "But I was thinking maybe with a cortisone injection you never know. "In my back, it felt as though all the muscles had been ripped off my pelvis but I was thinking maybe we could do something with laser therapy and ultra sound and hopefully I'd be able to race." It took over 10 hours before Jamie knew with certainty that he would not be competing in his second Olympics. "My back had been broken in two places and with one of my vertebrae, the bottom part had exploded so I'm fierce lucky not be paralysed. "I'd fractured my big toe as well which was on the brake." Jamie didn't finally arrive at hospital in Athens until some nine and a half hours after the accident.  "For the first nine hours, I had no pain killers which was ridiculous in 35 degrees heat. "But once I got the scans and saw them it was a case of moving on and thinking:'OK, I've got a different set of circumstances now'." Within three days he was arriving back in Ireland by air ambulance. Doctors in Athens had wanted to operate on Jamie's back immediately but he insisted on delaying any surgery until he arrived back home - something he is now very relieved about. "The Greek doctors were going to put three or four inch titanium rods either side of my spinal cord up through my vertebrae. "That would have fused all my lower back and I would never have been able to race again. They were really putting a lot of pressure on me to agree to the surgery. "But when I got to the Mater in Dublin they said it was possible for it to heal totally naturally which is giving me the chance to get back into competition which is very important to me. The people at the Mater have been absolutely fantastic." Jamie had to wear a body cast for three and a half months after the accident and spent most of that time flat on his back.  He then progressed to crutches for six weeks until he was finally able to walk unaided on 10 January. "Walking without the crutches seemed like something finally really measurable in terms of my recovery." Physio sessions with Johnston McEvoy in Limerick have been a vital part of his recovery. "Johnston uses an advanced type of acupuncture and it's very effective. "Needles get put right close up to my spine. A two and a half inch needle went in yesterday and I'm fairly incapacitated today as a result." Jamie has also travelled to receive treatment at the Polish training centre in Spala where he has trained with triple Olympic champion Robert Korzeniowski over the past five years. "I was there for over a fortnight earlier this month and underwent a fair extreme treatment called cryotherapy. "Basically, there's a small room which is cooled by liquid nitrogen to minus 160 degrees centigrade and it promotes deep healing."  Jamie heads to Poland again on Sunday where he will be having daily cryotherapy in addition to twice-daily physio sessions and pool-work. All these sessions are small steps on the way to what Jamie hopes will be a return to racing in 2006. "It's all about trying to get mobility in my back. Lying down for three and a half months didn't really help with the strength. "There's a lot of work involved in my recovery. I'm doing about six hours a day between physio and pool work. "I'm also going to the gym to lift very light weights to try and build up my muscles. I'm fairly full on with everything I do. "I'd hope to be training regularly by March. But training is just part of the process of getting back. "At the moment, every time I go and do a big bit of movement, my whole pelvic area all down my lower back just tightens up. "It's a case of waiting and seeing how it reacts. Hopefully, after four or five months my back won't tighten up as much."  *

Entailment score 0.796183168888092
Fuzz ratio 0
spaCy 0.5705528714398947
Jaccard distance 0.002457002457002457
Euclidean distance spaCy 0.743952954405338
Manhattan distance spaCy -2.4962075752019883
Chebyshev distance spaCy 0.9516753149032593
Spearman Ranking Correlation Coeeficient spaCy 0.5492176579739774
Weighted Pearson Correlation Coefficient spaCy 0.7938019818444078
**Total Custom Distance Measure Score 1.9116334915161133**
**Azure Cognitive Search Score 2.8234682**


*Candidate answer: Owen set for skipper role  Wales number eight Michael Owen says replacing Gareth Thomas as Wales' captain will be straightforward because of the leadership quality in the squad.  "You dream about playing for Wales, but it never enters your head that you will one day captain your country," he said. "It's an easy job really because there are so many experienced players in the side, but it will be a massive honour." Coach Mike Ruddock says the 24-year-old should take the armband from broken thumb victim Thomas against Scotland. Ruddock praised Owen's contribution as vice-captain and pack leader in this year's wins over England, Italy and France, notably his role in the second-half in Paris following Thomas' injury. The 22-cap Dragons man says that Thomas will be badly missed, though. "Gareth is going to be a massive loss because he is a brilliant player and he has done a fantastic job since he has been captain," said Owen. "He has put his own unique stamp on things. We are going to miss him but the strength of the squad is shown when you have someone like Kevin Morgan to step in."  Owen admitted the Grand Slam had been mentioned within the squad but that no-one was getting carried away with the three victories over England, Italy and France. "We can't get ahead of ourselves," he said. "We have a massive game next up and we are just looking forward to preparing for that. "Scotland are improving all the time. Like us they did well in France and were unlucky not to get a win. "Against Ireland they played fantastically well in the first 15 minutes before falling away. "They are probably going on a similar journey to the one we have been on in the last couple of years."  *

Entailment score 0.8652717471122742
Fuzz ratio 0
spaCy 0.5380841791349585
Jaccard distance 0.010101010101010102
Euclidean distance spaCy 0.7410224853019125
Manhattan distance spaCy -2.5730651529179887
Chebyshev distance spaCy 0.9595381164550781
Spearman Ranking Correlation Coeeficient spaCy 0.5125963621818019
Weighted Pearson Correlation Coefficient spaCy 0.8215269542644464
**Total Custom Distance Measure Score 1.8750755786895752**
**Azure Cognitive Search Score 3.8952117**


*Candidate answer: Aussies unhappy with pitch  Australian skipper Ricky Ponting was still able to raise a smile despite his side's 13-run defeat by India in the fourth Test at Mumbai.  They had already done enough to win the Border-Gavaskar Trophy with victories at Bangalore and Nagpur, their first series success in India since 1969-70. "I said I thought it would be one of the all-time great series and the first three Tests turned out that way. "This was nowhere near a Test pitch. It was terribly disappointing." He was less than thrilled, however, with a pitch in Mumbai which saw 38 wickets fall on the second and third days to bring the match to an early resolution.  Australia only needed 107 in the final innings but were bowled out for 93 in just 30.5 overs as the Indian spinners made the most of conditions. "It was definitely gettable, but I don't think we played at our best with the bat today. "Full credit to India though. They batted well in tough conditions and when the game was there to be won with the ball, they did what they had to do." Groundsman Polly Umrigar defended his pitch, saying: "My verdict is that the batsmen did not apply themselves, "The wicket was not as difficult as it is being made out to be. Batsmen willing to play shots got ample opportunities." Ponting also paid tribute to vice-captain Adam Gilchrist, who led the side in the first three Tests while he was recuperating from a broken thumb. "When you look back through the series, everyone has contributed in one way or another. That's a pleasing thing - it's what you need in a good side," he added.  "It's been a great tour. I've only been here a short time but we've certainly enjoyed ourselves - I think all of the players really enjoy touring India. "It's a place we've been to quite a lot over the last few years. Every time we come back we enjoy it more and more and it certainly helps when you win as well." Michael Clarke took 6-9 on the final day and also contributed 400 runs in his first Test series. But the Man of the Series award went to team-mate Damien Martyn, who made 444 runs at an average of 55, although he ended it with a duck. "I'd have liked 10 more runs in the second innings today," Martyn joked. "We came over here to win and fight hard. It was a bit disappointing to finish off that way but we've won [the series] and the guiys have made a fantastic effort in the last month. "We always know it's going to be hard over here. It's always a true test for a batsman."  *

Entailment score 0.957051694393158
Fuzz ratio 0
spaCy 0.5333328745669992
Jaccard distance 0.0035460992907801418
Euclidean distance spaCy 0.7369176001588964
Manhattan distance spaCy -2.6169634030759337
Chebyshev distance spaCy 0.957062726020813
Spearman Ranking Correlation Coeeficient spaCy 0.5078425315836842
Weighted Pearson Correlation Coefficient spaCy 0.8025071997095297
**Total Custom Distance Measure Score 1.8812973499298096**
**Azure Cognitive Search Score 2.9979866**


## Now we implement the OpenAI Retrieval Augmented Generation Pattern using simple SQL Synapse/Trident queries and evaluate

In [53]:
openai.api_key = openai_api_key
openai.api_type = openai_api_type
openai.api_base = f"https://{openai_service}.openai.azure.com"
openai.api_version = openai_api_version


def query_sql(user_input, sql_query):
    # Use a breakpoint in the code line below to debug your script.
    # ChatGPT uses a particular set of tokens to indicate turns in conversations
    prompt_prefix = """<|im_start|>system
    An Assistant that helps the people with their questions regarding sports articles.
    Answer ONLY with the facts listed in the list of sources below. If there isn't enough information below, say you don't know. Do not generate answers that don't use the sources below. If asking a clarifying question to the user would help, ask the question. 
    Each source has a name followed by colon and the actual information, always include the source name for each fact you use in the response. Use square brakets to reference the source, e.g. [info1.txt]. Don't combine sources, list each source separately, e.g. [id][type].

    Sources:
    {sources}

    <|im_end|>"""


    turn_prefix = """
    <|im_start|>user
    """

    turn_suffix = """
    <|im_end|>
    <|im_start|>assistant
    """

    prompt_history = turn_prefix

    history = []

    summary_prompt_template = """Below is a summary of the conversation so far, and a new question asked by the user that needs to be answered by searching in a knowledge base. Generate a search query based on the conversation and the new question. Source names are not good search terms to include in the search query.

    Summary:
    {summary}

    Question:
    {question}

    Search query:
    """

    # Execute this cell multiple times updating user_input to accumulate chat history

    content = ""

    # Exclude category, to simulate scenarios where there's a set of docs you can't see
    exclude_category = None

    if len(history) > 0:
        print(f"search {search}")
    else:
        search = user_input


        print("SQL Searching:", search)
        print("-------------------")
        df = spark.sql(sql_query)
        dfp = df.toPandas()
        lst_query = dfp['text'].to_list()

        content = "\n".join(lst_query)

    prompt = prompt_prefix.format(sources=content) + prompt_history + user_input + turn_suffix

    completion = openai.Completion.create(
        engine=openai_chatgpt_deployment,
        prompt=prompt,
        temperature=0.7,
        max_tokens=1024,
        stop=["<|im_end|>", "<|im_start|>"])

    prompt_history += user_input + turn_suffix + completion.choices[0].text + "\n<|im_end|>" + turn_prefix
    history.append("user: " + user_input)
    history.append("assistant: " + completion.choices[0].text)

    print("\n-------------------\n".join(history))
    print("\n-------------------\nPrompt:\n" + prompt)

    # Now let's evaluate the results
    scores = {}

    for i, doc in enumerate(lst_query):

        try:

            hyp = doc.replace("\n", " ")
            display(Markdown(f'*Candidate answer: {hyp}*'))
            scores[i] = 0

            # roBERTA Entailment
            device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
            # run through model pre-trained on MNLI
            x = tokenizer.encode(term_to_search_for, hyp, return_tensors='pt',
                                    truncation_strategy='only_first')
            logits = nli_model(x.to(device))[0]

            # we throw away "neutral" (dim 1) and take the probability of
            # "entailment" (2) as the probability of the label being true
            entail_contradiction_logits = logits[:, [0, 2]]
            probs = entail_contradiction_logits.softmax(dim=1)
            prob_label_is_true = probs[:, 1]
            print(f"Entailment score {prob_label_is_true[0]}")
            scores[i] += prob_label_is_true[0]

            # Fuzz ratio (Levenschein Distance)
            score = fuzz.ratio(term_to_search_for, hyp)
            print(f"Fuzz ratio {score}")
            scores[i] += score

            # spaCy cosine similarity
            doc1 = nlp(term_to_search_for)
            doc2 = nlp(hyp)
            score = doc1.similarity(doc2)
            print(f"spaCy {score}")
            scores[i] += score

            # Jaccard Distance
            score = get_jaccard_sim(term_to_search_for, hyp)
            print(f"Jaccard distance {score}")
            scores[i] += score

            # Euclidean Distance
            score = (100 - euclidean(nlp(term_to_search_for).vector, nlp(hyp).vector)) / 100
            print(f"Euclidean distance spaCy {score}")
            scores[i] += score

            # Manhattan Distance
            score = (100 - manhattan(nlp(term_to_search_for).vector, nlp(hyp).vector)) / 100
            print(f"Manhattan distance spaCy {score}")
            scores[i] += score

            # Chebysev Distance
            score = (100 - chebyshev(nlp(term_to_search_for).vector, nlp(hyp).vector)) / 100
            print(f"Chebyshev distance spaCy {score}")
            scores[i] += score

            # Spearman Ranking Correlation Coeeficient spaCy
            score, _ = spearmanr(nlp(term_to_search_for).vector, nlp(hyp).vector)
            print(f"Spearman Ranking Correlation Coeeficient spaCy {score}")
            scores[i] += score

            # Weighted Pearson Correlation Coefficient spaCy
            weights = nlp(hyp).vector.shape[0]
            score = corr(nlp(term_to_search_for).vector, nlp(hyp).vector, weights)
            print(f"Weighted Pearson Correlation Coefficient spaCy {score}")
            scores[i] += score

            print(f'**Total Custom Distance Measure Score {scores[i]}**')
            
        except Exception as swallowed:
            print(f"An error occured on record {i} {swallowed}")
            continue



StatementMeta(DataDiscovery, 30, 9, Finished, Available)

## Now let's go and run a query via a simple SQL statement against all non-stop words

In [64]:
user_input = "Whose thumb was fractured?"

nlp_user_input = nlp(user_input)

lst_words = []
for token in nlp_user_input:
    if token.is_stop == False and len(token.text) > 1:
        lst_words.append(token.text)

sql_query = "SELECT text FROM openai_bbc_coords2 WHERE"

suffix = ""

limit = " LIMIT 5"

for i, word in enumerate(lst_words):
    if i == 0:
        suffix += " text LIKE '%" + word + "%'"  
    else: 
        suffix += "OR text LIKE '%" + word + "%'"  


sql_query = sql_query + suffix + limit
print(f"Manually generated SQL {sql_query}")
query_sql(user_input.lower(), sql_query.lower())

StatementMeta(DataDiscovery, 30, 20, Finished, Available)

Manually generated SQL SELECT text FROM openai_bbc_coords2 WHERE text LIKE '%thumb%'OR text LIKE '%fractured%' LIMIT 5
SQL Searching: whose thumb was fractured?
-------------------
user: whose thumb was fractured?
-------------------
assistant:  Sale full-back Jason Robinson's thumb was fractured. [robinson out of six nations]

-------------------
Prompt:
<|im_start|>system
    An Assistant that helps the people with their questions regarding sports articles.
    Answer ONLY with the facts listed in the list of sources below. If there isn't enough information below, say you don't know. Do not generate answers that don't use the sources below. If asking a clarifying question to the user would help, ask the question. 
    Each source has a name followed by colon and the actual information, always include the source name for each fact you use in the response. Use square brakets to reference the source, e.g. [info1.txt]. Don't combine sources, list each source separately, e.g. [id][type].


*Candidate answer: "vickery out of six nations  england tight-head prop phil vickery has been ruled out of the rest of the 2005 rbs six nations after breaking a bone in his right forearm.  vickery was injured as his club side, gloucester, beat bath 17-16 in the west country derby on saturday. he could be joined on the sidelines by bath centre olly barkley, who sat out the derby due to a leg injury. barkley will have a scan on sunday and might miss englands trip to six nations leaders ireland next weekend. the news is just the latest blow for coach andy robinson, who has seen his side lose their opening two matches in the 2005 six nations. robinson is already without world cup winners jonny wilkinson, will greenwood, mike tindall, richard hill and trevor woodman through injury. vickery has broken the radius, a large bone in his forearm. he only returned to the england side last weekend after a long-term back injury, which was followed by a fractured eye socket. and the gloucester prop was only recalled after leicester tight-head julian white suffered a neck injury which has already seen him ruled out of the ireland game.  bath prop matt stevens is the only remaining tight-head in englands training squad and could be involved against ireland. but he has to play second fiddle at club level to duncan bell, who excelled for england a against france and may now be called into the squad. the extent of barkleys injury is not yet clear but bath boss john connolly rates him no better than ""50-50"" to face ireland. barkley played at inside cente in englands defeat by france and if he is unable to play*



Entailment score 0.6948408484458923
Fuzz ratio 0
spaCy 0.11682736922266071
Jaccard distance 0.0
Euclidean distance spaCy 0.5402123237740829
Manhattan distance spaCy -5.359750669971109
Chebyshev distance spaCy 0.9260266780853271
Spearman Ranking Correlation Coeeficient spaCy 0.13969313874446476
Weighted Pearson Correlation Coefficient spaCy 0.460615075852421
**Total Custom Distance Measure Score -2.481534957885742**
An error occured on record 0 string indices must be integers


*Candidate answer: vaughan endures day to forget  this was englands worst day for a long time, and was capped by captain michael vaughan being fined 100% of his match fee.  match referee clive lloyds decision followed criticism from vaughan of the umpires handling of the bad light decisions on the second day.  the bowling throughout the day was awful and, two balls from the close, geraint jones dived directly in front of marcus trescothick at first slip and dropped herschelle gibbs on 136. having declared, and sacrificed the possibility of reaching a century, michael vaughan must have been horrified by the manner in which his bowlers responded. briefly the conditions were ideal, with low cloud hanging over the ground, and although this was soon replaced by clear blue sky which made batting easier, the direction of steve harmison in particular was woeful. his first eight overs conceded only nine runs, simply because the batsmen could not reach the ball. far too many deliveries went down the leg-side and even matthew hoggard was below his best, four wickets notwithstanding.  understandably james anderson was rusty having not played a match since 4 december but harmisons day was made complete when he hobbled off with a calf strain. graeme smith and gibbs got away to a flier but were separated in the 16th over when smith was trapped lbw to hoggard for 29. this has been a disappointing series so far for the south african captain and his demise was all the more embarrassing because he fell over and was flat on the ground when umpire bucknor finally raised his painfully slow finger. the bowling was still erratic, but hoggard struck again six overs later when rudolph laced a wide half volley to gully where ashley giles took a stinging catch that dislocated his right thumb.  jacques kallis again looked very well set, and the stand with gibbs during the afternoon was looking ominous until hoggard found some extra bounce and kallis, aiming to force off the back foot, chopped the ball into his stumps for 33. boeta dippenaar is one of the least exciting batsmen currently in test cricket, and after 37 minutes, he fell to a low catch at slip for a duck. the ball was so low to the ground when trescothick took it that dippenaar was reluctant to go. tv replays - which are only used now when the umpires are unsighted - would probably have given the batsman the benefit of the doubt, but mr dar sent him on his way to a chorus of boos from the crowd. ab devilliers joined gibbs, who was now strangely becalmed and only 65 runs were scored in the afternoon session, but after devilliers hooked hoggard and was caught at long leg for 19, gibbs reached his 14th century in almost five and a half hours. the second new ball failed to force an immediate breakthrough as gibbs and boucher launched a counter attack and added 120 crucial runs before boucher was caught at point for 64.*

Entailment score 0.004894575569778681
Fuzz ratio 0
spaCy 0.11007811490426679
Jaccard distance 0.0
Euclidean distance spaCy 0.535874579591665
Manhattan distance spaCy -5.399358833134174
Chebyshev distance spaCy 0.927357840538025
Spearman Ranking Correlation Coeeficient spaCy 0.12781367077716085
Weighted Pearson Correlation Coefficient spaCy -0.6612726507519184
**Total Custom Distance Measure Score -4.354612350463867**
An error occured on record 1 string indices must be integers


*Candidate answer: "vickery upbeat about arm injury  england prop phil vickery is staying positive despite a broken arm ruling him out of the rbs six nations.  the 28-year-old fractured the radius in his right forearm during gloucesters 17-16 win over bath on saturday. he will undergo an operation on monday and is expected to be out for at least six weeks. he said: ""this isnt an injury that will stop me from working hard on the fitness elements and being around the lads."" he added: ""ive got the operation this afternoon and i could be back doing fitness work after a week."" ""as frustrating as it is*

Entailment score 0.07602415233850479
Fuzz ratio 0
spaCy 0.14530868070409286
Jaccard distance 0.0
Euclidean distance spaCy 0.5504621554459502
Manhattan distance spaCy -5.180435049086809
Chebyshev distance spaCy 0.9251292705535888
Spearman Ranking Correlation Coeeficient spaCy 0.16190672383176297
Weighted Pearson Correlation Coefficient spaCy -0.9346487536211675
**Total Custom Distance Measure Score -4.256253242492676**
An error occured on record 2 string indices must be integers


*Candidate answer: "robinson out of six nations  england captain jason robinson will miss the rest of the six nations because of injury.  robinson, stand-in captain in the absence of jonny wilkinson, had been due to lead england in their final two games against italy and scotland. but the sale full-back pulled out of the squad on wednesday because of a torn ligament in his right thumb. the 30-year-old will undergo an operation on friday but england have yet to name a replacement skipper.  robinson said: ""this is very disappointing for me as this means i miss englands last two games in the six nations at twickenham and two games for my club*

Entailment score 0.5634950995445251
Fuzz ratio 0
spaCy 0.0692187180187271
Jaccard distance 0.0
Euclidean distance spaCy 0.5207955293268097
Manhattan distance spaCy -5.641562014892697
Chebyshev distance spaCy 0.9209987497329712
Spearman Ranking Correlation Coeeficient spaCy 0.08046957865522673
Weighted Pearson Correlation Coefficient spaCy 0.9843386537867757
**Total Custom Distance Measure Score -2.5022454261779785**
An error occured on record 3 string indices must be integers


*Candidate answer: "celts savour grand slam prospect  the six nations has heralded a new order in northern hemisphere rugby this year and wales and ireland rather than traditional big guns france and england face a potential grand slam play-off in three weeks time.  but before that game in cardiff, wales must get past scotland at murrayfield, while ireland face the not insignificant task of a home fixture with the mercurial french. no-one knows what mood france will be in at lansdowne road on 12 march - sublime, as in the first half against wales, or ridiculous, like in the same period against england at twickenham. but how the mighty have fallen. england sat on rugbys summit 15 months ago as world champions and 2003 grand slam winners. but they have lost nine of their 14 matches since that heady night in sydney. and they face the ignominy of what could amount to a wooden spoon play-off against italy in a fortnight.  england are enduring their worst run in the championship since captain richard hill was dumped in favour of mike harrison after three straight losses in 1987. coach andy robinson, who took over from the successful sir clive woodward in september, has lost a phalanx of world cup stars. and he is enduring the toughest of teething problems in bedding down his own style with a new team. the same year that england ruled the roost, a woeful wales lost all five matches in the six nations. and they won only two games, against scotland and italy, in 2004. wales most recent championship title was in 1994, and their last grand slam success came in 1978 in the era of gareth edwards, phil bennett, jpr williams et al.  but welsh rugby fans remain on permanent tenterhooks for the blossoming of a new golden age. after several false dawns, coach mike ruddock may have come up with the team and philosophy to match expectations. the fresh verve is inspired by skipper gareth thomas, now out with a broken thumb, accurate kicking from either fly-half stephen jones or centre gavin henson, a rampant martyn williams leading the way up front, and exciting runners in the guise of henson and shane williams. ireland coach eddie osullivan and captain brian odriscoll have got their side buzzing too, and they are close to shedding the ""nearly-men"" tag that has dogged them for the past few years. the men from the emerald isle have been six nations runners-up for the past two years*

Entailment score 0.15633727610111237
Fuzz ratio 0
spaCy 0.06291821753591906
Jaccard distance 0.0
Euclidean distance spaCy 0.5269126043771132
Manhattan distance spaCy -5.577703699916601
Chebyshev distance spaCy 0.9270708322525024
Spearman Ranking Correlation Coeeficient spaCy 0.07906156269771251
Weighted Pearson Correlation Coefficient spaCy 0.6606345910166012
**Total Custom Distance Measure Score -3.1647684574127197**
An error occured on record 4 string indices must be integers


# Categoriser - Let's use OpenAI to label the cluster data for us so that we have a dataset for training an ML model - only works if the clustering step was run

In [None]:
user_input = "Categorise these documents by the subject they have in common"

sql_query = "SELECT text FROM openai_bbc_coords_vectors WHERE prediction == 4"

suffix = ""

limit = " LIMIT 10"


sql_query = sql_query + suffix + limit
print(sql_query)
query_sql(user_input.lower(), sql_query.lower())


# Now the output from the assistant can be used to update the data in the SQL table for that prediction/cluster



# Sample:  Use CODEX to generate the SQL

In [20]:
user_input = "Who was the England Captain"
prompt = f"""
Table openai_bbc_coords2, columns = [text]
Create a SQL query for all text that contains the phrase {user_input}
"""

completion = openai.Completion.create(
        engine=openai_code_deployment,
        prompt=prompt,
        temperature=0.3,
        max_tokens=1024,
        stop=["<|im_end|>", "<|im_start|>"])
completion.choices[0].text

StatementMeta(, 8eed670e-02ed-4000-b875-9b4e4156292d, 24, Finished, Available)

'\nTable DataDiscovery.openai_bbc_coords2, columns = [text]\nCreate a SQL query for all text that contains the phrase Who was the England Captain\n\nTable DataDiscovery.openai_bbc_coords2, columns = [text]\nCreate a SQL query for all text that contains the phrase Who was the England Captain\n\nTable DataDiscovery.openai_bbc_coords2, columns = [text]\nCreate a SQL query for all text that contains the phrase Who was the England Captain\n\nTable DataDiscovery.openai_bbc_coords2, columns = [text]\nCreate a SQL query for all text that contains the phrase Who was the England Captain\n\nTable DataDiscovery.openai_bbc_coords2, columns = [text]\nCreate a SQL query for all text that contains the phrase Who was the England Captain\n\nTable DataDiscovery.openai_bbc_coords2, columns = [text]\nCreate a SQL query for all text that contains the phrase Who was the England Captain\n\nTable DataDiscovery.openai_bbc_coords2, columns = [text]\nCreate a SQL query for all text that contains the phrase Who wa

# Get most similar document from any input string against the corpus using OpenAI embeddings - - only works if the clustering step was run

In [40]:
input_text = """The manager of Ireland was quoted"""

# Get the embedding for the input text
input_vector = openai_embeddings(input_text)

# Get all values in a list - obviously not good for large datasets
df = spark.sql("SELECT * FROM openai_bbc_coords_vectors LIMIT 1000")
# Move to list
candidates_list = df.select("OpenAI_features").rdd.flatMap(lambda x: x).collect()

# Create torch tensors
st_candidate_embedding = []
for query_embedding in candidates_list:
    torch_tensor = torch.Tensor(query_embedding)
    st_candidate_embedding.append(torch_tensor)
    
st_candidate_tensor = torch.stack(st_candidate_embedding)

# Now for the input string
ip = input_vector.toArray().tolist()
input_tensor = torch.Tensor(ip)

st_section_topk = {}
st_section_score_topk = {}


# Reshape tensors and get the cosine similarity
for i, section_tensor in enumerate(st_candidate_tensor):

    section_tensor = section_tensor.unsqueeze(0)
    input_tensor = input_tensor.squeeze(0).squeeze(0)
    cosine_distance = F.cosine_similarity(input_tensor, section_tensor)
    st_section_topk[i] = cosine_distance

# Display the top 5 most similar documents
sim_res = sorted(st_section_topk.items(), key=lambda kv: kv[1], reverse=True)

scores = {}

for i in range(5):

    df = spark.sql(f"SELECT _c0, text FROM openai_bbc_coords_vectors where _c0 == {sim_res[i][0]}")
    lst_candidate = df.select("text").rdd.flatMap(lambda x: x).collect()

    hyp = lst_candidate[0].replace("\n", " ")
    display(Markdown(f'*Candidate answer: {hyp}*'))
    scores[i] = 0

    # roBERTA Entailment
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    # run through model pre-trained on MNLI
    x = tokenizer.encode(term_to_search_for, hyp, return_tensors='pt',
                            truncation_strategy='only_first')
    logits = nli_model(x.to(device))[0]

    # we throw away "neutral" (dim 1) and take the probability of
    # "entailment" (2) as the probability of the label being true
    entail_contradiction_logits = logits[:, [0, 2]]
    probs = entail_contradiction_logits.softmax(dim=1)
    prob_label_is_true = probs[:, 1]
    print(f"Entailment score {prob_label_is_true[0]}")
    scores[i] += prob_label_is_true[0]

    # Fuzz ratio (Levenschein Distance)
    score = fuzz.ratio(term_to_search_for, hyp)
    print(f"Fuzz ratio {score}")
    scores[i] += score

    # spaCy cosine similarity
    doc1 = nlp(term_to_search_for)
    doc2 = nlp(hyp)
    score = doc1.similarity(doc2)
    print(f"spaCy {score}")
    scores[i] += score

    # Jaccard Distance
    score = get_jaccard_sim(term_to_search_for, hyp)
    print(f"Jaccard distance {score}")
    scores[i] += score

    # Euclidean Distance
    score = (100 - euclidean(nlp(term_to_search_for).vector, nlp(hyp).vector)) / 100
    print(f"Euclidean distance spaCy {score}")
    scores[i] += score

    # Manhattan Distance
    score = (100 - manhattan(nlp(term_to_search_for).vector, nlp(hyp).vector)) / 100
    print(f"Manhattan distance spaCy {score}")
    scores[i] += score

    # Chebysev Distance
    score = (100 - chebyshev(nlp(term_to_search_for).vector, nlp(hyp).vector)) / 100
    print(f"Chebyshev distance spaCy {score}")
    scores[i] += score

    # Spearman Ranking Correlation Coeeficient spaCy
    score, _ = spearmanr(nlp(term_to_search_for).vector, nlp(hyp).vector)
    print(f"Spearman Ranking Correlation Coeeficient spaCy {score}")
    scores[i] += score

    # Weighted Pearson Correlation Coefficient spaCy
    weights = nlp(hyp).vector.shape[0]
    score = corr(nlp(term_to_search_for).vector, nlp(hyp).vector, weights)
    print(f"Weighted Pearson Correlation Coefficient spaCy {score}")
    scores[i] += score

    print(f'**Total Custom Distance Measure Score {scores[i]}**')
    print(f'**Azure Cognitive Search Score {doc["@search.score"]}**')


StatementMeta(, 5870d75e-7834-4fec-a8b7-a007a675f119, 44, Finished, Available)

# Optional: Build the Concept Graph

## Optional: Amend this section to build your concept graph

In [None]:
df_graph = spark.sql("SELECT * FROM openai_bbc_coords2 LIMIT 1000")

lst_text = df_graph.select('text').rdd.flatMap(lambda x: x).collect()

lst_source_node = []
lst_source_node_weight = []
lst_source_node_label = []
lst_target_node = []
lst_target_node_weight = []
lst_target_node_label = []
lst_source_url = []
lst_target_url = []
lst_edge_weight = []
lst_edge_colour_weight = []

lst_g_nodes = []
lst_g_edges = []

dict_nodes = {}

from itertools import combinations

for i, row in enumerate(lst_text):
    
    combos = list(combinations(row, 2))
   
    for c in combos:
        # First update edge weights
        if (c[0] + "_" + c[1] not in dict_nodes) and (c[1] + "_" + c[0] not in dict_nodes):
            dict_nodes[c[0] + "_" + c[1]] = 1 # initialise and create first combo
        elif c[0] + "_" + c[1] in dict_nodes:
            dict_nodes[c[0] + "_" + c[1]] += 1
        elif c[1] + "_" + c[0] in dict_nodes:
            dict_nodes[c[1] + "_" + c[0]] += 1

    for c in combos:
        lst_source_node.append(c[0])
        lst_g_nodes.append((c[0], c[0]))
        lst_target_node.append((c[1]))
        lst_g_nodes.append((c[1], c[1]))
        if c[0] + "_" + c[1] in dict_nodes:
            lst_edge_weight.append(dict_nodes[c[0] + "_" + c[1]])
            lst_source_node_weight.append(dict_nodes[c[0] + "_" + c[1]])
            lst_target_node_weight.append(dict_nodes[c[0] + "_" + c[1]])
            lst_g_edges.append((c[0],c[1], "related"))
        else:
            lst_edge_weight.append(dict_nodes[c[1] + "_" + c[0]])
            lst_source_node_weight.append(dict_nodes[c[1] + "_" + c[0]])
            lst_target_node_weight.append(dict_nodes[c[1] + "_" + c[0]])

topn = dict(sorted(dict_nodes.items(), key = itemgetter(1), reverse = True)[:number_of_connected_nodes])

# Assign edge weight colour
for key in zip(lst_source_node, lst_target_node):
    
    if key[0] + "_" + key[1] in topn or key[1] + "_" + key[0] in topn:
        lst_edge_colour_weight.append("red")
    else:
        lst_edge_colour_weight.append("black")


# Create the Graph RDD
columns = ['source', 'target', 'source_node_weight', 'target_node_weight', 'edge_weight', 'edge_colour']
df_concept_graph = spark.createDataFrame(zip(lst_source_node, lst_target_node, lst_source_node_weight, lst_target_node_weight, lst_edge_weight, lst_edge_colour_weight), columns)
  
# Create a Vertex DataFrame with unique ID column "id"
v = sqlContext.createDataFrame(lst_g_nodes, ["id", "name"])
# Create an Edge DataFrame with "src" and "dst" columns
e = sqlContext.createDataFrame(lst_g_edges, ["src", "dst", "relationship"])

## Show degree connectivity

In [1]:
from graphframes import GraphFrame
g = GraphFrame(v, e)

# Query: Get in-degree of each vertex.
df_degree = g.inDegrees
df_degree.sort(['inDegree'], ascending=False).show()

StatementMeta(, , , Cancelled, )

## Run PageRank 

In [None]:
# Query: Count the number of "follow" connections in the graph.
g.edges.filter("relationship = 'relationship'").count()

# Run PageRank algorithm, and show results.
results = g.pageRank(resetProbability=0.01, maxIter=20)
results.vertices.select("id", "pagerank").show()

## Save the outcome with two operations (usual or coalesce) to display in the PowerBI Network Navigator

In [None]:
df_concept_graph.write.mode('overwrite').options(header='true').csv(os.path.join(output_directory, output_filename[:-4] + "concept_graph.csv"))