Copyright (c) Microsoft Corporation. All rights reserved.

Licensed under the MIT License.

# Using OpenAI to classify the BBC Sports documents

## This cell configures the Spark session - Do not change

In [27]:
%%configure -f
{
"conf": {
     "spark.rpc.message.maxSize": 1024,
     "spark.kryoserializer.buffer.max": "256m"
   }
}

StatementMeta(, 13, -1, Finished, Available)


## These are the parameters that need to be changed to your values

In [28]:
# --- Storage ---------------------------------------------------------------
# Full ABFSS path of the CSV file to read
input_filename = "abfss://share@datadiscoverypipeline.dfs.core.windows.net/sport_articles.csv"
# Directory the output file is meant to be written to
output_directory = "abfss://share@datadiscoverypipeline.dfs.core.windows.net/sport_articles/output/"
# Name of the output file inside output_directory
output_filename = "bbc_classification.csv"

# --- Azure OpenAI ----------------------------------------------------------
# Name of the OpenAI model deployment
deployment_name = ""
# OpenAI access key. Fill in at run time; avoid committing a real key with
# the notebook (prefer a secret store such as Key Vault).
key = ""
# OpenAI service name, e.g. "myopenai" - only the prefix of the endpoint host
service_name = ""

# --- Azure ML workspace (used for experiment tracking) ---------------------
# Azure subscription id
subscription_id = ""
# Resource group that contains the Azure ML workspace
resource_group = ""
# Azure ML workspace name
workspace_name = ""



StatementMeta(DataDiscovery, 13, 1, Finished, Available)

## Track the Experiment in Azure ML

In [None]:
from azureml.core import Workspace, Experiment, Run
import mlflow

# Point MLflow at the Azure ML workspace identified in the parameters cell.
ws = Workspace(
    subscription_id=subscription_id,
    resource_group=resource_group,
    workspace_name=workspace_name,
)
mlflow.set_tracking_uri(ws.get_mlflow_tracking_uri())

# Experiment name has the form "(<notebook name>_<job id>)".
experiment_name = f"({mssparkutils.runtime.context['notebookname']}_{str(mssparkutils.env.getJobId())})"
mlflow.set_experiment(experiment_name)

# Log only the parameters this notebook actually defines. The previous version
# also logged number_of_clusters, account_url, account_name, file_system_name,
# directory_name, SCATTER_PLOT_3D and LOW_MEMORY_MODE, none of which exist in
# this notebook, so the cell failed with NameError.
# The OpenAI key is intentionally NOT logged.
run_parameters = {
    "input_filename": input_filename,
    "output_directory": output_directory,
    "output_filename": output_filename,
    "deployment_name": deployment_name,
    "service_name": service_name,
}
mlflow.log_params(run_parameters)

# Runtime context of the Synapse session, logged alongside the run parameters.
params = {
    "sparkpool": mssparkutils.runtime.context['sparkpool'],
    "workspace": mssparkutils.runtime.context['workspace'],
    "notebookname": mssparkutils.runtime.context['notebookname'],
    "isForPipeline": mssparkutils.runtime.context['isForPipeline'],
    "pipelinejobid": mssparkutils.runtime.context['pipelinejobid']
}

mlflow.log_params(params)
mlflow.pyspark.ml.autolog()

In [29]:
from pyspark.ml import Pipeline
from pyspark import SparkContext, SparkConf
import sys
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType
from pyspark.sql import SparkSession
import ntpath
import os
import numpy as np
from pyspark.sql.functions import  col, udf,mean,  concat, lit
from pyspark.sql import functions as F
from synapse.ml.cognitive import OpenAICompletion

# Read the input CSV; the first line of the file provides the column names.
df = spark.read.format('csv').option('header', True).load(input_filename)

StatementMeta(DataDiscovery, 13, 2, Finished, Available)

In [35]:
# Instruction prepended to every article to form the completion prompt.
# NOTE(review): the instruction is concatenated directly onto the article text
# with no separator ("...Tennis]<article>"). The string is left unchanged here
# so results stay comparable with earlier runs - consider adding a delimiter.
PROMPT_INSTRUCTION = "Classify the following news article into only 1 of the following categories: [Football, Athletics, Cricket, Rugby, Tennis]"

df = df.select(
    concat(lit(PROMPT_INSTRUCTION), col('text')).alias('prompt'),
    "filename",
    "text",
)

# Completion stage: reads the "prompt" column, writes the service response to
# "classification_output" and any per-row failure to "error".
classification = (
    OpenAICompletion()
    .setSubscriptionKey(key)
    .setDeploymentName(deployment_name)
    .setUrl("https://{}.openai.azure.com/".format(service_name))
    .setMaxTokens(200)
    .setPromptCol("prompt")
    .setErrorCol("error")
    .setOutputCol("classification_output")
)

# Keep the raw response, the error, the file name, and the text of the first
# returned choice (aliased "classification").
# The result is cached because Spark evaluates lazily: without it, every later
# action (display, collect) would re-run the transform and call the OpenAI
# service again, costing extra requests and possibly returning different
# completions between the displayed table and the evaluated predictions.
classification_df = (
    classification.transform(df)
    .select(
        col("classification_output"),
        col("error"),
        col('filename'),
        col("classification_output.choices.text").getItem(0).alias("classification"),
    )
    .cache()
)



StatementMeta(DataDiscovery, 13, 8, Finished, Available)

# Generate the actual labels from the file names


In [40]:
import re
def find_first_digit(s1):
    """Return the part of ``s1`` that precedes its first digit.

    Used to derive a label from a file name such as ``"cricket012.txt"``
    (-> ``"cricket"``).

    Args:
        s1: The string to inspect. May be ``None`` or empty (e.g. a null
            ``filename`` cell in the DataFrame).

    Returns:
        The prefix of ``s1`` up to, but not including, the first digit.
        An empty string when ``s1`` is ``None``/empty, contains no digit,
        or starts with a digit.
    """
    # A null column value reaches the UDF as None; re.search would raise
    # TypeError on it, failing the whole Spark job.
    if not s1:
        return ""

    m = re.search(r"\d", s1)
    return s1[:m.start()] if m else ""

# Expose find_first_digit to Spark as a UDF that returns a string.
udf_get_label = udf(find_first_digit, StringType())

# Derive the actual label of each row from its file name.
classification_df = classification_df.withColumn(
    "actual_sport_classification",
    udf_get_label(col("filename")),
)

StatementMeta(DataDiscovery, 13, 13, Finished, Available)

# Search the OpenAI completion for the predicted label

In [43]:
from flashtext import KeywordProcessor
# Sport names to look for in the completion text.
list_of_sports = ["athletics", "football", "rugby", "tennis", "cricket"]

# Register every sport name as a keyword in a single call.
keyword_processor = KeywordProcessor()
keyword_processor.add_keywords_from_list(list_of_sports)


def classify(row):
    """Pick the sport label mentioned first in a completion string.

    Args:
        row: Text to scan (the completion text of one DataFrame row).

    Returns:
        The registered keyword with the smallest start offset, with
        "athletics" rewritten to "atheletics"; an empty string when no
        keyword is found.
    """
    matches = keyword_processor.extract_keywords(row, span_info=True)
    if not matches:
        return ""

    # With span_info=True each match carries its start offset in position 1;
    # take the keyword of the earliest match.
    earliest = min(matches, key=lambda match: match[1])[0]

    # The actual labels spell this sport "atheletics", so mirror that
    # misspelling to keep predictions comparable with them.
    return "atheletics" if earliest == "athletics" else earliest

# Expose classify to Spark as a UDF that returns a string.
udf_classify = udf(classify, StringType())

# Derive the predicted label of each row from the completion text.
classification_df = classification_df.withColumn(
    "openai_sport_classification",
    udf_classify(col("classification")),
)

StatementMeta(DataDiscovery, 13, 16, Finished, Available)

In [44]:
# Render the classified rows (raw output, error, file name, completion text,
# actual label and predicted label) for visual inspection.
display(classification_df)

StatementMeta(DataDiscovery, 13, 17, Finished, Available)

SynapseWidget(Synapse.DataFrame, a82032d3-25fa-47ae-a1e7-3f44aa738a7a)

In [45]:
# Bring actual and predicted labels to the driver in ONE pass. Collecting the
# two columns in separate jobs would evaluate the pipeline twice and would
# rely on both jobs returning rows in the same order; pulling them from the
# same Row objects guarantees y_true[i] and y_pred[i] describe the same article.
label_rows = classification_df.select(
    "actual_sport_classification",
    "openai_sport_classification",
).collect()

y_true = [row["actual_sport_classification"] for row in label_rows]
y_pred = [row["openai_sport_classification"] for row in label_rows]

StatementMeta(DataDiscovery, 13, 18, Finished, Available)

# Let's evaluate

In [46]:
from sklearn.metrics import classification_report, confusion_matrix

# Show how many predicted and actual labels there are (they should match).
n_predicted, n_actual = len(y_pred), len(y_true)
print(n_predicted, n_actual)

# Per-class precision / recall / F1 of the predictions against the actual labels.
report = classification_report(y_true, y_pred)
print(report)

StatementMeta(DataDiscovery, 13, 19, Finished, Available)

737 737
              precision    recall  f1-score   support

                   0.00      0.00      0.00         0
  atheletics       0.99      0.84      0.91       101
     cricket       1.00      0.87      0.93       124
    football       0.78      0.99      0.88       265
       rugby       0.99      0.63      0.77       147
      tennis       1.00      0.98      0.99       100

    accuracy                           0.88       737
   macro avg       0.79      0.72      0.75       737
weighted avg       0.92      0.88      0.88       737

