### Installing the required Python libraries (e.g. openai)

In [98]:
import os
import re
import sys
import subprocess
import importlib.metadata
from time import sleep
from azure.identity import DefaultAzureCredential

def _ensure_installed(requirement):
    """Announce the check, then pip-install `requirement` if the metadata lookup misses.

    NOTE(review): 'openai[embeddings]' is a requirement specifier rather than a plain
    distribution name, so its lookup presumably always misses and pip is invoked on
    every run (which is also what pulls in the extras) -- confirm before changing.
    """
    print(f'Checking for {requirement}...')
    try:
        importlib.metadata.version(requirement)
    except importlib.metadata.PackageNotFoundError:
        # Install into the interpreter that is running this kernel.
        subprocess.check_call([sys.executable, '-m', 'pip', 'install', '--quiet', requirement])


# Requirements the notebook needs before the import cell can run.
required = ['openai', 'num2words', 'openai[embeddings]']
for pkg in required:
    _ensure_installed(pkg)

# Credential object from azure.identity; it is not referenced by the cells visible here.
az_credential = DefaultAzureCredential()

Checking for openai...
Checking for num2words...
Checking for openai[embeddings]...


### Importing the required python modules

In [12]:
import pandas as pd
import numpy as np
import openai
import requests
from num2words import num2words
from openai.embeddings_utils import get_embedding, cosine_similarity

# Define colors to print in the console
# Each constant is an ANSI escape sequence that switches the terminal text style;
# print ENDC afterwards to reset the styling so following text is unaffected.
BLUE = '\033[94m'
CYAN = '\033[96m'
OKGREEN = '\033[92m'
WARNING = '\033[93m'
FAIL = '\033[91m'
ENDC = '\033[0m'
BOLD = '\033[1m'
UNDERLINE = '\033[4m'

### Configure your Azure OpenAI Endpoint & API Key

You can find the Endpoint & Key in the **Keys and Endpoint** section of your Azure OpenAI resource, in the left navigation pane.

In [13]:
# Azure OpenAI endpoint. Set the AZURE_OPENAI_ENDPOINT environment variable to your
# own endpoint; the literal below is only a placeholder fallback.
RESOURCE_ENDPOINT = os.environ.get('AZURE_OPENAI_ENDPOINT', 'https://demo-1.openai.azure.com/')

# Azure OpenAI API key. Prefer the AZURE_OPENAI_API_KEY environment variable so the
# secret is never saved inside the notebook; the literal below is a placeholder value.
API_KEY = os.environ.get('AZURE_OPENAI_API_KEY', 'example3xjqidkobjcod6fvnvexample')

# Point the openai client at the Azure resource configured above.
openai.api_type = "azure"
openai.api_key = API_KEY
openai.api_base = RESOURCE_ENDPOINT
openai.api_version = "2022-12-01"

Necessary functions which will be used later

* `normalise_text` - Function to normalize the input text to remove multiple spaces, additional punctuation, etc.
* `individual_summary` - Based on the given text prompt, get the summarisation using the `text-davinci-003` model. You need to update the value of the **engine** attribute to the name of the model deployment that you have created.
* `summarise_review` - Takes data frame, asin, and category as input to calculate the summary of all the reviews for the specific asin in that category
* `get_summary_output` - Takes summarised text as input and extract the top result out of it
* `summary_response` - Takes data frame and asin as input to get the summary of text for all 3 categories (Quality, Cost, and Delivery) by calling `summarise_review` function
* `sentiment_count` - Takes data frame and asin as input to count the reviews of each sentiment (Positive, Negative, Neutral) across all 3 categories.
* `asin_review_data` - Takes data frame and asin as input to summarise the text and get the sentiment for that specific asin.



In [76]:
def normalise_text(s, sep_token=" \n "):
    """Collapse whitespace and tidy stray punctuation in a review string.

    Args:
        s: Text to clean. A non-string value is printed and returned unchanged.
        sep_token: Unused by the body; kept so callers that pass it keep working.

    Returns:
        The cleaned string, or `s` itself when it is not a string.
    """
    if not isinstance(s, str):
        print(s)
        return s
    # Collapse every whitespace run (newlines and tabs included) into one space.
    s = re.sub(r'\s+', ' ', s).strip()
    # Remove a stray ". ," sequence. The dot is escaped so that only a literal
    # period matches; unescaped it matched ANY character and ate the letter
    # preceding " ," (e.g. "good , bad" became "goo bad").
    s = re.sub(r"\. ,", "", s)
    # Reduce doubled periods, with or without a space between them, to one.
    s = s.replace("..", ".")
    s = s.replace(". .", ".")
    return s.strip()


def individual_summary(r_text):
    """Send `r_text` as a completion prompt and return the raw API response.

    The engine name now matches the "text-davinci-003" value used by the other
    completion calls in this notebook (it previously read "test-davinci-003").
    Change it if your Azure deployment has a different name.

    Args:
        r_text: The full prompt text to complete.

    Returns:
        The object returned by `openai.Completion.create`, unmodified.
    """
    response = openai.Completion.create(
        engine="text-davinci-003",
        prompt=r_text,
        temperature=0.7,
        max_tokens=100,
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0,
        best_of=1,
        stop=None)
    return response


def summarise_review(df, asin, category):
    """Summarise every review of one product within one category.

    All matching `reviewText` values are joined, normalised and sent to
    `individual_summary` with a "Tl;dr" cue. Text too long for a single prompt
    is summarised chunk by chunk, and the partial summaries are then summarised
    once more.

    Args:
        df: Data frame with `asin`, `category` and `reviewText` columns.
        asin: Product identifier to filter on.
        category: Category label to filter on.

    Returns:
        The string 'Empty' when no row matches, otherwise the response returned
        by `individual_summary`.
    """
    prompt_suffix = '\n\nTl;dr'
    max_prompt_chars = 3900

    matches = df[(df['asin'] == asin) & (df['category'] == category)]
    if len(matches) == 0:
        return 'Empty'

    body = normalise_text(' '.join(matches['reviewText']))
    if len(body) + len(prompt_suffix) < max_prompt_chars:
        return individual_summary(body + prompt_suffix)

    # Reserve room for the cue so every chunk prompt stays within the limit.
    chunk_size = max_prompt_chars - len(prompt_suffix)
    partial_summaries = []
    # Stepping by chunk_size up to len(body) also covers the final partial chunk,
    # which the previous integer-division loop silently discarded.
    for start in range(0, len(body), chunk_size):
        chunk_response = individual_summary(body[start:start + chunk_size] + prompt_suffix)
        first_text = next(
            (choice['text'] for choice in chunk_response['choices'] if choice['index'] == 0),
            '')
        first_text = first_text.removeprefix(': ').strip()
        if first_text:
            partial_summaries.append(first_text)

    # NOTE: the combined partial summaries are sent as one prompt and are not re-chunked.
    return individual_summary(' '.join(partial_summaries) + prompt_suffix)


def get_summary_output(summary_data):
    """Pull the text of the index-0 choice out of a summary response.

    Args:
        summary_data: Either the sentinel string 'Empty' or a mapping with a
            'choices' list whose items carry 'index' and 'text'.

    Returns:
        'No Reviews' for the sentinel; otherwise the index-0 text with a leading
        ': ' removed, or None when no choice has index 0.
    """
    if summary_data == 'Empty':
        return 'No Reviews'

    top_choice = next(
        (choice for choice in summary_data['choices'] if choice['index'] == 0),
        None,
    )
    if top_choice is None:
        return None
    return top_choice['text'].removeprefix(': ')


def summary_response(df, asin):
    """Build the per-category summary text for one product.

    Calls `summarise_review` for Quality, Cost and Delivery (in that order) and
    passes each response through `get_summary_output`.

    Returns:
        A dict keyed 'quality', 'cost' and 'delivery'.
    """
    category_names = ('Quality', 'Cost', 'Delivery')
    # Fetch all three responses first, then extract the text from each.
    raw_responses = [summarise_review(df, asin, name) for name in category_names]
    return {
        name.lower(): get_summary_output(raw)
        for name, raw in zip(category_names, raw_responses)
    }


def sentiment_count(df, asin):
    """Count Positive/Negative/Neutral reviews per category for one product.

    Args:
        df: Data frame with `asin`, `category` and `sentiment` columns.
        asin: Product identifier to filter on.

    Returns:
        A dict keyed 'quality', 'cost' and 'delivery'; each value is a dict keyed
        'positive', 'negative' and 'neutral' holding the count (0 when absent).
    """
    product_rows = df[df['asin'] == asin]
    sentiment_data = {}
    for category in ('Quality', 'Cost', 'Delivery'):
        tallies = product_rows[product_rows['category'] == category].sentiment.value_counts()
        # A label missing from the tallies yields None from .get, hence the `or 0`.
        sentiment_data[category.lower()] = {
            label.lower(): tallies.get(key=label) or 0
            for label in ('Positive', 'Negative', 'Neutral')
        }
    return sentiment_data


def asin_review_data(df, asin):
    """Return `(summaries, sentiment counts)` for one product.

    The summaries come from `summary_response` and the counts from
    `sentiment_count`, called in that order with the same arguments.
    """
    return summary_response(df, asin), sentiment_count(df, asin)

Load the appliances data as a pandas dataframe

In [83]:
# Read the line-delimited JSON file (one record per line) through the public
# pandas entry point rather than the internal pd.io.json module path.
df = pd.read_json('Appliances_5.json', lines=True)
# Preview five randomly chosen rows.
df.sample(n=5)

(2277, 6)

Create a new data frame only with the columns `asin`, `reviewText`, and `summary`.

In [82]:
# Keep only the asin, reviewText and summary columns in the staging frame.
staged_columns = ['asin', 'reviewText', 'summary']
df_stage = df.loc[:, staged_columns]
# The trailing expression displays the (rows, columns) shape.
df_stage.shape

(2277, 3)

Drop the NaN values and normalize the text by removing additional spaces and punctuation

In [59]:
# Discard rows with any missing value, then clean each review's text.
df_stage = df_stage.dropna()
df_stage['reviewText'] = df_stage['reviewText'].apply(normalise_text)

Call the OpenAI API to classify each review as either **Cost**, **Quality**, or **Delivery**, and store the result in the same dataframe in a new column named `category`.

In [132]:
def classify_text(s):
    """Ask the completion model to label a review as Cost, Quality or Delivery.

    Args:
        s: Review text that is interpolated into the classification prompt.

    Returns:
        The stripped text of the index-0 choice, or None when the response has
        no choice with index 0.
    """
    prompt = f'Classify the following review into 1 of the following categories:  categories: [Cost, Quality, Delivery]\n\nreview: {s}\n\nClassified category:'
    request = {
        'engine': "text-davinci-003",
        'prompt': prompt,
        'temperature': 0,
        'max_tokens': 100,
        'top_p': 1,
        'frequency_penalty': 0,
        'presence_penalty': 0,
        'best_of': 1,
        'stop': None,
    }
    completion = openai.Completion.create(**request)
    # Pause after every request before returning to the caller.
    sleep(0.4)
    top_choice = next(
        (choice for choice in completion['choices'] if choice['index'] == 0),
        None,
    )
    if top_choice is None:
        return None
    return top_choice['text'].strip()

# One API call per review: store the predicted label in a new `category` column.
df_stage['category'] = df_stage['reviewText'].apply(classify_text)

In [15]:
def detect_sentiment(s):
    """Ask the completion model to label a review as Positive, Neutral or Negative.

    Args:
        s: Review text that is interpolated into the sentiment prompt.

    Returns:
        The stripped text of the index-0 choice, or None when the response has
        no choice with index 0.
    """
    prompt = f'Identify the sentiment in the following review into 1 of the following categories:  categories: [Positive, Neutral, Negative]\n\nreview: {s}\n\nClassified category:'
    request = {
        'engine': "text-davinci-003",
        'prompt': prompt,
        'temperature': 0,
        'max_tokens': 100,
        'top_p': 1,
        'frequency_penalty': 0,
        'presence_penalty': 0,
        'best_of': 1,
        'stop': None,
    }
    completion = openai.Completion.create(**request)
    # Pause after every request before returning to the caller.
    sleep(0.4)
    top_choice = next(
        (choice for choice in completion['choices'] if choice['index'] == 0),
        None,
    )
    if top_choice is None:
        return None
    return top_choice['text'].strip()

# df_stage['sentiment'] = df_stage['reviewText'].apply(lambda x : detect_sentiment(x))

In [14]:
# Uncomment the next line to reload previously saved results from
# 'category_class_output.csv' instead of repeating the API calls.
# df_stage = pd.read_csv('category_class_output.csv')
# Display the (rows, columns) shape of the staging frame.
df_stage.shape

(2277, 5)

In [96]:
# Preview five randomly chosen rows of the staging frame.
df_stage.sample(n=5)

Unnamed: 0.1,Unnamed: 0,asin,reviewText,summary,category,sentiment
1311,1311,B0006GVNOA,Great product but they need to include more ro...,"Works Great, Just Use Common Sense When Doing ...",Quality,Neutral
116,116,B0006GVNOA,We have 24 foot of solid dryer vent pipe endin...,This really worked well even taking shortcuts,Quality,Positive
2062,2062,B0006GVNOA,first thing first: it works. the kit is great ...,good kit with some caveats,Cost,Neutral
603,603,B0006GVNOA,Works great. I used it and an extension kit wi...,Worked like a charm,Quality,Positive
2146,2146,B0042U16YI,A+,Five Stars,Quality,Positive


Write your dataframe to CSV, which enables you to load it as a dataframe later instead of making API calls to your model every time to detect the sentiment and classify the review text.

In [17]:
# df_stage.to_csv('category_class_output.csv', index = True)

Get summary of individual text

In [None]:
def individual_summary(r_text):
    """Request a completion for `r_text` and hand back the raw response.

    This re-declares the helper of the same name defined earlier in the
    notebook; once this cell runs, this version is the one callers get.

    Args:
        r_text: The full prompt text to complete.

    Returns:
        The object returned by `openai.Completion.create`, unmodified.
    """
    request = {
        'engine': "text-davinci-003",
        'prompt': r_text,
        'temperature': 0.7,
        'max_tokens': 100,
        'top_p': 1,
        'frequency_penalty': 0,
        'presence_penalty': 0,
        'best_of': 1,
        'stop': None,
    }
    return openai.Completion.create(**request)

Get the count of sentiment for a specific ASIN

In [95]:
# One row per product: the asin and how many staged reviews it has.
asin_list = (
    df_stage['asin']
    .value_counts()
    .rename_axis('asin')
    .reset_index(name='review_count')
)
# Show only products with more than 5 and fewer than 25 reviews.
more_than_five = asin_list['review_count'] > 5
fewer_than_twenty_five = asin_list['review_count'] < 25
asin_list[more_than_five & fewer_than_twenty_five]

Unnamed: 0,asin,review_count
1,B0014CN8Y8,24
2,B00JGTUQJ6,13
3,B0015UGPWQ,12
4,B00CW0O1EW,8
5,B0056I99WG,8
6,B004XLDE5A,8
7,B000XXWOGW,8
8,B0053F80JA,7
9,B000NCTOUM,7
10,B000N6302Q,7


In [88]:
# Prompt for a product id, then print its summary and sentiment counts per category.
asin_num = input('Enter the asin: ')
review_summaries, senitment_data = asin_review_data(df_stage, asin_num)
print('Find below the details about review of this product:')
for heading in ('Quality', 'Cost', 'Delivery'):
    # The result dicts are keyed by the lower-case category name.
    section = heading.lower()
    counts = senitment_data[section]
    print(f'{BOLD}{CYAN}{heading}:{ENDC}')
    print(f'\tSummary: {review_summaries[section]}')
    print(f'\tNumber of {OKGREEN}Positive{ENDC} reviews: {counts["positive"]}')
    print(f'\tNumber of {FAIL}Negative{ENDC} reviews: {counts["negative"]}')
    print(f'\tNumber of {BLUE}Neutral{ENDC} reviews: {counts["neutral"]}')

Find below the details about review of this product:
[1m[96mQuality:[0m
	Summary: This dryer vent hose fit perfectly and was very easy to install with the provided clamps, or with screw clamps. It was the perfect height and worked great.
	Number of [92mPositive[0m reviews: 10
	Number of [91mNegative[0m reviews: 0
	Number of [94mNeutral[0m reviews: 2
[1m[96mCost:[0m
	Summary: It is value for money
	Number of [92mPositive[0m reviews: 1
	Number of [91mNegative[0m reviews: 0
	Number of [94mNeutral[0m reviews: 0
[1m[96mDelivery:[0m
	Summary: No Reviews
	Number of [92mPositive[0m reviews: 0
	Number of [91mNegative[0m reviews: 0
	Number of [94mNeutral[0m reviews: 0
