In [100]:
import os
from typing import Literal

import ollama
import pandas as pd
from ollama import chat
from openai import OpenAI
from pydantic import BaseModel

# Display settings: cap the rendered width of each column at 70 characters
# so long review texts are shortened when DataFrames are printed.
pd.options.display.max_colwidth = 70

In [108]:
# Model and prompt specific settings

# Topics the model is allowed to assign to a review; this list is interpolated
# into the system prompt.
predefined_topics = [
    'Oil Issue', 'Door Issue', 'Brake Issue',
    'Service Issue',
    'Communication',
    'Behaviour',
    'Repair Cost', 'Repair Time',
    'Others',
]

# Sentiment labels the model is allowed to return; also interpolated into the prompt.
allowed_sentiments = ['Positive', 'Negative', 'Neutral']

# OpenAI API key. Read from the OPENAI_API_KEY environment variable so the
# secret is never saved inside the notebook; falls back to an empty string
# (the previous placeholder value) when the variable is not set.
api_key = os.environ.get("OPENAI_API_KEY", "")

# Low temperature to make the model follow instructions more closely
temperature = 0.3

# Maximum number of output tokens the model may generate. Raise this if a large
# output is expected; 5000 is a good default for up to 200-300 reviews. Capping
# it also prevents long hallucinated outputs.
max_tokens = 5000

# How many reviews to analyze (either sampled randomly or taken from the top
# of the data, depending on `random_sample`)
n_reviews_to_analyze = 30

# Randomly sample the reviews to analyze? If False, the first
# `n_reviews_to_analyze` reviews are used.
random_sample = True

# Which model to use
model = "gpt-4o-mini"

######### Data Related Settings ###########
# NOTE(review): absolute, machine-specific path; adjust before running elsewhere.
path_of_data = r"D:\Drive_Legion_1\ford_service_review_processed.xlsx"

In [102]:
# Read data and keep only rows that actually contain a review
data = pd.read_excel(path_of_data)
data = data.dropna(subset=['review'])
data["review_original"] = data["review"]
# reset_index() renumbers the rows 0..N-1 after the dropna (the previous labels
# are kept in a new "index" column), so row labels and row positions coincide.
data = data.reset_index()

# Select which reviews to analyze. Both branches record the row index of every
# selected review in `indices_of_reviews`: the prompt numbers the reviews with
# these indices, so the name must exist even when random sampling is disabled.
if random_sample:
    selected_reviews = data['review_original'].sample(n_reviews_to_analyze)
else:
    selected_reviews = data['review_original'].iloc[:n_reviews_to_analyze]

indices_of_reviews = selected_reviews.index.values.tolist()
reviews_to_analyze = selected_reviews.values.tolist()


## Use Structured Outputs (Supported by gpt-4o-mini and later) + Ask model to select from predefined topics.

In [103]:
########## Define the format in which the output is expected ##############

# Define schema of structured outputs

# Label types built from the configured lists. Subscripting Literal with a
# tuple is the same as listing its items, so these accept exactly the values in
# `predefined_topics` / `allowed_sentiments`. This puts the allowed values into
# the JSON schema sent as `response_format`, instead of relying on the prompt
# wording alone to keep the model on-list. Parsed values are still plain strings.
TopicLabel = Literal[tuple(predefined_topics)]
SentimentLabel = Literal[tuple(allowed_sentiments)]


# One analyzed review.
# (Comments rather than docstrings are used on purpose: a pydantic model's
# docstring is emitted as the schema "description".)
class ReviewsSchema(BaseModel):
    # Number of the review as given in the prompt
    Index: int
    # Topics assigned to the review, each one taken from `predefined_topics`
    Topics: list[TopicLabel]
    # Sentiment of the review, taken from `allowed_sentiments`
    Sentiment: SentimentLabel


# Top-level response object: one ReviewsSchema entry per review.
class ListSchema(BaseModel):
    Results: list[ReviewsSchema]

In [104]:
# Call OpenAI
client = OpenAI(api_key=api_key)

# System prompt: lists the allowed topics and sentiments and tells the model
# that the reviews in the user message are numbered.
system_prompt = f'''Analyze the following reviews and do the following 2 things:
            1. Identify the key topics talked about in the review only out of the following comma separated list: {", ".join(predefined_topics)}.
            2. Identify the sentiments expressed in the review from the following comma separated list: {", ".join(allowed_sentiments)}.
        
            Return the output as mentioned in the structure. Reviews are numbered.'''

# User message: one "<index>. <review>" entry per selected review, separated by
# blank lines. Pairing the two lists with zip (instead of indexing up to
# n_reviews_to_analyze) keeps this valid when fewer reviews were selected than
# configured, and f-string formatting also copes with non-string cells.
numbered_reviews = "\n\n".join(
    f"{review_index}. {review_text}"
    for review_index, review_text in zip(indices_of_reviews, reviews_to_analyze)
)

# Request a completion parsed into ListSchema (structured output)
completion = client.beta.chat.completions.parse(
    model=model,
    messages=[
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": numbered_reviews},
    ],
    temperature=temperature,
    max_tokens=max_tokens,
    response_format=ListSchema,
)



### View the Output

In [105]:
# Show the parsed structured output of the first choice: the list of
# ReviewsSchema entries held in ListSchema.Results.
completion.choices[0].message.parsed.Results

[ReviewsSchema(Index=928, Topics=['Service Issue'], Sentiment='Negative'),
 ReviewsSchema(Index=554, Topics=['Service Issue'], Sentiment='Positive'),
 ReviewsSchema(Index=402, Topics=['Service Issue'], Sentiment='Positive'),
 ReviewsSchema(Index=229, Topics=['Service Issue'], Sentiment='Positive'),
 ReviewsSchema(Index=443, Topics=['Others'], Sentiment='Neutral'),
 ReviewsSchema(Index=777, Topics=['Others'], Sentiment='Neutral'),
 ReviewsSchema(Index=957, Topics=['Service Issue'], Sentiment='Positive'),
 ReviewsSchema(Index=999, Topics=['Service Issue', 'Communication'], Sentiment='Negative'),
 ReviewsSchema(Index=986, Topics=['Service Issue'], Sentiment='Positive'),
 ReviewsSchema(Index=330, Topics=['Service Issue'], Sentiment='Positive'),
 ReviewsSchema(Index=122, Topics=['Repair Cost'], Sentiment='Negative'),
 ReviewsSchema(Index=907, Topics=['Service Issue'], Sentiment='Negative'),
 ReviewsSchema(Index=990, Topics=['Others'], Sentiment='Negative'),
 ReviewsSchema(Index=1100, Topics

In [106]:
# Build a table that pairs every model result with the original review text
parsed_results = completion.choices[0].message.parsed.Results

results_list = []
for current_result in parsed_results:
    review_index = current_result.Index
    # The index is echoed back by the model, so it is not guaranteed to be a
    # real row label. Look it up by label and fall back to None instead of
    # letting positional .iloc raise on an out-of-range value (or silently
    # return the wrong review for a negative one).
    if review_index in data.index:
        original_review = data.loc[review_index, "review_original"]
    else:
        original_review = None
    results_list.append(
        [review_index, current_result.Topics, current_result.Sentiment, original_review]
    )

results_df = pd.DataFrame(
    data=results_list,
    columns=['Review Index', 'Topics', 'Sentiment', 'Original Review'],
)
print(results_df.head(25))

    Review Index                                   Topics Sentiment  \
0            928                          [Service Issue]  Negative   
1            554                          [Service Issue]  Positive   
2            402                          [Service Issue]  Positive   
3            229                          [Service Issue]  Positive   
4            443                                 [Others]   Neutral   
5            777                                 [Others]   Neutral   
6            957                          [Service Issue]  Positive   
7            999           [Service Issue, Communication]  Negative   
8            986                          [Service Issue]  Positive   
9            330                          [Service Issue]  Positive   
10           122                            [Repair Cost]  Negative   
11           907                          [Service Issue]  Negative   
12           990                                 [Others]  Negative   
13    