In [50]:
import os

import ollama
import pandas as pd
from ollama import chat
from openai import OpenAI
from pydantic import BaseModel

# Display settings: truncate long text columns (e.g. review bodies) at 70 characters
pd.options.display.max_colwidth = 70

In [74]:
# Model and prompt specific settings

# Topics the model is asked to choose from; the list is interpolated into the system prompt.
predefined_topics = ['Oil Issue', 'Door Issue', 'Brake Issue',
                     'Service Issue', 'Service Time', 'Service Quality',
                     'Communication',
                     'Behaviour',
                     'Repair Cost', 'Waiting Time', 'Facilities',
                     'Others']

# Sentiment labels the model is asked to choose from; also interpolated into the system prompt.
allowed_sentiments = ['Positive', 'Negative', 'Neutral']

# OpenAI API key. Read from the OPENAI_API_KEY environment variable so the secret is never
# saved inside the notebook; falls back to the previous empty placeholder when it is not set.
api_key = os.environ.get("OPENAI_API_KEY", "")

# Low temperature to make the model follow the instructions more closely
temperature = 0.01

# Upper bound on the number of output tokens the model may produce; it also cuts off long
# hallucinations. Raise it when many reviews are analyzed in one call (the author's rule of
# thumb: about 5000 is enough for up to 200-300 reviews).
max_tokens = 7000

# How many reviews to analyze: a random sample of this size when random_sample is True,
# otherwise the first rows of the data.
n_reviews_to_analyze = 30

# Do we randomly sample the reviews to analyze?
random_sample = True

# Which model to use?
model = "gpt-4o-mini"

# Extra data points that must always be analyzed, given as row positions in the loaded data
# (positions are counted after rows without a review have been dropped).
points_to_add = [0, 1, 147]

######### Data Related Settings ###########
# Location of the processed reviews workbook. Set REVIEWS_DATA_PATH to run the notebook on
# another machine without editing this cell; the default is the original local path.
path_of_data = os.environ.get(
    "REVIEWS_DATA_PATH",
    r"D:\Drive_Legion_1\ford_service_review_processed.xlsx",
)

In [75]:
# Read data: keep only rows that contain a review, then renumber the rows 0..n-1 so that a
# row's index label and its position are the same number.
data = pd.read_excel(path_of_data)
data = data.dropna(subset=['review'])
data["review_original"] = data["review"]
data = data.reset_index()

# Fixed seed so that Restart & Run All analyzes the same reviews on every run.
sampling_seed = 42

# Select the reviews to analyze. Both branches produce a Series, so the row indices are
# always recorded (previously they were only defined when random_sample was True).
if random_sample:
    # Never ask for more rows than exist; Series.sample raises otherwise.
    selected_reviews = data['review_original'].sample(
        min(n_reviews_to_analyze, len(data)), random_state=sampling_seed
    )
else:
    selected_reviews = data['review_original'].head(n_reviews_to_analyze)

indices_of_reviews = selected_reviews.index.tolist()
reviews_to_analyze = selected_reviews.tolist()

# Add the extra data points
for ind in points_to_add:
    if ind in indices_of_reviews:
        # Already selected above; do not send the same numbered review twice.
        continue
    reviews_to_analyze.append(data['review_original'].iloc[ind])
    indices_of_reviews.append(ind)

# Keep the counter equal to the number of reviews actually collected, for any cell that loops
# over range(n_reviews_to_analyze). Because this overwrites the configured value, re-run the
# settings cell before re-running this one.
n_reviews_to_analyze = len(reviews_to_analyze)

In [76]:
# Sanity check: how many reviews will be sent to the model (selected reviews plus extra points).
len(reviews_to_analyze)

33

## Use Structured Outputs (Supported by gpt-4o-mini and later) + Ask model to select from predefined topics.

In [77]:
########## Define the format in which the output is expected ##############

# Define Schema of Structured Outputs
# ReviewsSchema describes the labels returned for a single review.
class ReviewsSchema(BaseModel):
    # Number that prefixed the review in the user message; used afterwards to recover the
    # original review text.
    Index: int
    # Topic labels assigned to the review. This is an unconstrained list of strings: the
    # predefined topic list is requested by the prompt, not enforced by this schema.
    Topics: list[str]
    # Sentiment label; likewise a free-form string as far as this schema is concerned.
    Sentiment: str

# Top-level response object passed as `response_format`: a single list holding one
# ReviewsSchema entry per analyzed review.
class ListSchema(BaseModel):
    Results: list[ReviewsSchema]

In [81]:
# Call OpenAI
client = OpenAI(api_key=api_key)

# Each review needs exactly one index; otherwise the labels would be attached to the wrong text.
if len(indices_of_reviews) != len(reviews_to_analyze):
    raise ValueError(
        f"Got {len(indices_of_reviews)} indices for {len(reviews_to_analyze)} reviews; "
        "re-run the data cell."
    )

# Prefix every review with its row index so the model can echo it back in `Index`.
# Built from the two lists themselves, so it does not depend on a separately maintained counter,
# and the f-string also copes with reviews that are not plain strings.
numbered_reviews = "\n\n".join(
    f"{review_index}. {review_text}"
    for review_index, review_text in zip(indices_of_reviews, reviews_to_analyze)
)

# NOTE(review): the few-shot examples in the system prompt use topic labels (Service Cost,
# Service Diagnostics, Wheel/Tires, Recall, Staff) that are not in predefined_topics and show
# no sentiment, which contradicts instruction 1. The prompt text is left unchanged here;
# confirm with the prompt owner whether the examples should be re-labelled.
completion = client.beta.chat.completions.parse(
    model=model,
    messages=[
        { "role": "system", "content": f'''Analyze the following reviews and do the following 2 things:
            1. Identify the key topics talked about in the review only out of the following comma separated list: {", ".join(predefined_topics)}. Be as specific as possible. If multiple topics are a match for a review, map them all to the review.
            2. Identify the sentiments expressed in the review from the following comma separated list: {", ".join(allowed_sentiments)}.

            Here are a few examples:
            1. Review: "I went to get an inspection and they told me to fix a bulb for $400. When I did it outside, they did it for $49. They were ripping people off. I waited 3 hours and they still couldn’t figure out what was wrong with the car."
                - Chain of Thought:
                  1. The review mentions feeling ripped off which indicates high Service Cost
                  2. Review also takes about waiting for 3 hours hence Service Time
                  3. It also mentions that they could not figure out whats wrong with thw car hence Service Diagnostics
                - Key topics: Service Cost,Service Diagnostics,Service Time
                
            2. Review: "I found only one screw on my left front wheel after recall by this service center. That mistake totally ruined my wheel and cost hundreds of dollars to fix."
                - Chain of Thought:
                  1. The review mentions only one screw was there,which means overall Service quality was bad
                  2. Customer also mentions a specific part as wheel hence identify Wheel/Tires,also this was a recall service hence labelling recall
                - Key topics: Wheel/Tires, Recall,Service Quality

            3. Review: "Worst customer service ever. Took my F150 for a recall. From the start, she was so negative about the issue, told me it would take months to fix, and that I would have to leave the truck there. She is not even a technician and gave a diagnostic before inspecting the car. I was in disbelief. I suggest taking your Ford to a better service department."
                - Chain of Thought:
                 1. The Customer takes about a service person being negative and expresses disappointment also they have mentioned it was  for a recall
                 2.Customer has also expressed that Staff mentioned it would take months to fix hence a Topic Service Time and also alleges wrong diagnosis hence Service Diagnostics
                - Key topics: Staff ,Recall,Service Time,Service Diagnostics

                
            Return the output as mentioned in the structure. Reviews are numbered.'''},

        {'role': 'user',
            'content': numbered_reviews}

    ],
    temperature=temperature,
    max_tokens=max_tokens,
    response_format=ListSchema
)

# If the model refuses, `parsed` is empty; stop here with the reason instead of failing later
# with an AttributeError when the results are read.
response_message = completion.choices[0].message
if response_message.parsed is None:
    raise RuntimeError(f"Model returned no structured output: {response_message.refusal}")



### View the Output

In [82]:
# Inspect the parsed structured output: a list of ReviewsSchema objects, one per returned result.
completion.choices[0].message.parsed.Results

[ReviewsSchema(Index=1136, Topics=['Behaviour'], Sentiment='Negative'),
 ReviewsSchema(Index=304, Topics=['Brake Issue', 'Service Issue', 'Repair Cost'], Sentiment='Negative'),
 ReviewsSchema(Index=986, Topics=['Service Quality'], Sentiment='Positive'),
 ReviewsSchema(Index=604, Topics=['Service Quality'], Sentiment='Positive'),
 ReviewsSchema(Index=1066, Topics=['Service Issue', 'Service Quality'], Sentiment='Positive'),
 ReviewsSchema(Index=314, Topics=['Service Quality', 'Communication'], Sentiment='Positive'),
 ReviewsSchema(Index=359, Topics=['Service Quality', 'Service Time'], Sentiment='Positive'),
 ReviewsSchema(Index=489, Topics=['Service Issue'], Sentiment='Negative'),
 ReviewsSchema(Index=624, Topics=['Service Quality'], Sentiment='Positive'),
 ReviewsSchema(Index=106, Topics=['Service Quality'], Sentiment='Positive'),
 ReviewsSchema(Index=1018, Topics=['Facilities'], Sentiment='Positive'),
 ReviewsSchema(Index=586, Topics=['Others'], Sentiment='Negative'),
 ReviewsSchema(In

In [83]:
# Map each index that was sent to the model back to the exact review text it was sent with.
# Looking the text up here, rather than by position in `data`, means an index invented by the
# model (out of range or negative) can neither crash the cell nor pull in an unrelated review.
sent_reviews = dict(zip(indices_of_reviews, reviews_to_analyze))

parsed_results = completion.choices[0].message.parsed.Results

# One row per returned result; the review text is None when the model returned an unknown index.
results_list = [
    [result.Index, result.Topics, result.Sentiment, sent_reviews.get(result.Index)]
    for result in parsed_results
]
results_df = pd.DataFrame(data=results_list, columns=['Review Index', 'Topics', 'Sentiment', 'Original Review'])

# Make model mistakes visible instead of silently mislabelling reviews.
unknown_indices = [result.Index for result in parsed_results if result.Index not in sent_reviews]
if unknown_indices:
    print(f"Warning: model returned indices that were not sent: {unknown_indices}")

# Show the last 31 rows (same slice as before), using the notebook's rich table display.
results_df.tail(31)

    Review Index  \
2            986   
3            604   
4           1066   
5            314   
6            359   
7            489   
8            624   
9            106   
10          1018   
11           586   
12          1096   
13           450   
14           391   
15           224   
16           619   
17           270   
18           554   
19            49   
20           707   
21           668   
22           372   
23           870   
24           549   
25            94   
26           687   
27           167   
28           453   
29           819   
30             0   
31             1   
32           147   

                                                            Topics Sentiment  \
2                                                [Service Quality]  Positive   
3                                                [Service Quality]  Positive   
4                                 [Service Issue, Service Quality]  Positive   
5                                 [Serv