## Loading the data

In [8]:
import pandas as pd
df_ = pd.read_csv("../input/tata-capital-case-study/train_40k.csv")

In [9]:
df_.head()

Unnamed: 0,productId,Title,userId,Helpfulness,Score,Time,Text,Cat1,Cat2,Cat3
0,B000E46LYG,Golden Valley Natural Buffalo Jerky,A3MQDNGHDJU4MK,0/0,3,-1,The description and photo on this product need...,grocery gourmet food,meat poultry,jerky
1,B000GRA6N8,Westing Game,unknown,0/0,5,860630400,This was a great book!!!! It is well thought t...,toys games,games,unknown
2,B000GRA6N8,Westing Game,unknown,0/0,5,883008000,"I am a first year teacher, teaching 5th grade....",toys games,games,unknown
3,B000GRA6N8,Westing Game,unknown,0/0,5,897696000,I got the book at my bookfair at school lookin...,toys games,games,unknown
4,B00000DMDQ,I SPY A is For Jigsaw Puzzle 63pc,unknown,4-Feb,5,911865600,Hi! I'm Martine Redman and I created this puzz...,toys games,puzzles,jigsaw puzzles


In [10]:
cols_use = ['Text','Cat1','Cat2','Cat3']
df_ = df_[cols_use]

In [11]:
df_.head()

Unnamed: 0,Text,Cat1,Cat2,Cat3
0,The description and photo on this product need...,grocery gourmet food,meat poultry,jerky
1,This was a great book!!!! It is well thought t...,toys games,games,unknown
2,"I am a first year teacher, teaching 5th grade....",toys games,games,unknown
3,I got the book at my bookfair at school lookin...,toys games,games,unknown
4,Hi! I'm Martine Redman and I created this puzz...,toys games,puzzles,jigsaw puzzles


## Data Analysis

In [17]:
## Checking overall dataframe and unique classes
df_.describe()

Unnamed: 0,Text,Cat1,Cat2,Cat3
count,40000,40000,40000,40000
unique,39488,6,64,464
top,Length:,toys games,personal care,unknown
freq,28,10266,2852,2262


In [12]:
## Evaluating Cat1 and its 6 unique classes
df_.Cat1.value_counts()

Cat1
toys games              10266
health personal care     9772
beauty                   5846
baby products            5637
pet supplies             4862
grocery gourmet food     3617
Name: count, dtype: int64

In [14]:
## We can select one of the high-level Category Cat1 and look into sub-Category Cat2 (For toys games)
df_[df_.Cat1 == "toys games"].Cat2.value_counts()

Cat2
games                      1525
electronics for kids       1073
action toy figures         1066
baby toddler toys          1041
dolls accessories           922
stuffed animals plush       624
sports outdoor play         575
dress up pretend play       528
learning education          462
vehicles remote control     456
building toys               412
arts crafts                 361
tricycles                   299
novelty gag toys            299
puzzles                     273
hobbies                     210
grown up toys               140
Name: count, dtype: int64

In [20]:
## We can select one of the high-level Category of Cat2 and look into sub-Category of Cat3 (For games)
df_[df_.Cat2 == "games"].Cat3.value_counts()

Cat3
board games                    924
card games                     205
unknown                        119
trading card games              98
battling tops                   44
dvd games                       40
standard playing card decks     29
tile games                      24
game accessories                 9
stacking games                   8
dice gaming dice                 8
floor games                      6
game room games                  5
travel games                     4
game collections                 1
handheld games                   1
Name: count, dtype: int64

In [15]:
## There are few Category which are unknown or blank which we might need to take care in prompt
df_.Cat3.value_counts()

Cat3
unknown                    2262
shaving hair removal       1565
vitamins supplements       1315
board games                 924
styling tools               850
                           ... 
stuffing                      1
breadsticks                   1
eggs                          1
chocolate covered fruit       1
spices gifts                  1
Name: count, Length: 464, dtype: int64

## Evaluating Category 1

### Prompting technique for category 1 using open AI GPT 3.5 model

In [87]:
instruction = "You are an expert classifier.You will be provided with customer reviwe data.Classify query into Categories mentioned below.Only respond with category name."
Categories = """
- toys games
- health personal care
- beauty
- baby products
- pet supplies
- grocery gourmet food
"""
task_specific_prompt = "output only category name and do not provide reason or explanation.Select only from the Categories mentioned above."

In [88]:
## Preparing prompt template
prompt_template = """{instruction}

# Categories
{Categories}

{task_specific_prompt}
# Text input
Text:{user_input}
Category:
"""

In [89]:
import openai
openai.api_key = < open AI key>

## Function to format prompt
def format_prompt(instruction,Categories,task_specific_prompt,user_input):
    
    formatted_prompt = prompt_template.format(instruction = instruction,
                                              Categories=Categories,
                                              task_specific_prompt=task_specific_prompt,
                                              user_input=user_input)
    
    return formatted_prompt

## Run GPT 3.5 with temp=0 
def get_response_with_context(final_prompt):
    
    COMPLETIONS_MODEL = "gpt-3.5-turbo"
    
    
    result = openai.chat.completions.create(model=COMPLETIONS_MODEL,
                                          messages=[{"role": "user", "content": final_prompt}],
                                          temperature=0, 
                                          max_tokens=10,
                                          top_p=1,
                                          frequency_penalty=0.2,
                                          presence_penalty=0.6,
                                          stop=None
                                          )

    response = result.choices[0].message.content
    
    return response
 
## Invoke functions to get final answer
def get_answer(instruction,Categories,task_specific_prompt,user_input):
    
    refine_prompt = format_prompt(instruction,Categories,task_specific_prompt,user_input)
    answer = get_response_with_context(refine_prompt)
    
    return answer

In [90]:
# Lets try a input
user_input = """The description and photo on this product needs to be changed to indicate this product is the BuffalOs version of this beef jerky."""
get_answer(instruction,Categories,task_specific_prompt,user_input)

'grocery gourmet food'

## Evaluate Category 1 Classifier with few diverse examples

In [66]:
## Lets sample 10 examples from each class
df_sampled_cat1 = df_[['Text','Cat1']].groupby('Cat1').apply(lambda x: x.sample(10)).reset_index(drop=True)

  df_sampled_cat1 = df_[['Text','Cat1']].groupby('Cat1').apply(lambda x: x.sample(10)).reset_index(drop=True)


In [91]:
## Run though GPT3.5 to get predictions
predicted_category_l = []
for _,row in df_sampled_cat1.iterrows():
    user_input = row.Text
    predicted_category = get_answer(instruction,Categories,task_specific_prompt,user_input)
    print(_)
    predicted_category_l.append(predicted_category)

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59


In [92]:
df_sampled_cat1['Predicted_category'] = predicted_category_l

In [97]:
## We can look into text where model misclassified reviews
df_sampled_cat1[df_sampled_cat1.Cat1 != df_sampled_cat1.Predicted_category]

Unnamed: 0,Text,Cat1,Predicted_category
5,Best to buy this in your current weight range ...,baby products,health personal care
9,This is our son's favorite toy. There is a mot...,baby products,toys games
10,"These things are good in theory, but you need ...",beauty,home products
12,True waste of money. Just burns your skin its ...,beauty,health personal care
15,i got these for travel. With all the restricti...,beauty,health personal care
23,Some of the reviewers eat these for weight con...,grocery gourmet food,health personal care
34,This thing is great. Our vent was not totally ...,health personal care,home products
36,"I really like these gloves, but finding them o...",health personal care,grocery gourmet food
37,My experience corroborates what earlier review...,health personal care,toys games
38,replacement / back up part for remote extender...,health personal care,toys games


In [102]:
df_sampled_cat1.iloc[59].Text

'I too was looking for a toy for my 9 month old to learn to sooth himself back to sleep. The glow worm was my first choice but had to return it because the music it too loud. For play during the day the volume is fine but our problem was - when my little guy fell a sleep with it and would roll over and/or bump it accidentally, the music would go off. Being that it was SO LOUD it would wake him & arouse him. He is the youngest of 3 boys so he is used to noise. We also keep an air purifier in his room that creates plenty of white noise and this was still a problem. This toy was suppose help sooth him back to sleep. Instead it would wake him up. I don\'t know who the people are who make these toys (obviously not parents of young sleeping children) but you\'d think common sense would tell you that if you make a "Sleeping toy" - it really needs a volume botton!! As all babies, people & situations are different. This toy also desperately needs a switch where you can turn off the music but st

In [107]:
# Lets check accuracy 

(1-(df_sampled_cat1[df_sampled_cat1.Cat1 != df_sampled_cat1.Predicted_category].shape[0]/df_sampled_cat1.shape[0]))*100

81.66666666666667

In [109]:
confusion_matrix = pd.crosstab(df_sampled_cat1['Cat1'], df_sampled_cat1['Predicted_category'], rownames=['Actual'], colnames=['Predicted'])

In [110]:
## Confusion matrix for Category 1 to understand where we can improve the model
confusion_matrix

Predicted,baby products,beauty,grocery gourmet food,health personal care,home products,pet supplies,toys games
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
baby products,8,0,0,1,0,0,1
beauty,0,7,0,2,1,0,0
grocery gourmet food,0,0,9,1,0,0,0
health personal care,0,0,1,6,1,0,2
pet supplies,0,0,0,0,0,10,0
toys games,1,0,0,0,0,0,9


## Category 2 Classifier with few diverse examples for pet supplies

In [112]:
## Lets look into unique classes in pet supplies
df_[df_.Cat1 == "pet supplies"].Cat2.value_counts()

Cat2
dogs                    2610
cats                    1779
fish aquatic pets        294
birds                     99
bunny rabbit central      51
small animals             29
Name: count, dtype: int64

In [115]:
## Lets sample random 10 samples from each of the sub-class
pet_supplies = df_[df_.Cat1 == "pet supplies"].copy(deep=True)
df_sampled_cat2 = pet_supplies[['Text','Cat2']].groupby('Cat2').apply(lambda x: x.sample(10)).reset_index(drop=True)

  df_sampled_cat2 = pet_supplies[['Text','Cat2']].groupby('Cat2').apply(lambda x: x.sample(10)).reset_index(drop=True)


In [117]:
df_sampled_cat2.Cat2.value_counts()

Cat2
birds                   10
bunny rabbit central    10
cats                    10
dogs                    10
fish aquatic pets       10
small animals           10
Name: count, dtype: int64

In [118]:
## Writing custom instructions and categories for this class
instruction = "You are an expert in pet supplies.You will be provided with customer reviwe data of pet supplies product.Classify query into Categories mentioned below.Only respond with category name."
Categories = """
- birds
- bunny rabbit central
- cats
- dogs
- fish aquatic pets
- small animals
"""

In [122]:
predicted_category_l = []
for _,row in df_sampled_cat2.iterrows():
    user_input = row.Text
    predicted_category = get_answer(instruction,Categories,task_specific_prompt,user_input)
    print(_)
    predicted_category_l.append(predicted_category)

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59


In [123]:
df_sampled_cat2['Predicted_category'] = predicted_category_l

In [124]:
df_sampled_cat2

Unnamed: 0,Text,Cat2,Predicted_category
0,We got this for our Quaker. It took him a whil...,birds,birds
1,Thank you for fast shipment and great product....,birds,birds
2,Like us when our pets eat healthy they have fe...,birds,dogs
3,My parrots love to jump and hang upside down o...,birds,birds
4,I bought one of these at petsmat for my jenday...,birds,birds
5,It looked bigger and beatiful. When I received...,birds,birds
6,My 25 year old Amazon has converted to Harriso...,birds,birds
7,This is a beautiful cage. It is quite large (i...,birds,birds
8,"My cockatoo has her cage in my sewing room, wh...",birds,birds
9,We usually hang the canary cage from the ceili...,birds,birds


In [125]:
df_sampled_cat2[df_sampled_cat2.Cat2 != df_sampled_cat2.Predicted_category]

Unnamed: 0,Text,Cat2,Predicted_category
2,Like us when our pets eat healthy they have fe...,birds,dogs
13,I was very happy happy with purchase. I will k...,bunny rabbit central,cats
14,I have 2 chinchillas and they always want to p...,bunny rabbit central,small animals
15,I bought this product for my pygmy hedgehog an...,bunny rabbit central,small animals
16,I have three Guinea Pigs that are extremely hy...,bunny rabbit central,small animals
17,I'm about to get my third one of these. The me...,bunny rabbit central,small animals
18,The bowl is incredibly cute and can withstand ...,bunny rabbit central,small animals
23,Great for thick fur with heavy down coat. We h...,cats,dogs
25,Needs better cap.Just can't get it tight enoug...,cats,fish aquatic pets
26,We got the large round metal tags for our two ...,cats,dogs


In [126]:
# Accuracy 
(1-(df_sampled_cat2[df_sampled_cat2.Cat2 != df_sampled_cat2.Predicted_category].shape[0]/df_sampled_cat2.shape[0]))*100

73.33333333333334

In [127]:
## Confusion matrix for Category 1 to understand where we can improve the model
confusion_matrix = pd.crosstab(df_sampled_cat2['Cat2'], df_sampled_cat2['Predicted_category'], rownames=['Actual'], colnames=['Predicted'])
confusion_matrix

Predicted,birds,bunny rabbit central,cats,dogs,fish aquatic pets,small animals
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
birds,9,0,0,1,0,0
bunny rabbit central,0,4,1,0,0,5
cats,0,0,5,4,1,0
dogs,0,0,1,9,0,0
fish aquatic pets,1,0,0,0,9,0
small animals,0,1,0,1,0,8
