##**Install dependencies**

In [9]:
!pip install -q transformers num2words

##**Get pre-trained models**

In [10]:
import torch
from transformers import pipeline
from transformers import AutoProcessor, AutoModelForZeroShotObjectDetection
from transformers import VitsTokenizer, VitsModel, set_seed
from num2words import num2words

# Zero-shot object detection model (OWLv2 checkpoint from the Hugging Face Hub).
checkpoint = "google/owlv2-base-patch16-ensemble"
model = AutoModelForZeroShotObjectDetection.from_pretrained(checkpoint)
processor = AutoProcessor.from_pretrained(checkpoint)

# Keep local copies of the weights and the preprocessing config so they can be
# reloaded later with `from_pretrained("<directory>")`.
# `from_pt` is a `from_pretrained` loading flag, not a documented
# `save_pretrained` option, so it is not passed here.
model.save_pretrained("zero_shot_detection_model")
processor.save_pretrained("zero_shot_detection_processor")

[]

In [11]:
# Process the image

import torch
from PIL import Image

# Tunable inputs for the detection step.
IMAGE_PATH = "/content/coffee-beans-on-a-branch-of-treered-and-green-arabica-coffee-beans-ripening-on-tree-in-coffeee-plantation-2C2GJE8.jpg"
SCORE_THRESHOLD = 0.1  # minimum confidence for a detection to be kept

# One free-text query per class; predicted label ids index into this list.
text_queries = ["red coffee cherry", "green coffee cherry"]

# Open the file in a context manager so the handle is released, and force
# 3-channel RGB so greyscale / CMYK / RGBA files are handled as well.
with Image.open(IMAGE_PATH) as image_file:
    im = image_file.convert("RGB")

inputs = processor(text=text_queries, images=im, return_tensors="pt")

# Inference only: no gradients are needed.
with torch.no_grad():
    outputs = model(**inputs)

# PIL reports (width, height); the post-processor expects (height, width).
# NOTE(review): OWLv2 preprocessing reportedly pads the image to a square, so
# box coordinates may need rescaling against the padded size before they are
# drawn -- confirm against the OWLv2 docs. The per-label counts do not depend
# on this.
target_sizes = torch.tensor([im.size[::-1]])
results = processor.post_process_object_detection(
    outputs, threshold=SCORE_THRESHOLD, target_sizes=target_sizes
)[0]

In [12]:
# Get results: count the detected coffee cherries by colour.
RED_LABEL = "red coffee cherry"
GREEN_LABEL = "green coffee cherry"

scores = results["scores"].tolist()
labels = results["labels"].tolist()
boxes = results["boxes"].tolist()

# Each predicted label is an index into `text_queries`; map it back to text.
label_result = [text_queries[label_id] for label_id in labels]

# Count each colour by its own label instead of treating "anything that is
# not red" as green, so adding further queries later cannot inflate the
# green count.
red_cherry_count = label_result.count(RED_LABEL)
green_cherry_count = label_result.count(GREEN_LABEL)

# Display counts
print("Red coffee cherries count:", red_cherry_count)
print("Green coffee cherries count:", green_cherry_count)

Red coffee cherries count: 108
Green coffee cherries count: 175


In [13]:
# Text-to-speech model: Vietnamese MMS-TTS (VITS) checkpoint.
TTS_CHECKPOINT = "facebook/mms-tts-vie"

tokenizer = VitsTokenizer.from_pretrained(TTS_CHECKPOINT)
model_tts = VitsModel.from_pretrained(TTS_CHECKPOINT)

# Keep local copies for later reuse. `from_pt` is a `from_pretrained` loading
# flag, not a documented `save_pretrained` option, so it is not passed here.
tokenizer.save_pretrained("tokenizer_tts")
model_tts.save_pretrained("model_tts")

# Spell the counts out in Vietnamese words so the sentence contains no digits.
red_count_words = num2words(int(red_cherry_count), lang='vi')
green_count_words = num2words(int(green_cherry_count), lang='vi')
announcement = f"Có {red_count_words} hạt cà phê đỏ và {green_count_words} hạt cà phê xanh"

# Dedicated names keep the detection cell's `inputs` / `outputs` intact.
tts_inputs = tokenizer(text=announcement, return_tensors="pt")

set_seed(555)  # make deterministic

with torch.no_grad():
    tts_outputs = model_tts(**tts_inputs)

# First (and only) item of the batch.
waveform = tts_outputs.waveform[0]

In [15]:
# Sanity check: the green-cherry count spelled out in Vietnamese words.
green_count_in_words = num2words(int(green_cherry_count), lang="vi")
green_count_in_words

'một trăm bảy mươi lăm'

In [17]:
from IPython.display import Audio

# Render an inline audio player for the synthesized speech, using the
# sampling rate stored in the TTS model's config.
Audio(waveform, rate=model_tts.config.sampling_rate)

# Optional: save as an audio file.
# NOTE(review): the rate must come from `model_tts` (the TTS model); in this
# notebook `model` is the object-detection model. `waveform` presumably has to
# be converted to a NumPy array before writing -- confirm against the SciPy docs.
# import scipy.io.wavfile

# scipy.io.wavfile.write("result.wav", rate=model_tts.config.sampling_rate, data=waveform.numpy())

In [18]:
# Get the total number of cherries by extrapolating the per-image counts.
# NOTE: this assumes every tree carries as many cherries as were counted in
# the analysed photo.

# Convert the typed answer to an integer. The original `input(int())` passed
# `int()` (i.e. 0) as the prompt text and left the answer as a string.
number_of_tree = int(input("Enter the number of coffee trees: "))
if number_of_tree < 0:
    raise ValueError("The number of coffee trees must not be negative.")

total_red_cherries = int(red_cherry_count) * number_of_tree
total_green_cherries = int(green_cherry_count) * number_of_tree

print(f"The number of coffee trees is: {number_of_tree}")
print(f"Total of red coffee cherries: {total_red_cherries}")
print(f"Total of green coffee cherries: {total_green_cherries}")

0125
The number of coffee trees is: 125
Total of red coffee cherries: 13500
Total of green coffee cherries: 21875
