Merge pull request #457 from gilcu3/vision-support

Vision models support
n3d1117 · Dec 10, 2023 · 05a7b5b · 05a7b5b
2 parents 4e16ad8 + 237705c
commit 05a7b5b
Show file tree

Hide file tree

Showing 9 changed files with 540 additions and 17 deletions.
diff --git a/.env.example b/.env.example
@@ -17,21 +17,25 @@ ALLOWED_TELEGRAM_USER_IDS=USER_ID_1,USER_ID_2
 # TOKEN_PRICE=0.002
 # IMAGE_PRICES=0.016,0.018,0.02
 # TRANSCRIPTION_PRICE=0.006
+# VISION_TOKEN_PRICE=0.01
 # ENABLE_QUOTING=true
 # ENABLE_IMAGE_GENERATION=true
 # ENABLE_TTS_GENERATION=true
 # ENABLE_TRANSCRIPTION=true
+# ENABLE_VISION=true
 # PROXY=http://localhost:8080
 # OPENAI_MODEL=gpt-3.5-turbo
 # OPENAI_BASE_URL=https://example.com/v1/
 # ASSISTANT_PROMPT="You are a helpful assistant."
 # SHOW_USAGE=false
 # STREAM=true
 # MAX_TOKENS=1200
+# VISION_MAX_TOKENS=300
 # MAX_HISTORY_SIZE=15
 # MAX_CONVERSATION_AGE_MINUTES=180
 # VOICE_REPLY_WITH_TRANSCRIPT_ONLY=true
 # VOICE_REPLY_PROMPTS="Hi bot;Hey bot;Hi chat;Hey chat"
+# VISION_PROMPT="What is in this image"
 # N_CHOICES=1
 # TEMPERATURE=1.0
 # PRESENCE_PENALTY=0.0
@@ -41,9 +45,13 @@ ALLOWED_TELEGRAM_USER_IDS=USER_ID_1,USER_ID_2
 # IMAGE_STYLE=natural
 # IMAGE_SIZE=1024x1024
 # IMAGE_FORMAT=document
+# VISION_DETAIL="low"
 # GROUP_TRIGGER_KEYWORD=""
 # IGNORE_GROUP_TRANSCRIPTIONS=true
+# IGNORE_GROUP_VISION=true
 # TTS_MODEL="tts-1"
 # TTS_VOICE="alloy"
 # TTS_PRICES=0.015,0.030
-# BOT_LANGUAGE=en
+# BOT_LANGUAGE=en
+# ENABLE_VISION_FOLLOW_UP_QUESTIONS="true"
+# VISION_MODEL="gpt-4-vision-preview"
diff --git a/README.md b/README.md
@@ -75,6 +75,7 @@ The following parameters are optional and can be set in the `.env` file:
 | `TOKEN_PRICE`         | $-price per 1000 tokens used to compute cost information in usage statistics. Source: https://openai.com/pricing                                                                                                                                                                                                                                                                          | `0.002`            |
 | `IMAGE_PRICES`        | A comma-separated list with 3 elements of prices for the different image sizes: `256x256`, `512x512` and `1024x1024`. Source: https://openai.com/pricing                                                                                                                                                                                                                                  | `0.016,0.018,0.02` |
 | `TRANSCRIPTION_PRICE` | USD-price for one minute of audio transcription. Source: https://openai.com/pricing                                                                                                                                                                                                                                                                                                       | `0.006`            |
+| `VISION_TOKEN_PRICE` | USD-price per 1K tokens of image interpretation. Source: https://openai.com/pricing                                                                                                                                                                                                                                                                                                       | `0.01`            |
 | `TTS_PRICES`          | A comma-separated list with prices for the tts models: `tts-1`, `tts-1-hd`. Source: https://openai.com/pricing                                                                                                                                                                                                                                                                            | `0.015,0.030`      |
 
 Check out the [Budget Manual](https://github.com/n3d1117/chatgpt-telegram-bot/discussions/184) for possible budget configurations.
@@ -86,17 +87,22 @@ Check out the [Budget Manual](https://github.com/n3d1117/chatgpt-telegram-bot/di
 | `ENABLE_IMAGE_GENERATION`          | Whether to enable image generation via the `/image` command                                                                                                                                                                                                                       | `true`                             |
 | `ENABLE_TRANSCRIPTION`             | Whether to enable transcriptions of audio and video messages                                                                                                                                                                                                                      | `true`                             |
 | `ENABLE_TTS_GENERATION`            | Whether to enable text to speech generation via the `/tts`                                                                                                                                                                                                                        | `true`                             |
+| `ENABLE_VISION`             | Whether to enable vision capabilities in supported models                                                                                                                                                                                                          | `true`                              |
 | `PROXY`                            | Proxy to be used for OpenAI and Telegram bot (e.g. `http://localhost:8080`)                                                                                                                                                                                                       | -                                  |
 | `OPENAI_MODEL`                     | The OpenAI model to use for generating responses. You can find all available models [here](https://platform.openai.com/docs/models/)                                                                                                                                              | `gpt-3.5-turbo`                    |
 | `OPENAI_BASE_URL`                  | Endpoint URL for unofficial OpenAI-compatible APIs (e.g., LocalAI or text-generation-webui)                                                                                                                                                                                       | Default OpenAI API URL             |
 | `ASSISTANT_PROMPT`                 | A system message that sets the tone and controls the behavior of the assistant                                                                                                                                                                                                    | `You are a helpful assistant.`     |
 | `SHOW_USAGE`                       | Whether to show OpenAI token usage information after each response                                                                                                                                                                                                                | `false`                            |
 | `STREAM`                           | Whether to stream responses. **Note**: incompatible, if enabled, with `N_CHOICES` higher than 1                                                                                                                                                                                   | `true`                             |
 | `MAX_TOKENS`                       | Upper bound on how many tokens the ChatGPT API will return                                                                                                                                                                                                                        | `1200` for GPT-3, `2400` for GPT-4 |
+| `VISION_MAX_TOKENS`                       | Upper bound on how many tokens vision models will return                                                                                                                                                                                                            | `300` for gpt-4-vision-preview  |
+| `VISION_MODEL`                     | The Vision to Speech model to use. Allowed values: `gpt-4-vision-preview`                                                                                                                                                                                                         | `gpt-4-vision-preview`             |
+| `ENABLE_VISION_FOLLOW_UP_QUESTIONS`                     | If true, once you send an image to the bot, it uses the configured VISION_MODEL until the conversation ends. Otherwise, it uses the OPENAI_MODEL to follow the conversation. Allowed values: `true` or `false`                                                                                                                                                                                                         | `true`             |
 | `MAX_HISTORY_SIZE`                 | Max number of messages to keep in memory, after which the conversation will be summarised to avoid excessive token usage                                                                                                                                                          | `15`                               |
 | `MAX_CONVERSATION_AGE_MINUTES`     | Maximum number of minutes a conversation should live since the last message, after which the conversation will be reset                                                                                                                                                           | `180`                              |
 | `VOICE_REPLY_WITH_TRANSCRIPT_ONLY` | Whether to answer to voice messages with the transcript only or with a ChatGPT response of the transcript                                                                                                                                                                         | `false`                            |
 | `VOICE_REPLY_PROMPTS`              | A semicolon separated list of phrases (i.e. `Hi bot;Hello chat`). If the transcript starts with any of them, it will be treated as a prompt even if `VOICE_REPLY_WITH_TRANSCRIPT_ONLY` is set to `true`                                                                           | -                                  |
+| `VISION_PROMPT`              | A phrase (e.g. `What is in this image`) that the vision models use as the prompt to interpret a given image. If the image sent to the bot has a caption, the caption supersedes this parameter                                                         | `What is in this image`                                   |
 | `N_CHOICES`                        | Number of answers to generate for each input message. **Note**: setting this to a number higher than 1 will not work properly if `STREAM` is enabled                                                                                                                              | `1`                                |
 | `TEMPERATURE`                      | Number between 0 and 2. Higher values will make the output more random                                                                                                                                                                                                            | `1.0`                              |
 | `PRESENCE_PENALTY`                 | Number between -2.0 and 2.0. Positive values penalize new tokens based on whether they appear in the text so far                                                                                                                                                                  | `0.0`                              |
@@ -106,8 +112,10 @@ Check out the [Budget Manual](https://github.com/n3d1117/chatgpt-telegram-bot/di
 | `IMAGE_QUALITY`                    | Quality of DALL·E images, only available for `dall-e-3`-model. Possible options: `standard` or `hd`, beware of [pricing differences](https://openai.com/pricing#image-models).                                                                                                    | `standard`                         |
 | `IMAGE_STYLE`                      | Style for DALL·E image generation, only available for `dall-e-3`-model. Possible options: `vivid` or `natural`. Check available styles [here](https://platform.openai.com/docs/api-reference/images/create).                                                                      | `vivid`                            |
 | `IMAGE_SIZE`                       | The DALL·E generated image size. Must be `256x256`, `512x512`, or `1024x1024` for dall-e-2. Must be `1024x1024` for dall-e-3 models.                                                                                                                                              | `512x512`                          |
+| `VISION_DETAIL`                       | The detail parameter for vision models, explained [Vision Guide](https://platform.openai.com/docs/guides/vision). Allowed values: `low` or `high`                                                                                                                                                                                  | `auto`                           |
 | `GROUP_TRIGGER_KEYWORD`            | If set, the bot in group chats will only respond to messages that start with this keyword                                                                                                                                                                                         | -                                  |
 | `IGNORE_GROUP_TRANSCRIPTIONS`      | If set to true, the bot will not process transcriptions in group chats                                                                                                                                                                                                            | `true`                             |
+| `IGNORE_GROUP_VISION`      | If set to true, the bot will not process vision queries in group chats                                                                                                                                                                                                | `true`                              |
 | `BOT_LANGUAGE`                     | Language of general bot messages. Currently available: `en`, `de`, `ru`, `tr`, `it`, `fi`, `es`, `id`, `nl`, `zh-cn`, `zh-tw`, `vi`, `fa`, `pt-br`, `uk`, `ms`, `uz`.  [Contribute with additional translations](https://github.com/n3d1117/chatgpt-telegram-bot/discussions/219) | `en`                               |
 | `WHISPER_PROMPT`                   | To improve the accuracy of Whisper's transcription service, especially for specific names or terms, you can set up a custom message.  [Speech to text - Prompting](https://platform.openai.com/docs/guides/speech-to-text/prompting)                                              | `-`                                |
 | `TTS_VOICE`                        | The Text to Speech voice to use. Allowed values: `alloy`, `echo`, `fable`, `onyx`, `nova`, or `shimmer`                                                                                                                                                                           | `alloy`                            |

diff --git a/bot/main.py b/bot/main.py
@@ -53,6 +53,11 @@ def main():
         'bot_language': os.environ.get('BOT_LANGUAGE', 'en'),
         'show_plugins_used': os.environ.get('SHOW_PLUGINS_USED', 'false').lower() == 'true',
         'whisper_prompt': os.environ.get('WHISPER_PROMPT', ''),
+        'vision_model': os.environ.get('VISION_MODEL', 'gpt-4-vision-preview'),
+        'enable_vision_follow_up_questions': os.environ.get('ENABLE_VISION_FOLLOW_UP_QUESTIONS', 'true').lower() == 'true',
+        'vision_prompt': os.environ.get('VISION_PROMPT', 'What is in this image'),
+        'vision_detail': os.environ.get('VISION_DETAIL', 'auto'),
+        'vision_max_tokens': int(os.environ.get('VISION_MAX_TOKENS', '300')),
         'tts_model': os.environ.get('TTS_MODEL', 'tts-1'),
         'tts_voice': os.environ.get('TTS_VOICE', 'alloy'),
     }
@@ -75,6 +80,7 @@ def main():
         'enable_quoting': os.environ.get('ENABLE_QUOTING', 'true').lower() == 'true',
         'enable_image_generation': os.environ.get('ENABLE_IMAGE_GENERATION', 'true').lower() == 'true',
         'enable_transcription': os.environ.get('ENABLE_TRANSCRIPTION', 'true').lower() == 'true',
+        'enable_vision': os.environ.get('ENABLE_VISION', 'true').lower() == 'true',
         'enable_tts_generation': os.environ.get('ENABLE_TTS_GENERATION', 'true').lower() == 'true',
         'budget_period': os.environ.get('BUDGET_PERIOD', 'monthly').lower(),
         'user_budgets': os.environ.get('USER_BUDGETS', os.environ.get('MONTHLY_USER_BUDGETS', '*')),
@@ -84,9 +90,11 @@ def main():
         'voice_reply_transcript': os.environ.get('VOICE_REPLY_WITH_TRANSCRIPT_ONLY', 'false').lower() == 'true',
         'voice_reply_prompts': os.environ.get('VOICE_REPLY_PROMPTS', '').split(';'),
         'ignore_group_transcriptions': os.environ.get('IGNORE_GROUP_TRANSCRIPTIONS', 'true').lower() == 'true',
+        'ignore_group_vision': os.environ.get('IGNORE_GROUP_VISION', 'true').lower() == 'true',
         'group_trigger_keyword': os.environ.get('GROUP_TRIGGER_KEYWORD', ''),
         'token_price': float(os.environ.get('TOKEN_PRICE', 0.002)),
         'image_prices': [float(i) for i in os.environ.get('IMAGE_PRICES', "0.016,0.018,0.02").split(",")],
+        'vision_token_price': float(os.environ.get('VISION_TOKEN_PRICE', '0.01')),
         'image_receive_mode': os.environ.get('IMAGE_FORMAT', "photo"),
         'tts_model': os.environ.get('TTS_MODEL', 'tts-1'),
         'tts_prices': [float(i) for i in os.environ.get('TTS_PRICES', "0.015,0.030").split(",")],