# Text Classification Dataset Generation with minimal configuration
-

In [1]:
LABELS = [
    {
        'name': 'vebose',
        'description': "Verbose in text is often includes some redundancy, filler words, excessive qualifiers, unnecessary adjectives or adverbs, irrelevant information, and repetition of known context."
    },
    {
        'name': 'concise',
        'description': "Concise text is characterized by clarity, precision, and brevity. It communicates ideas directly, in a very compact manner using only the words necessary to convey the intended message."
    }
]

In [2]:
LANGUAGES = {
    "fr": "French",
    "es": "Spanish",
    "en": "English"
}

In [5]:
!pip install instructor datasets --quiet

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/480.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━[0m [32m399.4/480.6 kB[0m [31m12.8 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m179.3/179.3 kB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.8/194.8 kB[0m [31m13.2 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source 

In [4]:
import google.generativeai as genai
from google.colab import userdata
from pydantic import BaseModel
import instructor
import random
import uuid
import time
import os

In [29]:
MODEL_ID = 'gemini-1.5-flash'

In [30]:
# Setup your API in Colab Secrets and read it here. Pass it to genai to interact with Gemini.
GOOGLE_API_KEY = userdata.get('GOOGLE_API_KEY')
OPENAI_API_KEY = userdata.get('OPENAI_API_KEY')
ANTHROPIC_API_KEY = userdata.get('ANTHROPIC_API_KEY')
genai.configure(api_key=GOOGLE_API_KEY)

In [31]:
# let's instantiate a model and try it
model = genai.GenerativeModel(model_name=MODEL_ID)

In [32]:
labels_listing = [label['name'] for label in LABELS]
# labels_listing = ', '.join(labels_listing)

In [33]:
# GENERATED_PROMPT_TEMPLATE = """I am creating a dataset to train a classifier that distinguishes verbose texts from concise texts. Your task is to generate verbose, realistic, and professional texts. Each example should provide rich detail and context without exaggeration or redundancy. Avoid flowery language and unnecessary embellishments while ensuring the tone remains professional and informative.

# Generate {number} examples of verbose texts. Each example should:

# Be a realistic, coherent response to an imagined query, scenario, or description.
# Include detailed explanations, background information, and illustrative examples.
# Use longer sentence structures and provide additional context where necessary.
# For example:

# A detailed explanation of a process or concept.
# A comprehensive description of an event or scenario.
# A professional email or message containing all relevant details.
# Etc.

# Avoid repeating ideas across examples and ensure each one is unique and comprehensive."""

In [34]:
NUM_SAMPLES = 10

In [35]:
PROMPT_TEMPLATE = """I need text examples in order to train a machine learning model to classify between the following classes {labels_listing}. \
Your task is to generate {num_samples} texts which are diverse and representative of what could be encountered \
for the '{label_name}' class . {label_description}. Do not exagerate, and ensure that while it belongs to the described class, it is realistic."""

In [36]:
NUM_SAMPLES = 10

PROMPTS = [PROMPT_TEMPLATE.format(
    num_samples=NUM_SAMPLES,
    labels_listing=labels_listing,
    label_name=LABEL['name'],
    label_description=LABEL['description']) for LABEL in LABELS]

In [37]:
PROMPTS[0]

"I need text examples in order to train a machine learning model to classify between the following classes ['vebose', 'concise']. Your task is to generate 10 texts which are diverse and representative of what could be encountered for the 'vebose' class . Verbose in text is often includes some redundancy, filler words, excessive qualifiers, unnecessary adjectives or adverbs, irrelevant information, and repetition of known context.. Do not exagerate, and ensure that while it belongs to the described class, it is realistic."

In [38]:
response = model.generate_content(PROMPTS[0])

In [39]:
print(response.candidates[0].content.parts[0].text)

Here are 10 examples of verbose text, aiming for realism and avoiding exaggeration:

1.  "The meeting, which was scheduled for Tuesday morning, at approximately 9:00 AM,  was, in my opinion, quite unproductive.  There were, frankly, a lot of people there, and, well,  many of them seemed to have a lot of, shall we say,  unnecessary things to say, which, you know,  took up a considerable amount of time."

2.  "So, basically, what happened was,  the project, it was a really big project, you know, a huge undertaking, and it involved, like, tons of different people,  and a lot of them had different opinions,  and so,  it was kind of a long and drawn-out process, to put it mildly,  before we finally got it completed."

3.  "The report, which I've painstakingly prepared,  details the findings of our investigation, which, as you may recall, was quite extensive. We looked at many different aspects, several of them quite in detail, and the overall conclusion is that, it seems to me, further rese

In [71]:
response = model.generate_content(GENERATED_PROMPT_TEMPLATE.format(number=10))

In [73]:
print(response.candidates[0].content.parts[0].text)

1. **Subject: Comprehensive Analysis of Q3 Sales Performance**

This report details the performance of the sales department during the third quarter of the fiscal year, encompassing a comprehensive analysis of key performance indicators (KPIs) and contributing factors.  Sales figures fell short of projected targets by 7.2%, primarily attributable to a slowdown in the Western region, which experienced a 12% decrease compared to Q2.  This decline correlates with the nationwide economic downturn observed in August and September, impacting consumer spending habits and delaying large-scale procurement decisions within key client segments.  Detailed regional breakdowns are available in Appendix A, alongside a comparative analysis of marketing campaign effectiveness during this period.  We recommend implementing the revised sales strategy outlined in Section 3, focusing on targeted outreach to key accounts and the development of new product offerings tailored to the current economic climate. 

In [40]:
CONCISE_GENERATED_PROMPT_TEMPLATE = """I am creating a dataset to train a classifier that distinguishes concise texts from verbose texts. Your task is to generate concise, clear, and professional texts. Each example should communicate the essential information directly, avoiding unnecessary elaboration or redundant details. The tone should remain professional and precise.

Generate {number} examples of concise texts. Each example should:

Be a realistic, coherent response to an imagined query, scenario, or description.
Focus only on the key points and avoid adding extra context, explanations, or non-essential details.
Use short, clear sentence structures to convey the message efficiently. Although short, don't cut the meet. Conciseness mean precise and specific in our case.
For example:

A brief explanation of a process or concept.
A high-level summary of an event or scenario.
A concise email or message stating only the critical information.
Avoid repeating ideas across examples and ensure each one is unique and succinct."""

In [41]:
response = model.generate_content(CONCISE_GENERATED_PROMPT_TEMPLATE.format(number=10))
print(response.candidates[0].content.parts[0].text)

1. **Project Status:**  Deployment delayed.  New deadline: October 27th.  Addressing critical bug.

2. **Meeting Summary:**  Budget approved. Marketing plan finalized. Next steps:  Launch campaign.

3. **Sales Report Q3:** Revenue exceeded projections.  Increased market share.  Strong customer acquisition.

4. **System Failure:** Power surge caused server outage.  System restored.  Data integrity confirmed.

5. **Candidate Feedback:** Strong technical skills. Lacks experience in project management.  Further interview recommended.

6. **Travel Itinerary:** Flight departs 8 AM.  Hotel reservation confirmed.  Meeting scheduled 2 PM.

7. **Order Confirmation:**  Order #12345 processed.  Shipping confirmation follows.  Expected delivery: 3-5 business days.

8. **Incident Report:**  Employee injury.  Minor laceration.  First aid administered.  OSHA report filed.

9. **Proposal Summary:**  Solution addresses key challenges.  Cost-effective implementation.  Detailed proposal attached.

10. **R

In [42]:
from pydantic import BaseModel, Field

class TextEntries(BaseModel):
    entries: list[str] = Field(
        ...,
        description="List of texts"
    )

# we can access the JSON Schema
TextEntries.model_json_schema()

{'properties': {'entries': {'description': 'List of texts',
   'items': {'type': 'string'},
   'title': 'Entries',
   'type': 'array'}},
 'required': ['entries'],
 'title': 'TextEntries',
 'type': 'object'}

In [43]:
google_client = genai.GenerativeModel(
    model_name=MODEL_ID)

client = instructor.from_gemini(
    client=google_client,
    mode=instructor.Mode.GEMINI_JSON,
)

In [44]:
messages = [
    {
        "role": "system",
        "content": "You are a helpful AI Assistant"
    },
    {
        "role": "user",
        "content": prompt_generation
    }
]

messages

NameError: name 'prompt_generation' is not defined

In [45]:
response = client.messages.create(
    messages=messages,
    response_model=TextEntries
)

NameError: name 'messages' is not defined

In [68]:
for entry in response.entries:
    print(entry, '\n\n')

The quick brown rabbit, which was incredibly fast and agile, jumped over the lazy fox, which was, to put it mildly, quite lazy and unmotivated.  It was a truly remarkable feat of athleticism, considering the, uh, well, the fox's lack of exertion. 


In the realm of culinary arts, the preparation of a simple omelet, while seemingly straightforward, can actually involve a multitude of nuanced steps and considerations.  Firstly, you have to carefully select your eggs, ensuring they're the freshest, most superior eggs available.  Then, the cooking process itself, oh my goodness, requires very precise temperature control and, of course, deft wrist action. And finally, you might even add some extra ingredients. For example, you could add cheese or herbs. 


It is my considered opinion, based on a long period of careful observation and exhaustive study, that the current economic climate presents numerous challenges and also some unique and exciting opportunities.  However, it's also important