In [16]:
from groq import Groq
from dotenv import load_dotenv
import re
import pandas as pd

In [12]:
# Prompt for the synthetic-review generator, assembled from named sections so
# each part of the instructions can be read (and tuned) on its own.

# System role: frames the model as a dataset generator with strict formatting.
system_prompt = " ".join([
    "You are a highly capable assistant that generates synthetic datasets for machine learning tasks.",
    "Your role is to provide given formatted outputs for ease of processing.",
    "Each dataset should align with the user's specifications.",
])

# What to generate and which sentiments/tones to cover.
task_intro = (
    "Generate a synthetic dataset of student reviews about campus events or amenities. "
    "The reviews should reflect diverse sentiments (positive, negative, neutral) and a mix of tones. "
    "Ensure that each review includes the following fields: \n\n"
)

# The two fields every record must carry.
field_specs = (
    "- `review_text`: The actual review text.\n"
    "- `sentiment`: The sentiment of the review (values: 'positive', 'negative', or 'neutral').\n"
)

# One record of the expected output layout; shown twice inside a fenced block
# so the model sees that records simply repeat back to back.
record_template = "review_text: str\nsentiment: str\n"
format_example = (
    "Output the dataset in the following format:\n\n"
    + "```\n"
    + record_template * 2
    + "```\n"
)

# Requested volume and class balance.
closing_request = (
    "Generate at least 100 reviews with a balanced distribution of positive, negative, and neutral sentiments."
)

messages = [
    {"role": "system", "content": system_prompt},
    {"role": "user", "content": task_intro + field_specs + format_example + closing_request},
]


In [13]:
# Display the assembled chat payload to verify the prompt text before sending it.
messages

[{'role': 'system',
  'content': "You are a highly capable assistant that generates synthetic datasets for machine learning tasks. Your role is to provide given formatted outputs for ease of processing. Each dataset should align with the user's specifications."},
 {'role': 'user',
  'content': "Generate a synthetic dataset of student reviews about campus events or amenities. The reviews should reflect diverse sentiments (positive, negative, neutral) and a mix of tones. Ensure that each review includes the following fields: \n\n- `review_text`: The actual review text.\n- `sentiment`: The sentiment of the review (values: 'positive', 'negative', or 'neutral').\nOutput the dataset in the following format:\n\n```\nreview_text: str\nsentiment: str\nreview_text: str\nsentiment: str\n```\nGenerate at least 100 reviews with a balanced distribution of positive, negative, and neutral sentiments."}]

In [24]:
# Read variables from a local .env file into the process environment.
# Groq() is constructed without an explicit key, so it presumably picks the
# API key up from that environment -- NOTE(review): confirm the variable name.
load_dotenv()

client = Groq()

# Request a streamed chat completion for the dataset-generation prompt.
completion = client.chat.completions.create(
    model="llama-3.3-70b-versatile",
    messages=messages,
    temperature=1,  # high temperature for varied review wording
    # The prompt asks for at least 100 reviews. The previous 1024-token budget
    # cut the response off mid-sentence after only a handful of records, so
    # the budget is raised to leave room for the full dataset.
    max_tokens=8192,
    top_p=1,
    stream=True,  # the next cell iterates over the response chunk by chunk
    stop=None,
)


In [25]:
# Drain the streamed completion: echo each text fragment as it arrives and
# keep it so the full response can be reassembled afterwards.
chunks = []
for chunk in completion:
    # A chunk's delta content may be None; treat that as an empty string.
    fragment = chunk.choices[0].delta.content or ""
    chunks.append(fragment)
    print(fragment, end="")

review_text: I absolutely loved the recent concert on campus, the energy was electric and the performers were amazing!
sentiment: positive
review_text: The new gym is nice, but it's always crowded and the equipment is often broken.
sentiment: negative
review_text: The campus library is a good place to study, it's quiet and has a lot of resources.
sentiment: neutral
review_text: I was really disappointed with the food options at the last campus event, they were limited and overpriced.
sentiment: negative
review_text: The student union building is a great place to hang out and relax, it has a nice atmosphere and comfortable seating.
sentiment: positive
review_text: The campus Wi-Fi is okay, it's not the fastest but it gets the job done.
sentiment: neutral
review_text: I had a great time at the recent comedy show on campus, the comedian was hilarious and the audience was engaged.
sentiment: positive
review_text: The parking on campus is a nightmare, it's always full and the permits are ex

In [26]:
# Reassemble the streamed fragments into the full LLM response text.
data = ''.join(chunks)


# Step 1: Use regex to extract (review_text, sentiment) pairs.
# - `(?:(?!review_text:).)+?` lets a review span several lines (re.DOTALL) but
#   never run past the start of the next record. Without this guard, a record
#   whose `sentiment:` line is missing or truncated would swallow the following
#   record and produce one corrupted row.
# - `\b` after the label rejects longer words that merely start with a label.
# Records with no recognisable sentiment (e.g. a truncated tail) are skipped.
pattern = r"review_text:\s*((?:(?!review_text:).)+?)\s*sentiment:\s*(positive|negative|neutral)\b"
matches = re.findall(pattern, data, re.DOTALL)

# Step 2: Create a DataFrame with one row per extracted review.
df = pd.DataFrame(matches, columns=["review_text", "sentiment"])

# Display the DataFrame
df


Unnamed: 0,review_text,sentiment
0,I absolutely loved the recent concert on campu...,positive
1,"The new gym is nice, but it's always crowded a...",negative
2,"The campus library is a good place to study, i...",neutral
3,I was really disappointed with the food option...,negative
4,The student union building is a great place to...,positive
5,"The campus Wi-Fi is okay, it's not the fastest...",neutral
6,I had a great time at the recent comedy show o...,positive
7,"The parking on campus is a nightmare, it's alw...",negative
8,"The campus counseling services are helpful, bu...",neutral
9,"I love the new coffee shop on campus, it has a...",positive


In [27]:
# Write the parsed reviews to disk; index=False leaves the row index out of the file.
df.to_csv(path_or_buf='student_review.csv', index=False)