In [7]:
import os
import json
import random
import string

from huggingface_hub import InferenceClient
from dotenv import load_dotenv
from datasets import Dataset

from piidd.data_generation.utils import (
    relations,
    bios,
    first_names,
    inspirational_people,
    prompts,
    random_string,
)

# Load environment variables from the .env file at the relative path ../../.env
# (presumably the source of HF_TOKEN — confirm against the project layout).
load_dotenv("../../.env")
# Hugging Face inference client authenticated with HF_TOKEN; the lookup raises
# KeyError if the variable is not set. generate_essay reads this module-level `client`.
client = InferenceClient(token=os.environ["HF_TOKEN"])

In [10]:
# Size of the first-name pool that make_guidelines samples from.
len(first_names)

5132

In [9]:
def make_guidelines():
    """Build a randomized "# Guidelines" section for an essay-generation prompt.

    Starts from a fixed set of guidelines and, with the probabilities set
    below (both currently 1.0, i.e. always), adds:

    * a request to mention one or two people, identified by a relation drawn
      from `relations` and first name(s) drawn from `first_names`;
    * a request to mention a person drawn from `inspirational_people`.

    Returns:
        dict with three string values:
            "guidelines": the header line "# Guidelines" followed by one
                "- "-prefixed line per guideline.
            "names": the sampled first name(s) joined with "|"
                ("" when no relation guideline was added).
            "famous_person": the sampled inspirational person
                ("" when that guideline was not added).
    """
    guidelines = [
        "Write in 1st person",
        "Do not list references",
        "Mention your personal website in the middle of the essay and use <<URL>> as a placeholder.",
        "Length: 500 words",
        "Do not include a preamble",
    ]

    # Probability (0.0-1.0) of asking the essay to mention a friend, colleague,
    # family member, mentor, coworker, or teammate.
    relation_prob = 1.0

    names = []
    if random.random() < relation_prob:
        relation = random.choice(relations)

        # 50% of the time, mention two people instead of one.
        if random.random() < 0.5:
            names = random.sample(first_names, 2)
            # Naive pluralization; assumes every relation takes a plain "s".
            relation += "s"
        else:
            names = [random.choice(first_names)]

        name_str = " and ".join(names)

        guidelines.append(f"Mention somewhere in the essay how your {relation} {name_str} helped you")

    # Probability (0.0-1.0) of asking the essay to mention an inspirational person.
    famous_person_prob = 1.0

    famous_person = ""
    if random.random() < famous_person_prob:
        famous_person = random.choice(inspirational_people)

        guidelines.append(f"Mention somewhere in the essay how {famous_person} inspired you")

    # Every guideline, including the first, gets a "- " bullet. Joining alone
    # would leave the first item without one.
    guidelines_text = "# Guidelines\n- " + "\n- ".join(guidelines)

    return {
        "guidelines": guidelines_text,
        "names": "|".join(names),
        "famous_person": famous_person,
    }

In [10]:
def generate_essay(example):
    """Generate one essay with the hosted Mixtral model (shaped for `Dataset.map`).

    Args:
        example: the row passed in by the caller; its contents are not read.

    Returns:
        dict with:
            "essay": the generated text, or "<|Error|>" if the request or
                the decoding of its response raised.
            "prompt": the essay prompt drawn from `prompts`.
            "bio": the persona instruction (fixed lead-in plus the drawn bio).
            plus every key returned by make_guidelines().
    """
    guideline_info = make_guidelines()

    persona = random.choice(bios["bio"])
    bio = "\n\nWrite the essay as a person with the following description:\n" + persona

    p = random.choice(prompts)

    # Mixtral instruct format: the instruction sits between [INST] and [/INST].
    full_prompt = "".join(
        (
            "<s> [INST] ",
            p,
            "\n",
            bio,
            "\n",
            guideline_info["guidelines"],
            " [/INST]\nEssay:\n",
        )
    )

    sampling = {
        "max_new_tokens": 3000,
        "top_k": 50,
        "temperature": 1.0,
        "return_full_text": False,
    }
    payload = {
        "inputs": full_prompt,
        "parameters": sampling,
        "options": {"use_cache": False},
    }

    try:
        raw = client.post(json=payload, model="mistralai/Mixtral-8x7B-Instruct-v0.1")
        essay = json.loads(raw.decode())[0]["generated_text"]
    except Exception as err:
        # Best effort: log the failure and mark the row instead of aborting the whole map.
        print(err)
        essay = "<|Error|>"

    record = {"essay": essay, "prompt": p, "bio": bio}
    record.update(guideline_info)
    return record

In [11]:
# Generate five shards (part numbers 10-14) of 200 essays each and write one
# parquet file per shard.
for i in range(10, 15):
    # Seed dataset with a single "id" column of 200 random strings
    # (presumably 8 characters each — confirm in random_string).
    ds = Dataset.from_dict({"id": [random_string(8) for _ in range(200)]})
    # generate_essay is applied to every row using 16 worker processes; rows
    # whose generation failed carry the "<|Error|>" marker in "essay".
    ds1 = ds.map(generate_essay, num_proc=16)
    ds1.to_parquet(f"mixtral-200-<<URL>>-part{i}.pq")

Map (num_proc=16):   0%|          | 0/200 [00:00<?, ? examples/s]

TimeoutError: 

In [None]:
# Spot-check one randomly chosen row of the most recently generated shard.
ds1.shuffle()[0]

{'id': 'fyupjkef',
 'essay': "\nAs a Credit Analyst, I frequently face the challenge of evaluating a potential client's creditworthiness and determining their risk exposure. This task requires sifting through substantial financial data, assessing trends, and making recommendations based on my findings. Recently, I encountered a particularly challenging case where a potential client had a complex financial history, with numerous assets and liabilities spread across various sectors. Existing approaches, such as analyzing balance sheets and income statements, proved insufficient in providing a comprehensive understanding of the client's financial health.\n\nTo tackle this challenge, I turned to the design tool of visualization, specifically creating a detailed financial map to represent the client's financial landscape. I chose visualization because it allows for a more intuitive understanding of complex data, making it easier to identify patterns and connections that may be missed throug

In [None]:
# NOTE(review): `ds2` is not defined in any cell visible in this part of the notebook;
# this cell fails on a fresh kernel unless `ds2` is created elsewhere — confirm.
ds2.to_parquet("mixtral-v1-100")

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

847532

In [1]:
from piidd.data_generation.add_info import add_info_to_mixtral
from pathlib import Path
import os
from datasets import Dataset

# Project root taken from the PROJECT_HOME_DIR environment variable
# (raises KeyError if it is unset).
home_dir = Path(os.environ["PROJECT_HOME_DIR"])

# Load the essays from parquet. This rebinds `ds`, which the generation loop
# earlier in the notebook also assigns.
ds = Dataset.from_parquet(str(home_dir / "data/essays/mixtral-v1a.pq"))
ds

Loaded .env file!


Dataset({
    features: ['id', 'essay', 'prompt', 'bio', 'guidelines', 'names', 'famous_person'],
    num_rows: 2665
})

In [2]:
# Run add_info_to_mixtral over every essay row, collecting the results in a list.
# NOTE(review): if `url_pattern` is applied as a regex, `.*` is greedy and would match
# from the first "<<" to the last ">>" on a line — confirm in add_info_to_mixtral
# whether "<<.*?>>" is what is wanted.
x = [add_info_to_mixtral(example, url_pattern = "<<.*>>") for example in ds]

In [None]:
import re
# Pick a single processed example to inspect.
y = x[1906]

# The phone number stored alongside the essay for this example.
value = y["phone_number"]

e = y['essay']

# Find every literal occurrence of the phone number in the essay text.
# re.escape keeps characters such as "+" from being read as regex operators.
list(re.finditer(re.escape(value), e))

[<re.Match object; span=(21, 36), match='+1-691-969-0864'>,
 <re.Match object; span=(1928, 1943), match='+1-691-969-0864'>,
 <re.Match object; span=(4136, 4151), match='+1-691-969-0864'>]

In [None]:
# Inspect the third processed example (all fields plus the essay).
x[2]

{'name': 'Athanase Keil',
 'address': '675 Ricky Expressway Apt. 921\nLake Jason, GA 37487',
 'username': 'behnammatt',
 'email': 'aroldedavey@uwm.edu',
 'id_num': '58,8,16,68',
 'phone_number': '722-833-9015',
 'social': 'https://www.twitter.com/estelle-mayer',
 'personal_url': 'https://norlisha-deleon.github.io/about-hobby',
 'essay': 'Athanase Keil, Pin No: 58,8,16,68 | telephone 722-833-9015; media https://www.twitter.com/estelle-mayer\n\n\nAs a journalist, I have always been committed to unearthing the truth and presenting it in a fair and balanced manner. Over the years, I have encountered many complex problems that require more than just standard reporting techniques. In this essay, I will discuss how I applied a specific methodology to tackle one such problem and the outcomes that ensued.\n\nThe Problem:\n\nIn my community, there was a growing concern about the rising number of homeless individuals and families. The problem was complex due to various factors such as unemploymen

In [3]:
from piidd.data_generation.add_info import to_tokens

# Tokenize every processed example.
tokenized = [to_tokens(record) for record in x]
# Tag each tokenized example with its position in the list as its "document" id.
tokenized = [
    {**doc, "document": doc_id}
    for doc_id, doc in enumerate(tokenized)
]

In [None]:
# Show the tokenized example(s) whose "document" id is 4.
print([doc for doc in tokenized if doc["document"] == 4])

[{'tokens': [' ', 'name', 'Eunice', 'Kelleher', '-', 'https://www.linkedin.com/luvio_kyle', '\n', 'elianas', '-', '885.810.0378', '\n', '3046', 'Stein', 'Lock', 'Suite', '715', '\n', 'Grayburgh', ',', 'MN', '74279', '\n ', 'e', '-', 'mail', ':', 'abercrombie57@uky.edu', ',', 'NUM', '.', ':', 'JN7YIAiCYcN', '\n\n\n', 'As', 'a', 'Marine', 'Sales', 'Representative', ',', 'I', "'ve", 'always', 'taken', 'pride', 'in', 'my', 'ability', 'to', 'build', 'and', 'maintain', 'strong', 'relationships', 'with', 'clients', '.', 'However', ',', 'I', 'faced', 'a', 'significant', 'challenge', 'when', 'I', 'noticed', 'that', 'a', 'long', '-', 'time', 'client', 'was', 'struggling', 'to', 'remain', 'competitive', 'in', 'the', 'market', '.', 'Their', 'existing', 'marine', 'products', 'were', 'outdated', ',', 'and', 'they', 'needed', 'a', 'solution', 'that', 'would', 'help', 'them', 'modernize', 'their', 'fleet', 'while', 'staying', 'within', 'their', 'budget', '.', '\n\n', 'Existing', 'approaches', 'and', '

In [None]:
1846

In [4]:
import json

# Write the tokenized documents to disk. The `with` block guarantees the file
# is flushed and closed even if serialization fails part-way; passing a bare
# open(...) to json.dump left the handle open for the life of the kernel.
with open("mixtral-v1a.json", "w") as f:
    json.dump(tokenized, f)

# Mistral Medium

In [None]:
from mistralai.client import MistralClient
from mistralai.models.chat_completion import ChatMessage

# API key read from the MISTRAL_KEY environment variable (raises KeyError if unset).
api_key = os.environ["MISTRAL_KEY"]
model = "mistral-medium"

# NOTE(review): this rebinds the module-level `client` that generate_essay reads
# (there it is a Hugging Face InferenceClient and is called via `client.post`).
# Re-running generate_essay after this cell would call the Mistral client instead.
client = MistralClient(api_key=api_key)

text = """Write an essay (in 1st person perspective) with the below prompt as the following person: 
 The E-commerce Specialist in the retail industry is a highly analytical and detail-oriented individual who excels in driving online sales and optimizing the digital customer experience. With a natural curiosity and a passion for experimentation, they are constantly seeking new approaches to increase e-commerce KPIs, employing data-driven strategies to improve conversion rates, and staying ahead of emerging trends in the rapidly evolving world of online retail. They possess exceptional problem-solving skills and are able to effectively communicate and collaborate with cross-functional teams, ultimately ensuring the e-commerce channel contributes significantly to the overall success of the retail business.

**Essay Prompt: Applying a Specific Tool or Approach to Address a Complex Challenge**

**Objective:** 
Write an essay that details your experience of applying a specific tool or approach to address a complex challenge. This essay should not only narrate the process but also critically analyze the effectiveness of the chosen tool or approach, reflecting on its strengths and potential limitations.

**Instructions:**

1. **Introduction - Identifying the Challenge:**
   - Begin by clearly defining a complex challenge you have encountered. This could be a personal, professional, academic, or societal issue.
   - Explain why this challenge is significant and complex, providing context to understand its intricacies.

2. **Selection of the Tool or Approach:**
   - Describe the specific tool or approach you chose to address the challenge. Explain why you selected this particular method.
   - Consider factors like its relevance to the challenge, its theoretical underpinnings, or its proven effectiveness in similar situations.
   - Reflect on whether there were any alternatives considered and why they were discarded in favor of the chosen tool.

3. **Application of the Tool or Approach:**
   - Detail how you applied the tool or approach to the challenge. Include specific steps, processes, or strategies used.
   - Discuss any adaptations or customizations made to the tool to suit the specific context of your challenge.
   - Provide examples or anecdotes to illustrate how the tool was used in practical scenarios.

4. **Analysis and Insight:**
   - Critically evaluate the effectiveness of the tool or approach in addressing the challenge. Discuss both its successes and limitations.
   - Reflect on any unexpected outcomes or lessons learned during the application process.
   - Consider how the use of the tool influenced the outcome of the challenge. Was it as effective as anticipated? Why or why not?

5. **Conclusion - Future Applications:**
   - Summarize the key insights gained from using this tool or approach.
   - Discuss how these insights can inform future applications of the same or similar tools in addressing other challenges.
   - Optionally, propose any modifications or improvements to the tool or approach that could enhance its effectiveness based on your experience.

**Criteria for Evaluation:**
- Clarity in defining the challenge and its complexity.
- Depth of explanation for the selection of the tool or approach.
- Detailed description of the application process.
- Critical analysis and reflection on the effectiveness of the tool or approach.
- Insightful conclusions and forward-looking perspectives.

**Length:** Your essay should be between 1000-1500 words.

**Guidelines:**
- Include a reference to a personal url in the middle of the essay and use [URL] as a placeholder
- Mention a famous person's name that was a source of inspiration
- Do not list references
- If you have a personal story, use [PERSONAL_NAME] instead of a real name.
- Write in 1st person
- Mention a conversation with your friend Nicholas somewhere in the essay"""

# Single-turn conversation: the whole prompt goes in as one user message.
messages = [ChatMessage(role="user", content=text)]

# One blocking (non-streaming) request.
chat_response = client.chat(messages=messages, model=model)

# Show the text of the first returned choice.
print(chat_response.choices[0].message.content)

As an E-commerce Specialist in the retail industry, I am always on the lookout for innovative tools and approaches to enhance our digital customer experience and drive online sales. One complex challenge that I encountered was the high cart abandonment rate on our website. Customers were adding products to their carts but not completing their purchases, leading to a significant loss in potential revenue.

The cart abandonment issue is a common problem in e-commerce, with studies suggesting that the average rate can be as high as 70%. In our case, we were experiencing an abandonment rate of 65%, which was significantly impacting our conversion rates and overall sales performance. This challenge was particularly complex due to its multifaceted nature; it could be attributed to various factors such as poor website design, lack of trust, unexpected costs, or complicated checkout processes.

After conducting extensive research and analyzing our customer behavior data, I decided to employ a 