# Data Creation


In [4]:
verbs_by_crime = {
    "Violent_Crimes": [
        'Murdered', 'Assaulted', 'Raped', 'Robbed', 'Kidnapped',
        'Beat', 'Stabbed', 'Shot', 'Strangled', 'Abused',
        'Threatened', 'Attacked', 'Mugged', 'Harmed', 'Injured'
        'Harassed', 'Stalked','Menaced', 'Abducted', 'Kidnapped',
        'Slashed', 'Choked'
    ],
    "Property_Crimes": [
        'Mugged', 'Robbed', 'Burglarized', 'Stolen', 'Vandalized', 
        'Trespassed', 'Arsoned','Damaged', 'Graffitied', 'Embezzled',
        'Shoplifted', 'Sabotaged','Looted', 'Counterfeited', 'Pocketed',
        'Stripped', 'Intruded'
    ],
    "Financial_Crimes": [
        'Defrauded', 'Laundered', 'Scammed', 'Swindled', 'Falsified',
        'Concealed', 'Misappropriated', 'Extorted', 'Manipulated', 'Bribed',
        'Racketeered', 'Evaded', 'Cheated', 'Forged', 'Misused'
    ],
    "Cybercrimes": [
        'Hacked', 'Phished', 'Stole data', 'Cyberbullied', 'Spoofed',
        'Breached', 'Doxxed', 'Distributed malware', 'Hijacked accounts', 'Ransomware attack',
        'Cyberstalking', 'Distributed denial-of-service (DDoS) attack', 'Identity theft',
        'Cyber fraud', 'Password cracking'
    ],
    "Drug_Crimes": [
        'Possessed', 'Trafficked', 'Manufactured', 'Smuggled', 'Sold',
        'Cultivated', 'Distributed', 'Synthesized', 'Abused', 'Peddled',
        'Dealt', 'Transported', 'Prescribed illegally', 'Falsified prescriptions', 'Produced illicit substances'
    ],
    "White-Collar_Crimes": [
        'Committed fraud', 'Engaged in insider trading', 'Embezzled funds',
        'Bribed officials', 'Manipulated financial records', 'Conducted money laundering',
        'Engaged in corporate espionage', 'Defrauded investors', 'Orchestrated Ponzi schemes',
        'Engaged in tax evasion', 'Falsified documents', 'Misused funds',
        'Misrepresented financial statements', 'Engaged in kickbacks', 'Violated antitrust laws'
    ],
    "Sexual_Crimes": [
        'Raped', 'Assaulted sexually', 'Harassed', 'Molested', 'Exploited',
        'Fondled', 'Groped', 'Exhibited indecent exposure', 'Engaged in non-consensual acts', 'Coerced',
        'Victimized', 'Voyeurism', 'Produced child pornography', 'Solicited prostitution', 'Sextortion'
    ],
    "Public_Order_Crimes": [
        'Intoxicated publicly', 'Disturbed the peace', 'Engaged in disorderly conduct',
        'Loitered', 'Urinated in public', 'Engaged in public drunkenness', 'Panhandled', 'Engaged in public nudity',
        'Engaged in public lewdness', 'Engaged in street racing', 'Created public disturbances', 'Organized illegal gatherings',
        'Begged', 'Engaged in public gambling', 'Solicited in public'
    ],
    "Traffic_and_Motor_Vehicle_Crimes": [
        'Speeded', 'Drove under the influence (DUI)', 'Reckless driving', 'Hit and run', 'Drove without a license',
        'Texted while driving', 'Street racing', 'Violated traffic laws', 'Failed to yield', 'Ran red lights',
        'Distracted driving', 'Improper lane change', 'Driving with expired registration', 'Driving with suspended license',
        'Carjacked'
    ]
}


In [8]:
import openai
import os
import json

with open('api_keys.json', 'r', encoding="utf-8") as f:
    api_keys = json.load(f)

openai.api_key = api_keys['OPENAI_API_KEY']

In [9]:
def LLM_inference(prompt):
  response = openai.Completion.create(
    model="text-davinci-003",
    prompt=prompt,
    max_tokens=2000,
  )
  return response

In [10]:
def create_prompt_per_verb(crime, contexts: list[int])-> dict[str:list]:
    """
    Creates a prompts for given crime
    """
    if not isinstance(contexts, list):
        return {"ERROR list not provided for the number_of_sentences parameter, please provided int"}
    generated_content = []
    for context in contexts:
        crime = crime.casefold()
        prompt_response = f'''
        You are a NER recoginition crime identifying assisstant. 

        Main Task: You will be given a verb describing a crime, place it into context. The context is {context} sentences that would be found in a police report in a chorological order. The verb should remain unchanged. 

        Example of exspected output:

        Given Verb: assaulted
        # Start of Context
        On June 20, 2023, at approximately 9:00 PM, the victim, identified as John Doe, reported being assaulted by an unknown assailant outside a local convenience store. The victim stated that the assailant approached him from behind, striking him on the head with a blunt object before fleeing the scene.
        Responding officers arrived at the scene within minutes of the report and observed the victim displaying visible signs of injury, including a laceration on the back of his head. Officers immediately called for medical assistance and provided initial first aid until paramedics arrived to transport the victim to the hospital for further evaluation and treatment.
        Crime scene investigators were dispatched to the location and conducted a thorough examination of the area, collecting potential evidence such as surveillance footage, witness statements, and the victim's personal belongings. The investigation into the assault is ongoing, with efforts focused on identifying the assailant and establishing a motive for the attack.
        # End of Context

        Please generate another example for the crime: {crime}
        '''
        generated_content.append({"context": context,
                                  "prompt":  prompt_response})
    
    generated_prompts = {
        "crime_verb" : crime,
        "prompts" : generated_content
    }
    return generated_prompts

In [11]:
create_prompt_per_verb('Trafficed', [1,2,3])

{'crime_verb': 'trafficed',
 'prompts': [{'context': 1,
   'prompt': "\n        You are a NER recoginition crime identifying assisstant. \n\n        Main Task: You will be given a verb describing a crime, place it into context. The context is 1 sentences that would be found in a police report in a chorological order. The verb should remain unchanged. \n\n        Example of exspected output:\n\n        Given Verb: assaulted\n        # Start of Context\n        On June 20, 2023, at approximately 9:00 PM, the victim, identified as John Doe, reported being assaulted by an unknown assailant outside a local convenience store. The victim stated that the assailant approached him from behind, striking him on the head with a blunt object before fleeing the scene.\n        Responding officers arrived at the scene within minutes of the report and observed the victim displaying visible signs of injury, including a laceration on the back of his head. Officers immediately called for medical assistanc

In [12]:
# TODO Move to config
LENGTH_OF_CONTEXT = [3,5,8]
from tqdm import tqdm

def create_all_prompts(verbs_by_crime):
    total_prompts = []
    for category, verbs in verbs_by_crime.items():
        for verb in verbs:
            prompts_per_verb = create_prompt_per_verb(verb, LENGTH_OF_CONTEXT)
            total_prompts.append({category : prompts_per_verb})
    return total_prompts
    

def create_training_data(prompts):
    for prompt in tqdm(prompts):
        for category, verb in prompt.items():
            for context in verb['prompts']:
                # TODO BETTER RETRY, COSTS MONEY, MULTIPROCESSING
                # TODO Write to disk after n responses
                # context['response'] = "TESTING"
                response = LLM_inference(context['prompt'])
                context['response'] = response["choices"][0]['text']
    return prompts


def orchestrate_data_creation(verbs_by_crime):
    prompts = create_all_prompts(verbs_by_crime)
    prompts_responses = create_training_data(prompts)
    return prompts_responses

training_data = orchestrate_data_creation(verbs_by_crime)


100%|██████████| 143/143 [37:34<00:00, 15.76s/it]


In [13]:
# Move Output file to config
import json

json_object = json.dumps(training_data, indent = 4) 
with open("training_v1.json", "w") as outfile:
    outfile.write(json_object)

Now that we have the prompt, we need to put it into an acceptable format for spacy

Having the text followed by the entities:

```
{'text': "While bismuth compounds (Pepto-Bismol) decreased the number of bowel movements in those with travelers' diarrhea, they do not decrease the length of illness.//",
 'entities': [(360, 371, 'MEDICINE'),
  (383, 408, 'MEDICINE')]}
  ```

In [14]:
with open('training_v1.json', 'r') as f:
  training_data = json.load(f)

In [56]:
import re

def reformat_record(category: str, verb: str, prompt_reposnse_text: str):
    match_iterator = re.finditer(verb, prompt_reposnse_text, re.IGNORECASE)
    verb_offsets = {category: [match.span() for match in match_iterator]}
    training_data = {
        'text' : prompt_reposnse_text.strip(),
        'entities' : verb_offsets if verb_offsets[category] else {}
    }
    return training_data

In [57]:
def find_verbs_not_found(training_data):
    formatted_training_data = []
    for prompts in training_data:
        for category, verbs in prompts.items():
                for context in verbs['prompts']:
                    formatted_context = reformat_record(category, 
                                                    verbs['crime_verb'], 
                                                    context['response'])
                    formatted_training_data.append(formatted_context)

    return formatted_training_data
    

spacy_training = find_verbs_not_found(training_data)

In [58]:
spacy_training[0:16]

[{'text': "On June 20, 2023, at approximately 9:00 PM, the victim, identified as John Doe, reported being murdered by an unknown assailant at a nearby park. The victim's family, who had been searching for him since his disappearance 24 hours prior, were the first to arrive and discover his body lying motionless on the ground.\n        Responding officers arrived soon after and quickly determined that the victim had died as a result of a gunshot wound to the chest. Immediate medical attention could not be provided as the victim had passed away at the scene. Crime scene investigators were dispatched to the location and conducted a thorough examination of the area, collecting potential evidence such as spent shell casings and traces of blood.\n        The investigation into the murder is ongoing, with efforts focused on identifying the assailant and establishing a motive for the attack. Officers are also working to determine when and where the victim had encountered the assailant prior to

In [59]:
record_without_entities = 0
record_with_entities = 0

for record in spacy_training:
    if record['entities']:
        record_with_entities += 1
    else:
        record_without_entities +=1 
        # print('NOT FOUND')


print("PROPERLY FORMATTED")
print(record_with_entities)
print("IMPROPERLY FORMATTED")
print(record_without_entities)

PROPERLY FORMATTED
299
IMPROPERLY FORMATTED
130


In [26]:
json_object = json.dumps(spacy_training, indent = 4) 
with open("spacy_prompt_training_v1.json", "w") as outfile:
    outfile.write(json_object)

In [60]:
json_object = json.dumps({"examples": spacy_training}, indent=4)
with open("spacy_prompt_training_v1_2.json", "w") as outfile:
    outfile.write(json_object)

In [23]:
import pickle

f = open('spacy_training.pkl', 'wb')
pickle.dump(spacy_training, f)
f.close()