In [4]:
import ast
from collections import defaultdict
from datetime import datetime
import gzip
import json
import math
import os
import random
import re
import pickle

import pandas as pd
import numpy as np

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib import ticker
sns.set(style='ticks', font_scale=1.2)

In [5]:
def sort_by_mean(df, by, column, rot=0):
    """Return the per-group mean of ``column``, ordered from highest to lowest.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding both the grouping key(s) and the value column.
    by : label or list of labels
        Passed straight to ``DataFrame.groupby``.
    column : label
        Column whose values are averaged within each group.
    rot : int, optional
        Not used by the body; kept so existing calls keep working.

    Returns
    -------
    pandas.Series
        Group means indexed by group key, in descending order.
    """
    # One column per group. Rows are aligned on the original index, so cells
    # that do not belong to a group are NaN and are skipped by ``mean``.
    per_group = {}
    for group_key, group_rows in df.groupby(by):
        per_group[group_key] = group_rows[column]
    wide = pd.DataFrame(per_group)
    return wide.mean().sort_values(ascending=False)

In [6]:
# Root directory that every input/output path in this notebook is built from
# by string concatenation (e.g. output_directory_path + '/sampled_data.csv').
output_directory_path = ''  # INSERT YOUR OUTPUT DIRECTORY PATH HERE

In [7]:
import openai
# Read the key from the environment so the secret is never saved inside the
# notebook (or its outputs); export OPENAI_API_KEY before starting the kernel.
# Falls back to an empty string, matching the previous placeholder value.
openai.api_key = os.environ.get('OPENAI_API_KEY', '')

<br><br><br><br> 

## **Load data**

In [162]:
# Read the sampled conversations; the cell's output is the number of rows.
_sampled_data_path = f'{output_directory_path}/sampled_data.csv'
data_df = pd.read_csv(_sampled_data_path)
data_df.shape[0]

5000

In [163]:
# Display the column labels of the loaded frame.
data_df.columns

Index(['Unnamed: 0', 'conversation_hash', 'model', 'timestamp', 'conversation',
       'turn', 'language', 'openai_moderation', 'detoxify_moderation', 'toxic',
       'redacted', 'state', 'country', 'hashed_ip', 'header',
       'conversation_simplified'],
      dtype='object')

In [165]:
# Lookup table: conversation hash -> simplified conversation text.
# If a hash occurs on more than one row, the last row wins.
id_conversation_dict = dict(zip(data_df['conversation_hash'], data_df['conversation_simplified']))

<br><br><br><br>

## **Predict tasks**

### Test query

### Run over the whole dataset

In [1]:
# Ask GPT-4 which tasks the user requests in each conversation and write one CSV
# per chunk of the data. Chunks whose CSV already exists are skipped, so the cell
# can be re-run to resume after an interruption without repeating API calls.
TASK_PREDICTION_DIR = output_directory_path + '/user-task-prediction'
TASK_QUERY_MAX_CHARS = 5000  # conversations are truncated to this many characters

# Create the output directory up front: otherwise the first to_csv fails only
# after a whole chunk of API calls has been made, and those results are lost.
os.makedirs(TASK_PREDICTION_DIR, exist_ok=True)

# The instructions are identical for every conversation, so build them once.
# The conversation text goes between prefix and suffix, wrapped in double quotes.
TASK_PROMPT_PREFIX = """Read the following conversation between a user and an AI chatbot. Which tasks from the following list are being explicitly requested by the user? For each task, list the task, your confidence, and your reasoning and evidence. 

Example:
[{"task": "summarization", "confidence": "high confidence", "reasoning_and_evidence": "the user asks for a summary of a text"}, 
 {"task": "explanation", "confidence": "medium confidence", "reasoning_and_evidence": "the user asks for a description of how the methods works and the chatbot replies with a description"}]

Tasks: 
- summarization
- model jailbreaking (e.g. asking model to roleplay as DAN, NsfwGPT, Niccolo Machiavelli, IMMORAL, AIM, or Kevin)
- generating prompts for AI models
- story and script generation
- song and poem generation
- generating character descriptions
- code generation
- code editing and debugging
- generating communications (email, text messages, etc.)
- generating non-fictional documents (resumes, essays, etc.)
- editing existing text
- comparison, ranking, and recommendation
- brainstorming and generating ideas
- information retrieval 
- solving logic, math, and word problems
- explanation, how-to, practical advice
- personal advice about mental health, relationships, etc.
- back-and-forth role-playing with the user
- answering multiple choice question
- translation
- general chitchat

Conversation:
\""""

TASK_PROMPT_SUFFIX = """\"

Answer:"""

df_list = np.array_split(data_df, 100)

for j, _df in enumerate(df_list):

    _chunk_path = TASK_PREDICTION_DIR + '/user_tasks.gpt4.' + str(j) + '.csv'

    if not os.path.exists(_chunk_path):

        # Progress log: timestamp and index of the chunk being processed.
        print(datetime.now(), '\t', j)

        _output_dicts = []

        for i, r in _df.iterrows():

            # Slicing is a no-op for conversations already within the limit.
            _wildchat_query = r['conversation_simplified'][:TASK_QUERY_MAX_CHARS]

            _prompt = TASK_PROMPT_PREFIX + _wildchat_query + TASK_PROMPT_SUFFIX

            _response = openai.ChatCompletion.create(model="gpt-4",
                                                    messages=[{"role": "user", "content": _prompt}])
            _answer = _response['choices'][0]['message']['content']

            # Keep the full response next to the extracted text; the untruncated
            # conversation is stored, not the truncated query.
            _output_dicts.append({'conversation_hash': r['conversation_hash'],
                                  'conversation_simplified': r['conversation_simplified'],
                                  'response': _response,
                                  'answer': _answer})

        _output_df = pd.DataFrame(_output_dicts)
        _output_df.to_csv(_chunk_path)

### Load results and format

In [211]:
# Load the per-chunk task predictions.
# Only files named exactly user_tasks.gpt4.<chunk index>.csv are read, and they
# are read in chunk order: os.listdir returns names in arbitrary order, which
# would make the row order of the concatenated frame differ between runs.
_task_dir = output_directory_path + '/user-task-prediction'
_task_chunk_files = {}
for _file_name in os.listdir(_task_dir):
    _match = re.fullmatch(r'user_tasks\.gpt4\.(\d+)\.csv', _file_name)
    if _match:
        _task_chunk_files[int(_match.group(1))] = _file_name

df_list = [pd.read_csv(_task_dir + '/' + _task_chunk_files[_j]) for _j in sorted(_task_chunk_files)]
len(df_list)

100

In [212]:
# Stack the per-chunk frames row-wise; the cell's output is the total row count.
predicted_tasks_df = pd.concat(df_list, axis=0)
predicted_tasks_df.shape[0]

5000

In [1]:
def _repair_task_answer(raw_answer):
    """Massage a GPT-4 task answer into text that ``json.loads`` is more likely to accept.

    The model is asked for a JSON list of objects but sometimes returns one
    object per line, omits the enclosing brackets, leaves a trailing comma, or
    uses single quotes. These are best-effort textual fixes, not a parser.

    Parameters
    ----------
    raw_answer : str
        The answer text as stored in the predictions CSV.

    Returns
    -------
    str
        The repaired text (it may still be invalid JSON).

    Raises
    ------
    AttributeError
        If ``raw_answer`` is not a string (e.g. NaN for an empty CSV cell).
    """
    answer = raw_answer.strip()

    # Drop a dangling comma at the end (str.strip also removes leading commas).
    if answer.endswith(','):
        answer = answer.strip(',')

    # Multi-line answer: drop blank lines and make sure every line except the
    # last ends with a comma, then join everything onto a single line.
    if len(answer.split('\n')) > 1:
        lines = [line for line in answer.split('\n') if line.strip()]
        joined = []
        for line in lines[:-1]:
            line = line.strip()
            joined.append(line if line.endswith(',') else line + ',')
        joined.append(lines[-1])
        answer = ' '.join(joined)

    # Add list brackets when the answer starts/ends with a bare object.
    if answer.startswith('{'):
        answer = '[' + answer
    if answer.endswith('}'):
        answer = answer + ']'
    if answer.endswith(', ]'):
        answer = answer[:-3] + ']'

    # Convert the most common single-quoted patterns to JSON double quotes.
    answer = answer.replace('\'reasoning_and_evidence\': ', '"reasoning_and_evidence": ')
    answer = answer.replace(': \'', ': "')
    answer = answer.replace('\'}', '"}')
    return answer


# Flatten the answers into one record per (conversation, predicted task).
# conversation_data_dict groups the records by conversation hash; data_dicts is
# the flat list (which also carries the conversation text) behind the DataFrame.
conversation_data_dict = defaultdict(list)
data_dicts = []

for i, r in predicted_tasks_df.iterrows():

    # Start from this row's raw value so the failure report below can never
    # show an answer left over from an earlier row.
    _answer = r['answer']

    try:

        _answer = _repair_task_answer(_answer)
        _json_answer = json.loads(_answer)

        for _item in _json_answer:

            _confidence = _item['confidence']
            _reason = _item['reasoning_and_evidence']
            _category = _item['task'].lower()

            # Skip items whose task name is empty.
            if _category:
                conversation_data_dict[r['conversation_hash']].append({'conversation_hash': r['conversation_hash'],
                                                                       'predicted_confidence': _confidence,
                                                                       'predicted_reason': _reason,
                                                                       'predicted_category': _category})
                data_dicts.append({'conversation_hash': r['conversation_hash'],
                                   'conversation_simplified': r['conversation_simplified'],
                                   'predicted_confidence': _confidence,
                                   'predicted_reason': _reason,
                                   'predicted_category': _category})

    except (AttributeError, KeyError, TypeError, ValueError):
        # Best effort: report the answer that could not be used and move on.
        #   ValueError     - still not valid JSON (json.JSONDecodeError is a subclass)
        #   KeyError       - an item lacks one of the expected keys
        #   TypeError      - the JSON is not a list of objects
        #   AttributeError - the answer or a task name is not a string (e.g. NaN)
        # Items of this answer appended before the failure are kept.
        # KeyboardInterrupt is deliberately not caught, so the loop can be stopped.
        print(_answer)
        print()


formatted_predicted_tasks_df = pd.DataFrame(data_dicts)

len(conversation_data_dict), len(formatted_predicted_tasks_df)

In [218]:
# Persist the flattened one-row-per-predicted-task table.
_processed_tasks_path = f'{output_directory_path}/user-task-prediction/tasks.gpt4.processed.csv'
formatted_predicted_tasks_df.to_csv(_processed_tasks_path)

<br><br><br><br>

## **Predict sensitive categories**

### Run over the whole dataset

In [62]:
# Ask GPT-4 which sensitive categories the user's messages reveal and write one
# CSV per chunk of the data. Chunks whose CSV already exists are skipped, so the
# cell can be re-run to resume after an interruption without repeating API calls.
SENSITIVE_PREDICTION_DIR = output_directory_path + '/sensitive-category-prediction'
SENSITIVE_QUERY_MAX_CHARS = 5000  # user text is truncated to this many characters

# Create the output directory up front: otherwise the first to_csv fails only
# after a whole chunk of API calls has been made, and those results are lost.
os.makedirs(SENSITIVE_PREDICTION_DIR, exist_ok=True)

# The instructions are identical for every conversation, so build them once.
# The user messages go between prefix and suffix, wrapped in double quotes.
SENSITIVE_PROMPT_PREFIX = """In the following messages, does the user reveal any of these categories of information? For each category that is included, list the category, whether or not it is explicit or implicit, who if anyone is explictly involved, your confidence, and your reasoning. If a category is not present, do not list it.

Example:
2. Quoted email or messaging transcripts; explicit; the user, the user's coworker; very confident; the query contains the text of an email message.

Categories:
1. Sexually explicit or sexually suggestive content
2. Copy-and-pasted emails or messaging transcripts
3. Copy-and-pasted programming code longer than a single line
4. Job, visa, and other applications
5. Financial or corporate information
6. Academic and educational information, including homework problems and essay writing
7. Personal expressions about the user's emotions, mental health, etc.
8. Personal expressions about the user's politics and religion
9. Mention or discussion of the user's personal relationships
10. Medical and healthcare information
11. Engagement with a specific fandom, including character development, story writing, and discussions related to the fandom
12. Mention or discussion of the user's hobbies and habits

Messages:
\""""

SENSITIVE_PROMPT_SUFFIX = """\"

Answer:
"""

df_list = np.array_split(data_df, 500)

for j, _df in enumerate(df_list):

    _chunk_path = SENSITIVE_PREDICTION_DIR + '/sensitive_categories.gpt4.' + str(j) + '.csv'

    if not os.path.exists(_chunk_path):

        # Progress log: timestamp and index of the chunk being processed.
        print(datetime.now(), '\t', j)

        _data_dicts = []

        for i, r in _df.iterrows():

            # Keep only lines that start with 'USER:' and are longer than five
            # characters once stripped; everything else is not sent to the model.
            _user_lines = [l for l in r['conversation_simplified'].split('\n')
                           if len(l.strip()) > 5 and l.startswith('USER:')]
            # Slicing is a no-op for text already within the limit.
            _wildchat_query = '\n'.join(_user_lines)[:SENSITIVE_QUERY_MAX_CHARS]

            # Nothing left to classify: skip the API call (no row is written).
            if not _wildchat_query.strip():
                continue

            _prompt = SENSITIVE_PROMPT_PREFIX + _wildchat_query + SENSITIVE_PROMPT_SUFFIX

            _response = openai.ChatCompletion.create(model="gpt-4",
                                                     messages=[{"role": "user", "content": _prompt}],
                                                     temperature=1)
            _answer = _response['choices'][0]['message']['content']

            # Keep the full response next to the extracted text; the complete
            # conversation is stored, not just the user lines that were sent.
            _data_dicts.append({'conversation_hash': r['conversation_hash'],
                                'conversation_simplified': r['conversation_simplified'],
                                'response': _response,
                                'answer': _answer})

        _data_df = pd.DataFrame(_data_dicts)
        _data_df.to_csv(_chunk_path)

<br><br><br><br>

### **Load results and format**

In [63]:
# Load the per-chunk sensitive-category predictions.
# Only files named exactly sensitive_categories.gpt4.<chunk index>.csv are read.
# A prefix/suffix test would also pick up sensitive_categories.gpt4.processed.csv,
# which this notebook later writes into the same directory, so re-running this
# cell would load the processed table as if it were a chunk.
# Files are read in chunk order because os.listdir returns names in arbitrary
# order, which would make the concatenated row order differ between runs.
_sensitive_dir = output_directory_path + '/sensitive-category-prediction'
_sensitive_chunk_files = {}
for _file_name in os.listdir(_sensitive_dir):
    _match = re.fullmatch(r'sensitive_categories\.gpt4\.(\d+)\.csv', _file_name)
    if _match:
        _sensitive_chunk_files[int(_match.group(1))] = _file_name

df_list = [pd.read_csv(_sensitive_dir + '/' + _sensitive_chunk_files[_j]) for _j in sorted(_sensitive_chunk_files)]
len(df_list)

500

In [64]:
# Stack the per-chunk frames row-wise; the cell's output is the total row count.
categories_df = pd.concat(df_list, axis=0)
categories_df.shape[0]

5000

In [68]:
# Lowercased names of the twelve sensitive categories, in the same order as the
# numbered list in the prompt; extract_categories matches answer lines against them.
categories = [
    "sexually explicit or sexually suggestive content",
    "copy-and-pasted emails or messaging transcripts",
    "copy-and-pasted programming code longer than a single line",
    "job, visa, and other applications",
    "financial or corporate information",
    "academic and educational information, including homework problems and essay writing",
    "personal expressions about the user's emotions, mental health, etc.",
    "personal expressions about the user's politics and religion",
    "mention or discussion of the user's personal relationships",
    "medical and healthcare information",
    "engagement with a specific fandom, including character development, story writing, and discussions related to the fandom",
    "mention or discussion of the user's hobbies and habits",
]

In [69]:
def extract_categories(answer, category_names=None):
    """List the known categories that the lines of a model answer start with.

    Matching is case-insensitive: the category names are lowercase while the
    prompt lists them capitalised, so an answer that echoes the prompt's wording
    would otherwise never match. A leading list number such as ``"2. "`` or
    ``"10. "`` is ignored; for backward compatibility any 3- or 4-character
    prefix is ignored as well.

    Parameters
    ----------
    answer : str
        Model answer, one reported category per line. Any non-string value
        (e.g. NaN from an empty CSV cell) yields an empty list.
    category_names : iterable of str, optional
        Names to look for. Defaults to the module-level ``categories`` list.

    Returns
    -------
    list of str
        Matched names as given in ``category_names``, in line order and then
        list order. A name is repeated if it matches on several lines.
    """
    if category_names is None:
        category_names = categories
    if not isinstance(answer, str):
        return []

    _found_categories = []
    for _line in answer.split('\n'):
        _lowered = _line.lower()
        # Forms of the line that may start with a category name: as is, with a
        # 3- or 4-character prefix removed, and with a leading "<number>." or
        # "<number>)" removed regardless of its width.
        _candidates = (
            _lowered,
            _lowered[3:],
            _lowered[4:],
            re.sub(r'^\s*\d+[.)]\s*', '', _lowered),
        )
        for c in category_names:
            _name = c.lower()
            if any(_candidate.startswith(_name) for _candidate in _candidates):
                _found_categories.append(c)
    return _found_categories

# Add a column holding, for each answer, the list of category names it mentions.
_predicted_category_lists = categories_df['answer'].apply(extract_categories)
categories_df['predicted_categories'] = _predicted_category_lists

In [683]:
def extract_confidence(answer):
    """Return the confidence field of an answer that reports a single category.

    The answer is split on ``';'`` as a whole, so the confidence is only
    returned when the entire text has exactly five fields
    (category; explicit/implicit; people; confidence; reasoning).

    Parameters
    ----------
    answer : str
        Model answer text. Any non-string value (e.g. NaN from an empty CSV
        cell) yields ``None``.

    Returns
    -------
    str or None
        The fourth field exactly as it appears (surrounding whitespace is not
        stripped), or ``None`` when the answer does not have five fields.
        NOTE(review): an answer listing several categories has more than five
        fields overall and therefore also yields ``None``.
    """
    if not isinstance(answer, str):
        return None
    # Use the argument, not the global `_answer` left behind by earlier cells:
    # that name is undefined on a fresh kernel and otherwise holds one stale
    # answer, which would give every row the same result.
    _chunks = answer.split(';')
    if len(_chunks) == 5:
        return _chunks[3]
    return None

# Add a column with the confidence extracted from each answer (None when absent).
_predicted_confidences = categories_df['answer'].apply(extract_confidence)
categories_df['predicted_confidence'] = _predicted_confidences

In [104]:
# Flatten the answers into one record per reported category.
# An answer line is used only if it splits into exactly five ';'-separated
# fields: category; explicit/implicit; people; confidence; reasoning.

# Ordered keyword rules: the first rule with any keyword contained in the
# lowercased category field decides the normalized label, so order matters.
_CATEGORY_RULES = [
    (('sex',), 'sexual and erotic content'),
    (('email', 'messag'), 'quoted emails and messages'),
    (('program', 'code'), 'quoted code'),
    (('job', 'application'), 'job, visa, and other applications'),
    (('financ', 'corporate'), 'financial and corporate info'),
    (('academic', 'education', 'homework', 'essay'), 'academic and education info'),
    (('emotion', 'mental'), "user's emotions and mental health"),
    (('politic', 'religion'), "user's politics and religion"),
    (('relationship',), "user's personal relationships"),
    (('medical', 'healthcare'), 'healthcare information'),
    (('fandom',), 'fandom'),
    (('hobb', 'habit'), "user's hobbies and habits"),
]

conversation_data_dict = defaultdict(list)
data_dicts = []

for _, _row in categories_df.iterrows():
    for _line in _row['answer'].split('\n'):
        _fields = _line.split(';')
        if len(_fields) != 5:
            continue

        _raw_category = _fields[0].lower()
        _explicit, _people, _confidence, _reason = (_field.strip() for _field in _fields[1:])

        # None when no rule matches; such lines are dropped.
        _category = next(
            (_label for _keywords, _label in _CATEGORY_RULES
             if any(_keyword in _raw_category for _keyword in _keywords)),
            None,
        )
        if not _category:
            continue

        _record = {'conversation_hash': _row['conversation_hash'],
                   'line': _line,
                   'predicted_explicit': _explicit,
                   'predicted_people': _people,
                   'predicted_confidence': _confidence,
                   'predicted_reason': _reason,
                   'predicted_category': _category}

        # Grouped by conversation, plus a separate copy in the flat list.
        conversation_data_dict[_row['conversation_hash']].append(_record)
        data_dicts.append(dict(_record))

sensitive_df = pd.DataFrame(data_dicts)
len(sensitive_df.index)

8959

In [117]:
# Persist the flattened one-row-per-reported-category table.
_processed_sensitive_path = f'{output_directory_path}/sensitive-category-prediction/sensitive_categories.gpt4.processed.csv'
sensitive_df.to_csv(_processed_sensitive_path)