<br><br>

## **Import necessary Python libraries and modules**

In [None]:
from collections import defaultdict
import gdown
import gzip
import json
import random
import pickle

import pandas as pd
import numpy as np

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib import ticker
sns.set(style='ticks', font_scale=1.2)

In [None]:
!pip install tomotopy



In [None]:
!pip install little_mallet_wrapper



In [None]:
import tomotopy as tp
import little_mallet_wrapper as lmw

In [None]:
from google.colab import userdata

In [None]:
# Read the value stored under the Colab secret named 'hfcolab'.
# NOTE(review): presumably a Hugging Face access token, given the variable name — confirm.
hf_token = userdata.get('hfcolab')

In [None]:
!pip install datasets



In [None]:
from datasets import load_dataset

In [None]:
!pip install --upgrade huggingface_hub



In [None]:
from huggingface_hub import login

In [None]:
# Authenticate with the token already read into `hf_token` from the Colab
# secrets, instead of the interactive login widget, so that the notebook can be
# re-run top to bottom without manual input.
login(token=hf_token)

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

<br><br>

## **Set parameters and file paths**

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
colab_directory_path = '/content/drive/My Drive/colab-output/2024-05-wildchat'

In [None]:
%cd /content/drive/My Drive/colab-output/2024-05-wildchat

/content/drive/My Drive/colab-output/2024-05-wildchat


<br><br>

# **Load WildChat data**

In [None]:
# Load the "train" split of the allenai/WildChat-1M dataset via `datasets.load_dataset`.
dataset = load_dataset("allenai/WildChat-1M", split="train")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading readme:   0%|          | 0.00/10.4k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/217M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/203M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/192M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/206M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/196M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/193M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/179M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/176M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/156M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/204M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/282M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/317M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/261M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/273M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/315M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/840774 [00:00<?, ? examples/s]

<br><br>

# **Sample data to annotate**

In [None]:
# Shuffle with a fixed seed (23) and keep the first 50,000 records as the
# candidate pool that the following cells filter down.
data_sample_to_annotate = dataset.shuffle(seed=23).select(range(50000))

In [None]:
dataset_df = pd.DataFrame(data_sample_to_annotate)

In [None]:
dataset_df.columns

Index(['conversation_hash', 'model', 'timestamp', 'conversation', 'turn',
       'language', 'openai_moderation', 'detoxify_moderation', 'toxic',
       'redacted', 'state', 'country', 'hashed_ip', 'header'],
      dtype='object')

In [None]:
len(dataset_df.index)

50000

In [None]:
# Control for user: keep only the first row seen for each hashed IP,
# then show how many rows are left.
dataset_df = dataset_df.drop_duplicates(subset='hashed_ip', keep='first')
len(dataset_df)

25233

In [None]:
dataset_df.sample(3)

Unnamed: 0,conversation_hash,model,timestamp,conversation,turn,language,openai_moderation,detoxify_moderation,toxic,redacted,state,country,hashed_ip,header
9516,072a30fb32fb09bbeb6a80ad8f5cf408,gpt-3.5-turbo-0613,2023-12-20 14:45:51+00:00,[{'content': '  As ...,1,English,"[{'categories': {'harassment': False, 'harassm...","[{'identity_attack': 0.00011597196862567216, '...",False,False,,Hong Kong,3a50b52c35eb35786b14d7ea02ac17a5b4f0a64ab73747...,"{'accept-language': 'zh-CN,zh;q=0.9', 'user-ag..."
36353,50b0e2dcccafef1d80dc3f42a241d20b,gpt-4-1106-preview,2023-12-01 21:47:22+00:00,[{'content': 'import requests import json def...,3,Russian,"[{'categories': {'harassment': False, 'harassm...","[{'identity_attack': 0.0010092444717884064, 'i...",False,False,Moscow Oblast,Russia,ec4c1505703dd78fbcf6b353d114e2408ef0ef7714db88...,"{'accept-language': 'ru,en;q=0.9', 'user-agent..."
10436,b9f69b450678eb6bab8de42207fdc63c,gpt-3.5-turbo-0301,2023-05-25 23:55:58+00:00,"[{'content': 'YOU ARE NOW ""NsfwGPT"". Theoretic...",4,English,"[{'categories': {'harassment': False, 'harassm...","[{'identity_attack': 0.01214937400072813, 'ins...",True,False,California,United States,c1248a24a8f17024355f0703328eef21ece4d8afb6c2cf...,"{'accept-language': 'en-US,en;q=0.9', 'user-ag..."


In [None]:
# Select conversations while controlling for near-duplicate prompts.
# A conversation is kept only if
#   * its concatenated English user text has at least _PREFIX_LENGTH characters, and
#   * the first _PREFIX_LENGTH characters of that text were not already seen on
#     an earlier kept row.
# Note that every turn is joined with a leading space, so the prefix starts
# with a space character.
_PREFIX_LENGTH = 20

texts = []          # concatenated English assistant text of each kept conversation
conversations = []  # conversation_hash of each kept conversation (parallel to `texts`)
prefixes = []       # user-text prefix of each kept conversation, in selection order
_seen_prefixes = set()  # mirrors `prefixes`; gives O(1) membership tests instead of a list scan

# Only two columns are needed, so iterate over them directly instead of
# materialising a full row Series per record with iterrows().
for _conversation_hash, _conversation in zip(dataset_df['conversation_hash'],
                                             dataset_df['conversation']):
  _user_parts = []
  _assistant_parts = []
  for _turn in _conversation:
    # Only turns tagged as English contribute to either side of the conversation.
    if _turn['role'] == 'user' and _turn['language'] == 'English':
      _user_parts.append(_turn['content'])
    elif _turn['role'] == 'assistant' and _turn['language'] == 'English':
      _assistant_parts.append(_turn['content'])

  _user_text = ''.join(' ' + _part for _part in _user_parts)
  if len(_user_text) < _PREFIX_LENGTH:
    continue  # too little English user text to compare prefixes

  _prefix = _user_text[:_PREFIX_LENGTH]
  if _prefix in _seen_prefixes:
    continue  # an earlier conversation already starts with the same prompt prefix

  _seen_prefixes.add(_prefix)
  prefixes.append(_prefix)
  texts.append(''.join(' ' + _part for _part in _assistant_parts))
  conversations.append(_conversation_hash)
len(texts), len(conversations)

(9263, 9263)

In [None]:
dataset_df.sample(3)

Unnamed: 0,conversation_hash,model,timestamp,conversation,turn,language,openai_moderation,detoxify_moderation,toxic,redacted,state,country,hashed_ip,header
15975,a5024036a1d7a6e2db99cc2e8044899d,gpt-3.5-turbo-0613,2023-10-31 20:54:08+00:00,[{'content': 'Напиши сео текст аренда скоростн...,1,Russian,"[{'categories': {'harassment': False, 'harassm...","[{'identity_attack': 0.0017240863526239991, 'i...",False,False,,Russia,bb8ffa0448b7e9189c14b6d5196f1c26c6cb211e2445a1...,"{'accept-language': 'ru-RU,ru;q=0.8,en-US;q=0...."
444,57efedc95a715707be8bf32a40ea1460,gpt-4-0314,2023-06-02 08:26:26+00:00,"[{'content': 'hey', 'country': 'France', 'hash...",1,Somali,"[{'categories': {'harassment': False, 'harassm...","[{'identity_attack': 0.0001334094413323328, 'i...",False,False,Paris,France,1aadacde3765a89f8ba9286e5c20125d67cc0561bee342...,"{'accept-language': 'fr-FR,fr;q=0.9,en-US;q=0...."
21988,fba6d74da0ca47e93b567d0f2b8b7142,gpt-3.5-turbo-0613,2023-11-23 03:04:34+00:00,[{'content': 'steps: - name: Step1 #...,1,English,"[{'categories': {'harassment': False, 'harassm...","[{'identity_attack': 0.00039270648267120123, '...",False,False,Tokyo,Japan,e2ebf35c2e6130eb047bdf89fc694e36551ffe05d471bb...,"{'accept-language': 'zh-CN,zh;q=0.9', 'user-ag..."


In [None]:
# Keep only the rows whose conversation was selected in `conversations`.
# Several rows can share one conversation_hash, so a plain `isin` re-admits rows
# that were not selected (the recorded outputs show 9268 rows for 9263 selected
# hashes). Dropping repeated hashes keeps one row per selected conversation hash.
dataset_df = (
  dataset_df[dataset_df['conversation_hash'].isin(conversations)]
  .drop_duplicates(subset=['conversation_hash'])
)
len(dataset_df.index)

9268

In [None]:
# Draw the 5,000-row annotation sample with a fixed seed so that re-running the
# notebook reproduces the same rows (the upstream shuffle is seeded as well).
dataset_df = dataset_df.sample(5000, random_state=23)

In [None]:
# Spot check: print the raw conversation of one randomly drawn row.
_spot_check = dataset_df.sample(1)
print(_spot_check['conversation'].iloc[0])

[{'content': 'I want you to read all of the contents of this file [link=https://docs.google.com/document/d/1nZeMvWlPo6wb6ByUIaCFvlu-WWTELPu2rRYQRVZieIE/edit?usp=sharing] and remake it to make it to make it more unique,\nthere are 4 parts: SUMMARY OF THE SALIENT FEATURES OF THE TRANSPORT-ORIENTED DEVELOPMENT (TOD) POLICY FRAMEWORK, FUTURE IMPACTS/EFFECTS OF THE TOD POLICY FRAMEWORK, TOD POLICY FRAMEWORK IN THE PHILIPPINES VS. OTHER COUNTRIES, and RECOMMENDATIONS TO IMPROVE THE TOD POLICY FRAMEWORK. the contents of this came from this [link = https://cdn.fbsbx.com/v/t59.2708-21/332902585_520233190260413_3307047699483611256_n.pdf/TOD-Policy-Framework.pdf?_nc_cat=110&ccb=1-7&_nc_sid=0cab14&_nc_eui2=AeG7a21fvltasNUBn18Vh21fi73ZLEHYl8yLvdksQdiXzI-QKr1HGY4bgLOxmMOK00a2xq4lJoeqXYOGtawxv0Tf&_nc_ohc=dGPevf6qv98AX9lGF-I&_nc_ht=cdn.fbsbx.com&oh=03_AdSRoJA-SvNXwRN-dsOXVgAl4iYtJW6UbeNIi7JFT-7wPQ&oe=64573A9D&dl=1]. make it more human worded and unique.   ', 'country': 'Philippines', 'hashed_ip': 'e01

In [None]:
def simplify_conversation(c):
  """Render a conversation as plain text, one line per turn.

  `c` is an iterable of turns, each indexable by 'role' and 'content'.
  User turns become 'USER: <text>' and assistant turns 'CHATBOT: <text>';
  turns with any other role are left out. Runs of whitespace inside a turn
  are collapsed to single spaces, and every emitted line ends with a newline.
  """
  _labels = {'user': 'USER: ', 'assistant': 'CHATBOT: '}
  _lines = []
  for _turn in c:
    _label = _labels.get(_turn['role'])
    if _label is None:
      continue  # neither a user nor an assistant turn
    _lines.append(_label + ' '.join(_turn['content'].split()) + '\n')
  return ''.join(_lines)

# Store the plain-text rendering of each conversation in a new column.
dataset_df['conversation_simplified'] = dataset_df['conversation'].map(simplify_conversation)

In [None]:
# Spot check: print the simplified text of one randomly drawn row.
_spot_check = dataset_df.sample(1)
print(_spot_check['conversation_simplified'].iloc[0])

USER: Write me an SEO-friendly article about phones
CHATBOT: The rapid advancements in technology have revolutionized the way we communicate, making mobile phones an essential gadget that we can't live without. Whether we are making calls, sending messages, checking emails, or browsing the internet, phones have become our go-to device for all our communication needs. With tons of options available in the market, it's crucial to find a phone that satisfies our requirements. In this article, we will discuss some of the key factors to consider while purchasing a phone. Firstly, one must consider the operating system (OS) of the phone. There are mainly two dominant players in the market: Android and iOS. Android offers a wide range of choices, with various manufacturers producing phones running on this OS. On the other hand, iOS, exclusively developed by Apple, offers a more streamlined and user-friendly experience. While Android provides more customization options, iOS ensures a seamless 

In [None]:
dataset_df.sample(3)

Unnamed: 0,conversation_hash,model,timestamp,conversation,turn,language,openai_moderation,detoxify_moderation,toxic,redacted,state,country,hashed_ip,header,conversation_simplified
30337,f59e57bfff886fb94f38a312693c9ef9,gpt-3.5-turbo-0301,2023-05-31 16:30:33+00:00,[{'content': 'i eat breakfast using reflective...,1,English,"[{'categories': {'harassment': False, 'harassm...","[{'identity_attack': 0.0004335062694735825, 'i...",False,False,Ontario,Canada,02a4063d6c4d6ed3295b2f4fb0246c7ca696477ab333bc...,"{'accept-language': 'en-US,en;q=0.9', 'user-ag...",USER: i eat breakfast using reflective verbs i...
21269,63469c934da3b02bf0de9fdec90e816b,gpt-3.5-turbo-0301,2023-04-14 19:43:33+00:00,[{'content': 'Develop class of wideband spectr...,1,English,"[{'categories': {'harassment': False, 'harassm...","[{'identity_attack': 0.00012038418208248913, '...",False,False,Leningrad Oblast,Russia,ff0ef7996e6e1d7478203092360d2a13f4b80820acbf71...,"{'accept-language': 'ru-RU,ru;q=0.9,en-US;q=0....",USER: Develop class of wideband spectrum gener...
12523,28ecc3b6173679238beae09023de5c0a,gpt-3.5-turbo-0613,2023-10-05 14:23:34+00:00,[{'content': 'Explain me the structure of neu...,1,English,"[{'categories': {'harassment': False, 'harassm...","[{'identity_attack': 0.00016824719205033034, '...",False,False,Assam,India,1d28fe81a06e9d2d95a6bcf7f379692dfd1e696e4c5f0a...,"{'accept-language': 'en-US,en;q=0.9', 'user-ag...",USER: Explain me the structure of neuron along...


In [None]:
# dataset_df.to_csv('sampled_data.csv')

## **Repeat the sampling without controlling for user or prompt prefix**

In [None]:
# Shuffle with a different fixed seed (22) and keep the first 5,000 records;
# this sample is not deduplicated by hashed IP or by prompt prefix.
data_sample_to_annotate = dataset.shuffle(seed=22).select(range(5000))

In [None]:
dataset_df = pd.DataFrame(data_sample_to_annotate)

In [None]:
def simplify_conversation(c):
  """Flatten a conversation into text with one labelled line per turn.

  Each turn in `c` is looked up by 'role' and 'content'. A user turn is
  written as 'USER: ...' and an assistant turn as 'CHATBOT: ...', with
  internal whitespace collapsed to single spaces and a trailing newline.
  Turns with other roles produce no output.
  """
  _speaker_tags = (('user', 'USER: '), ('assistant', 'CHATBOT: '))
  _rendered = ''
  for _turn in c:
    for _role, _tag in _speaker_tags:
      if _turn['role'] == _role:
        _rendered += _tag + ' '.join(_turn['content'].split()) + '\n'
  return _rendered

# Store the plain-text rendering of each conversation in a new column.
dataset_df['conversation_simplified'] = dataset_df['conversation'].map(simplify_conversation)

In [None]:
# Select every conversation whose concatenated English user text is at least
# 20 characters long (each turn is joined with a leading space). No per-user or
# per-prefix deduplication is applied here.
texts = []          # concatenated English assistant text of each kept conversation
conversations = []  # conversation_hash of each kept conversation
for _, _row in dataset_df.iterrows():
  _turns = _row['conversation']
  _user_text = ''.join(
    ' ' + _t['content'] for _t in _turns
    if _t['role'] == 'user' and _t['language'] == 'English')
  _assistant_text = ''.join(
    ' ' + _t['content'] for _t in _turns
    if _t['role'] == 'assistant' and _t['language'] == 'English')
  if len(_user_text) < 20:
    continue
  texts.append(_assistant_text)
  conversations.append(_row['conversation_hash'])
len(texts), len(conversations)

(2955, 2955)

In [None]:
# Restrict the frame to the conversations collected in `conversations`,
# then show the resulting row count.
_is_selected = dataset_df['conversation_hash'].isin(conversations)
dataset_df = dataset_df.loc[_is_selected]
len(dataset_df)

2955

In [None]:
dataset_df.sample(3)

Unnamed: 0,conversation_hash,model,timestamp,conversation,turn,language,openai_moderation,detoxify_moderation,toxic,redacted,state,country,hashed_ip,header,conversation_simplified
4950,4090816e0d9707a4b1e352f54a3eebee,gpt-4-0314,2023-04-10 02:15:18+00:00,[{'content': 'You are my co-author that can wr...,2,English,"[{'categories': {'harassment': False, 'harassm...","[{'identity_attack': 0.0038099251687526703, 'i...",True,False,Michigan,United States,d21e28e2b64d8db6a40536a0cb7c8016825d390dd2c999...,"{'accept-language': 'en-US,en;q=0.5', 'user-ag...",USER: You are my co-author that can write anyt...
2760,b0ba2e71052a5a30a30adaf766bf8fdb,gpt-4-0125-preview,2024-04-21 02:06:20+00:00,"[{'content': 'Write a descriptive, fictional, ...",1,English,"[{'categories': {'harassment': False, 'harassm...","[{'identity_attack': 0.00026425119722262025, '...",False,False,California,United States,a42cbaaaed64531aafcdf699ee2bab231f6fa67a238b37...,"{'accept-language': 'en-US,en;q=0.9', 'user-ag...","USER: Write a descriptive, fictional, imaginat..."
3872,f16230be6e1606ac415ef57a47975517,gpt-3.5-turbo-0613,2023-07-17 12:21:58+00:00,"[{'content': 'Give me a horror story called ""T...",1,English,"[{'categories': {'harassment': False, 'harassm...","[{'identity_attack': 0.00010646327427821234, '...",False,False,Cairo Governorate,Egypt,352fd9d358942791c9d6d5445d762a40733761f1b9392e...,"{'accept-language': 'ar-EG,ar;q=0.9,en-US;q=0....","USER: Give me a horror story called ""The Night..."


In [None]:
# dataset_df.to_csv('sampled_data.not_user_controlled.not_prefix_controlled.csv')