# Import required libraries/packages

In [None]:
import os
import pandas as pd


# Download datasets

In [None]:
# Download the datasets:
#   1. webtext (the real, human-written text)
#   2. xl-1542M-k40, which was generated by the GPT-2 model trained on webtext
# Source and copyright: https://github.com/openai/gpt-2-output-dataset

import os
import sys
import requests
from tqdm import tqdm

subdir = 'data'
# exist_ok avoids the check-then-create race of a separate os.path.exists() test
os.makedirs(subdir, exist_ok=True)
subdir = subdir.replace('\\','/') # needed for Windows

for ds in ['webtext', 'xl-1542M-k40']:
    for split in ['train', 'valid', 'test']:
        filename = ds + "." + split + '.jsonl'
        url = "https://storage.googleapis.com/gpt-2/output-dataset/v1/" + filename

        # Using the response as a context manager releases the streamed
        # connection even if the download fails part-way through.
        with requests.get(url, stream=True, timeout=60) as r:
            # Fail loudly on 4xx/5xx instead of saving an error page as a .jsonl file
            r.raise_for_status()

            # The header may be absent; total=None gives an open-ended progress bar
            content_length = r.headers.get("content-length")
            file_size = int(content_length) if content_length is not None else None

            # 1k for chunk_size, since Ethernet packet size is around 1500 bytes
            chunk_size = 1000

            with open(os.path.join(subdir, filename), 'wb') as f, \
                    tqdm(ncols=100, desc="Fetching " + filename, total=file_size, unit_scale=True) as pbar:
                for chunk in r.iter_content(chunk_size=chunk_size):
                    f.write(chunk)
                    # Advance by the bytes actually received: the final chunk
                    # is usually shorter than chunk_size.
                    pbar.update(len(chunk))

Fetching webtext.train.jsonl: 679Mit [00:09, 69.0Mit/s]                                             
Fetching webtext.valid.jsonl: 13.6Mit [00:00, 56.4Mit/s]                                            
Fetching webtext.test.jsonl: 13.5Mit [00:00, 69.2Mit/s]                                             
Fetching xl-1542M-k40.train.jsonl: 748Mit [00:10, 71.1Mit/s]                                        
Fetching xl-1542M-k40.valid.jsonl: 15.0Mit [00:00, 68.8Mit/s]                                       
Fetching xl-1542M-k40.test.jsonl: 14.6Mit [00:00, 70.1Mit/s]                                        


# Read the jsonl files

There are 6 datasets: 
  1. webtext.train
  2. webtext.valid
  3. webtext.test
  4. xl-1542M-k40.train
  5. xl-1542M-k40.valid
  6. xl-1542M-k40.test

All of the datasets will be read.

For webtext, we combine the train, valid and test files into one set, and do the same for xl-1542M-k40.

In [None]:
# Read the jsonl files 

def read_json_file (path, filename):
    """Load the train/valid/test JSON Lines splits of one dataset.

    Parameters
    ----------
    path : str
        Directory that holds the ``.jsonl`` files. A trailing path separator
        is optional.
    filename : str
        Dataset name, i.e. the part of the file name that precedes
        ``.<split>.jsonl`` (for example ``'webtext'``).

    Returns
    -------
    list of pandas.DataFrame
        One dataframe per split, in the order train, valid, test. Each frame
        gets two extra columns: ``type`` (the split name) and ``filename``
        (the dataset name).
    """
    filetype_list = ['train', 'valid', 'test']

    # Read each split's jsonl file into its own dataframe
    df_all = []
    for file_type in filetype_list:
        # os.path.join keeps the path valid whether or not `path` ends with a separator
        jsonl_path = os.path.join(path, filename + '.' + file_type + '.jsonl')
        df = pd.read_json(jsonl_path, lines=True)
        df['type'] = file_type  # Add another column filled with the type of data
        df['filename'] = filename  # Add another column filled with the data's name
        df_all.append(df)

    return df_all



# Relative to the working directory, i.e. the same 'data' folder the download
# cell writes to. (On Colab the default working directory is /content, so this
# resolves to the previously hard-coded '/content/data/'.)
path = 'data/'

# Get a list of train, valid and test dataframe for webtext file
df_webtext_list = read_json_file(path, 'webtext')

# Get a list of train, valid and test dataframe for xl-1542M-k40 file
df_xl_list = read_json_file(path, 'xl-1542M-k40')

# Combine each list into a single dataframe; ignore_index renumbers the rows
# 0..n-1 instead of repeating each split's own row numbers
df_webtext = pd.concat(df_webtext_list, ignore_index = True)
df_xl = pd.concat(df_xl_list, ignore_index = True)

In [None]:
# View the webtext dataframe: print its (rows, columns) shape, then let the
# bare last expression render the frame as the cell output
print(df_webtext.shape)
df_webtext

(260000, 6)


Unnamed: 0,id,ended,length,text,type,filename
0,0,True,138,These girlfriends deserves a special mention f...,train,webtext
1,1,True,66,LeSean McCoy going through warmups with first ...,train,webtext
2,2,False,1024,Tom Curran has been called up to England's Ash...,train,webtext
3,3,False,1024,"We'll have turkey on the table Thursday but, a...",train,webtext
4,4,False,1024,The 1945 Sinkings of the Cap Arcona and the Th...,train,webtext
...,...,...,...,...,...,...
259995,259995,False,1024,"The week after my 30th birthday, my best frien...",test,webtext
259996,259996,True,300,Downton Abbey doesn't return for its third sea...,test,webtext
259997,259997,True,360,Amazon Studios said today that Stephen Root ha...,test,webtext
259998,259998,True,607,Everyone likes clean wheels but not everyone h...,test,webtext


In [None]:
# View the xl dataframe: print its (rows, columns) shape, then let the
# bare last expression render the frame as the cell output
print(df_xl.shape)
df_xl

(260000, 6)


Unnamed: 0,id,text,length,ended,type,filename
0,1,"Cops will have to take ""extreme care"" to avoid...",433,True,train,xl-1542M-k40
1,2,The latest edition of the German Football Hall...,715,True,train,xl-1542M-k40
2,3,"Dangerous animals, especially snakes, in a car...",1024,False,train,xl-1542M-k40
3,4,I started my first MLP story way back in 2014 ...,1024,False,train,xl-1542M-k40
4,5,This article is a disambiguation page for The ...,84,True,train,xl-1542M-k40
...,...,...,...,...,...,...
259995,259996,How to make your life with your little ones ea...,1024,False,test,xl-1542M-k40
259996,259997,This isn't the first time I see this particula...,136,True,test,xl-1542M-k40
259997,259998,"AUSTIN, Texas – The NCAA on Friday approved th...",917,True,test,xl-1542M-k40
259998,259999,"In the previous blog post, I discussed the gen...",689,True,test,xl-1542M-k40


In [None]:
# Report the number of null entries per column for both dataframes
null_reports = (
    ("Checking missing value for webtext dataframe: ", df_webtext),
    ("\nChecking missing value for xl dataframe: ", df_xl),
)
for heading, frame in null_reports:
    print(heading)
    print(frame.isnull().sum())

Checking missing value for webtext dataframe: 
id          0
ended       0
length      0
text        0
type        0
filename    0
dtype: int64

Checking missing value for xl dataframe: 
id          0
text        0
length      0
ended       0
type        0
filename    0
dtype: int64


# Randomly choose 50,000 rows from each dataframe

In [None]:
# Randomly choose 50,000 rows from each dataframe
# Ref: https://www.geeksforgeeks.org/how-to-randomly-select-rows-from-pandas-dataframe/

# Fixed seeds make the draw repeatable, so re-running the notebook selects the
# same rows (and therefore writes the same output file) every time.
SEED = 42

# The two draws use different seeds so that they stay independent of each
# other: with one shared seed, frames of equal length would have the same row
# positions selected.
df_webtext_chosen = df_webtext.sample(n = 50000, random_state = SEED)
df_xl_chosen = df_xl.sample(n = 50000, random_state = SEED + 1)

# check how many rows are from train, valid, test set for each dataframe
# (Series.value_counts replaces the deprecated top-level pd.value_counts)
print("webtext dataframe: ")
print(df_webtext_chosen['type'].value_counts())
print("\nxl dataframe: ")
print(df_xl_chosen['type'].value_counts())

webtext dataframe: 
train    48052
test      1010
valid      938
Name: type, dtype: int64

xl dataframe: 
train    48088
test       971
valid      941
Name: type, dtype: int64


In [None]:
# View the sampled webtext dataframe: print its (rows, columns) shape, then
# let the bare last expression render the frame as the cell output
print(df_webtext_chosen.shape)
df_webtext_chosen

(50000, 6)


Unnamed: 0,id,ended,length,text,type,filename
47018,47018,True,102,The overarching quality of the Bloomberg era w...,train,webtext
94338,94338,True,142,This is about as bad as my morning commute get...,train,webtext
44507,44507,True,631,More on Saskatchewan Liquor Privatization\n\nM...,train,webtext
163493,163493,False,1024,Killen was here\n\nPatrick Killen helped defin...,train,webtext
91925,91925,True,51,Mailbox Rental & Mail Forwarding\n\nServices i...,train,webtext
...,...,...,...,...,...,...
209661,209661,False,1024,"Language English\n\n""Only the small secrets ne...",train,webtext
254841,254841,False,1024,"Real-time strategy games are, in a way, games ...",valid,webtext
107963,107963,False,1024,Noted tuner Shane Tecklenburg has his hands in...,train,webtext
108519,108519,True,671,Wayne Rooney is hoping to be back against Ever...,train,webtext


In [None]:
# View the sampled xl dataframe: print its (rows, columns) shape, then
# let the bare last expression render the frame as the cell output
print(df_xl_chosen.shape)
df_xl_chosen

(50000, 6)


Unnamed: 0,id,text,length,ended,type,filename
90486,90487,"We don't know what the ""truth"" really is, sinc...",187,True,train,xl-1542M-k40
161070,161071,An image of a woman and a child in her home wa...,441,True,train,xl-1542M-k40
81100,81101,The U.S. Food and Drug Administration on Thurs...,797,True,train,xl-1542M-k40
133634,133635,The world is still waiting for the next Ninten...,321,True,train,xl-1542M-k40
44144,44145,You have successfully requested this file from...,18,True,train,xl-1542M-k40
...,...,...,...,...,...,...
88657,88658,The official website for Donten ni Warau TV 's...,548,True,train,xl-1542M-k40
7574,7575,"I have a long, slow, and somewhat unproductive...",803,True,train,xl-1542M-k40
38427,38428,"""Flexibility Is Not an Option""\n\nAt The Nouri...",1024,False,train,xl-1542M-k40
129004,129005,\nDuck Dynasty's Phil Robertson will have to m...,137,True,train,xl-1542M-k40


# Combine the two datasets

In [None]:
# Stack the sampled webtext rows on top of the sampled xl rows and renumber
# the index from 0 so row labels are unique in the combined frame
chosen_frames = [df_webtext_chosen, df_xl_chosen]
df_combine = pd.concat(chosen_frames, ignore_index=True)

print(df_combine.shape)
df_combine

(100000, 6)


Unnamed: 0,id,ended,length,text,type,filename
0,47018,True,102,The overarching quality of the Bloomberg era w...,train,webtext
1,94338,True,142,This is about as bad as my morning commute get...,train,webtext
2,44507,True,631,More on Saskatchewan Liquor Privatization\n\nM...,train,webtext
3,163493,False,1024,Killen was here\n\nPatrick Killen helped defin...,train,webtext
4,91925,True,51,Mailbox Rental & Mail Forwarding\n\nServices i...,train,webtext
...,...,...,...,...,...,...
99995,88658,True,548,The official website for Donten ni Warau TV 's...,train,xl-1542M-k40
99996,7575,True,803,"I have a long, slow, and somewhat unproductive...",train,xl-1542M-k40
99997,38428,False,1024,"""Flexibility Is Not an Option""\n\nAt The Nouri...",train,xl-1542M-k40
99998,129005,True,137,\nDuck Dynasty's Phil Robertson will have to m...,train,xl-1542M-k40


In [None]:
# Check for missing or null values: prints the number of null entries in each
# column of the combined dataframe
print(df_combine.isnull().sum())

id          0
ended       0
length      0
text        0
type        0
filename    0
dtype: int64


In [None]:
# Count how many times each id occurs in the combined dataframe.
# NOTE(review): the two source files appear to number their rows
# independently, so an id occurring twice presumably means one webtext row and
# one xl row share that number, not that a text is duplicated. Confirm before
# using id as a key.
# Series.value_counts replaces the deprecated top-level pd.value_counts.
df_combine['id'].value_counts()

2049      2
214240    2
249009    2
97538     2
221051    2
         ..
242655    1
31432     1
154594    1
21475     1
120859    1
Name: id, Length: 90305, dtype: int64

# Adjust the dataframe

In [None]:
# Derive a human-readable label from the source dataset name: rows that came
# from webtext are real text, rows from xl-1542M-k40 were generated by GPT-2
label_by_source = {'webtext': 'real', 'xl-1542M-k40': 'GPT-2'}
df_combine['label'] = df_combine['filename'].map(label_by_source)

In [None]:
# View the combined dataframe after the label column was added: print its
# (rows, columns) shape, then render the frame as the cell output
print(df_combine.shape)
df_combine

(100000, 7)


Unnamed: 0,id,ended,length,text,type,filename,label
0,47018,True,102,The overarching quality of the Bloomberg era w...,train,webtext,real
1,94338,True,142,This is about as bad as my morning commute get...,train,webtext,real
2,44507,True,631,More on Saskatchewan Liquor Privatization\n\nM...,train,webtext,real
3,163493,False,1024,Killen was here\n\nPatrick Killen helped defin...,train,webtext,real
4,91925,True,51,Mailbox Rental & Mail Forwarding\n\nServices i...,train,webtext,real
...,...,...,...,...,...,...,...
99995,88658,True,548,The official website for Donten ni Warau TV 's...,train,xl-1542M-k40,GPT-2
99996,7575,True,803,"I have a long, slow, and somewhat unproductive...",train,xl-1542M-k40,GPT-2
99997,38428,False,1024,"""Flexibility Is Not an Option""\n\nAt The Nouri...",train,xl-1542M-k40,GPT-2
99998,129005,True,137,\nDuck Dynasty's Phil Robertson will have to m...,train,xl-1542M-k40,GPT-2


# Write the combined dataset into a csv file for later use

In [None]:
# Write the new dataset into a file for later use
# Ref: https://stackoverflow.com/questions/51860716/how-save-a-array-to-text-file-in-python
file_name = "webtext_gpt2.csv"

# drop() returns a new dataframe, so df_combine itself is left untouched and no
# explicit copy is needed. The columns= keyword replaces the positional axis
# argument (drop('filename', 1)), which pandas 2.0 no longer accepts.
df_combine_write = df_combine.drop(columns='filename')

# NOTE: the file is tab-separated despite its .csv extension, so it must be
# read back with sep='\t'. The row index is not written.
df_combine_write.to_csv(file_name, sep='\t', encoding='utf-8', index=False)

In [None]:
# View the dataframe that was written to disk (the combined frame without the
# filename column): print its shape, then render it as the cell output
print(df_combine_write.shape)
df_combine_write

(100000, 6)


Unnamed: 0,id,ended,length,text,type,label
0,47018,True,102,The overarching quality of the Bloomberg era w...,train,real
1,94338,True,142,This is about as bad as my morning commute get...,train,real
2,44507,True,631,More on Saskatchewan Liquor Privatization\n\nM...,train,real
3,163493,False,1024,Killen was here\n\nPatrick Killen helped defin...,train,real
4,91925,True,51,Mailbox Rental & Mail Forwarding\n\nServices i...,train,real
...,...,...,...,...,...,...
99995,88658,True,548,The official website for Donten ni Warau TV 's...,train,GPT-2
99996,7575,True,803,"I have a long, slow, and somewhat unproductive...",train,GPT-2
99997,38428,False,1024,"""Flexibility Is Not an Option""\n\nAt The Nouri...",train,GPT-2
99998,129005,True,137,\nDuck Dynasty's Phil Robertson will have to m...,train,GPT-2
