In [46]:
import pandas as pd
import glob
import lxml.html
import random
import itertools

In [47]:
def munge_gpt3(files, chunk=True, min_words=15):
    """Collect GPT-3 text lines from plain-text files into one labeled frame.

    Every line of every file is stripped of surrounding whitespace; lines with
    fewer than ``min_words`` whitespace-separated tokens (which includes blank
    lines) are dropped. All surviving lines are labeled ``generated = 1``.

    Side effect: the combined frame is written to ``gpt3.csv`` in the current
    working directory, overwriting any existing file.

    Args:
        files: Iterable of paths to text files.
        chunk: Unused; kept so existing callers that pass it keep working.
        min_words: Minimum number of words a line needs to be kept.

    Returns:
        DataFrame with columns ``text`` and ``generated``.

    Raises:
        ValueError: If ``files`` is empty (raised by ``pd.concat``).
    """
    dfs = []
    for file in files:
        # The context manager closes the handle as soon as the file is read,
        # instead of leaving it to the garbage collector.
        with open(file) as fh:
            stripped = [line.strip() for line in fh]
        # An empty string splits into zero words, so the length test alone
        # also removes blank lines.
        text_list = [t for t in stripped if len(t.split()) >= min_words]
        df = pd.DataFrame({'text': text_list,
                           'generated': 1})
        dfs.append(df)

    gpt = pd.concat(dfs, ignore_index=True)
    gpt.to_csv('gpt3.csv', index=False)
    return gpt

In [48]:
# Build gpt3.csv from every .txt file under ../gpt_scraper/data; the returned
# frame is displayed as this cell's output.
munge_gpt3(glob.glob('../gpt_scraper/data/*.txt'))

Unnamed: 0,text,generated
0,"In recent years, Hong Kong has undergone signi...",1
1,Hong Kong has long been a vibrant hub of comme...,1
2,"In addition to political turmoil, Hong Kong is...",1
3,"Given these challenges, it’s not surprising th...",1
4,"However, leaving Hong Kong is not an option fo...",1
...,...,...
16861,This is where the Art-Lover Alert comes in - a...,1
16862,The Voice may be a passing comment from a fell...,1
16863,The Tap on the Shoulder refers to those moment...,1
16864,"These small, seemingly insignificant moments o...",1


In [37]:
def concatenate_gpt(files, chunk = True, min_words=10):
    """Combine the ``text`` column of several CSV files into one labeled frame.

    Missing values in ``text`` are skipped. When ``chunk`` is true each text is
    split on newlines, every piece is stripped, and pieces with fewer than
    ``min_words`` whitespace-separated tokens are dropped. When ``chunk`` is
    false each text is kept whole (stripped) and no length filter is applied.
    All rows are labeled ``generated = 1``.

    Side effect: the combined frame is written to ``gpt.csv`` in the current
    working directory, overwriting any existing file.

    Args:
        files: Iterable of CSV paths, each containing a ``text`` column.
        chunk: Split texts into newline-separated pieces and filter by length.
        min_words: Minimum words a piece needs to be kept (``chunk`` only).

    Returns:
        DataFrame with columns ``text`` and ``generated``.

    Raises:
        ValueError: If ``files`` is empty (raised by ``pd.concat``).
        KeyError: If a CSV has no ``text`` column.
    """
    dfs = []
    for file in files:
        raw_df = pd.read_csv(file)
        # Drop missing cells first: str(NaN) would otherwise produce a row
        # whose text is the literal string 'nan'.
        text_list = [str(t).strip() for t in raw_df['text'].dropna()]
        if chunk:
            # Strip every piece so interior lines do not keep stray whitespace.
            pieces = [piece.strip() for t in text_list for piece in t.split('\n')]
            # An empty piece has zero words, so this also removes blanks.
            text_list = [p for p in pieces if len(p.split()) >= min_words]
        df = pd.DataFrame({'text': text_list,
                           'generated': 1})
        dfs.append(df)

    gpt = pd.concat(dfs, ignore_index=True)
    gpt.to_csv('gpt.csv', index=False)
    return gpt

In [38]:
# The pattern has no wildcard, so glob yields at most this one file (and an
# empty list if the file is missing).
concatenate_gpt(glob.glob('gpt_data/xl-1542M-k40.test.csv'))

Unnamed: 0,text,generated
0,When I first heard about what a lot of people ...,1
1,And so I am quite surprised to hear of this ne...,1
2,This is a blatant attempt to limit trans* righ...,1
3,"First, let's talk about religious liberty. And...",1
4,"Now, you may be saying, 'But that doesn't seem...",1
...,...,...
54178,"Of course, the inequality gap is not just in a...",1
54179,The growth of wealth inequality has become a g...,1
54180,"In short, wealth inequality within and between...",1
54181,This is where public welfare measures to ensur...,1


In [9]:
def parse_blog_xml(file):
    """Extract the text of every ``<post>`` element from a blog XML file.

    The file is parsed with ``lxml.html.parse``. For each ``post`` element the
    leading text is taken, every occurrence of the marker ``urlLink`` is
    removed, and surrounding whitespace is stripped.

    Args:
        file: Path (or file-like object) accepted by ``lxml.html.parse``.

    Returns:
        List of strings, one per ``post`` element in document order. A post
        with no leading text yields an empty string.
    """
    tree = lxml.html.parse(file)
    posts = []
    for post_el in tree.findall('.//post'):
        # .text is None when the element has no leading text; fall back to ''
        # so an empty post does not raise AttributeError.
        raw = post_el.text or ''
        posts.append(raw.replace('urlLink', '').strip())
    return posts

In [20]:
BLOG_FILE_NUM = 0 # change this to add blog data
def process_human(fake=False, seed=None, min_words=10):
    """Build the human-written dataset from the scraped NYT text files.

    Reads every ``../nyt_scraper/data/*.txt`` file, strips each line, skips the
    first line of each file (the title), and keeps lines with at least
    ``min_words`` whitespace-separated tokens.

    Side effect: the combined frame is written to ``fake.csv`` when ``fake`` is
    true, otherwise to ``human.csv``, in the current working directory.

    Args:
        fake: If false, every row is labeled ``generated = 0``. If true, each
            row gets a random 0/1 label with equal probability.
        seed: Optional seed for the random labels. ``None`` keeps the previous
            behavior of drawing from the module-level ``random`` generator.
        min_words: Minimum number of words a line needs to be kept.

    Returns:
        DataFrame with columns ``text`` and ``generated``.

    Raises:
        ValueError: If no input files are found (raised by ``pd.concat``).
    """
    # A private generator makes the fake labels reproducible without touching
    # the global random state; with no seed the global generator is used.
    rng = random.Random(seed) if seed is not None else random
    dfs = []

    # Disabled loader for the NYT abstract CSVs, kept for reference.
    # files = glob.glob('nyt_data/*.csv')
    # for file in files:
    #     raw_df = pd.read_csv(file)
    #     text_list = [str(t).strip() for t in raw_df['abstract']]
    #     generated = [1 if random.random() < 0.5 else 0 for _ in text_list] if fake else 0
    #     df = pd.DataFrame({'text': text_list,
    #                    'generated': generated})
    #     dfs.append(df)

    nyt_scraped_files = glob.glob('../nyt_scraper/data/*.txt')
    for file in nyt_scraped_files:
        # The context manager closes the handle as soon as the file is read.
        with open(file) as fh:
            stripped = [line.strip() for line in fh]
        body_lines = stripped[1:] # skip titles
        # An empty string splits into zero words, so blank lines are dropped
        # by the length test as well.
        text_list = [t for t in body_lines if len(t.split()) >= min_words]
        if fake:
            generated = [1 if rng.random() < 0.5 else 0 for _ in text_list]
        else:
            generated = 0
        df = pd.DataFrame({'text': text_list,
                           'generated': generated})
        dfs.append(df)

    # Disabled loader for the blog XML files, kept for reference.
    # blog_files = glob.glob('blogs/*.xml')[:BLOG_FILE_NUM]
    # for file in blog_files:
    #     posts = parse_blog_xml(file)
    #     generated = [1 if random.random() < 0.5 else 0 for _ in posts] if fake else 0
    #     df = pd.DataFrame({'text': posts,
    #                    'generated': generated})
    #     dfs.append(df)

    human = pd.concat(dfs, ignore_index=True)
    outfile = 'fake.csv' if fake else 'human.csv'
    human.to_csv(outfile, index=False)
    return human

In [21]:
# Disabled variant: fake=True assigns random 0/1 labels and writes fake.csv.
# process_human(True)
# fake=False labels every row 0 and writes human.csv.
human = process_human(False)

In [39]:
# Display the `human` frame assigned above.
human

Unnamed: 0,text,generated
0,WASHINGTON — President Trump held what he call...,0
1,"“I have many issues,” Mr. Trump said in announ...",0
2,"To mark the occasion, several high-ranking adm...",0
3,"The executive order, according to White House,...",0
4,"According to Polaris, a nonprofit organization...",0
...,...,...
94264,"The program, The Globe wrote, was “a penetrati...",0
94265,Ms. Jarvis began filming in China in August 19...,0
94266,Ms. Jarvis left NBC in 1976 and founded her ow...,0
94267,"In addition to “Junon and Avos,” which Ms. Jar...",0
