In [32]:
import pandas as pd
import glob
import lxml.html
import random
import itertools
import math
import nltk
import string
from collections import Counter
from nltk.corpus import wordnet as w
# Minimum number of whitespace-separated words a line must contain to be kept.
MIN_SENT_LENGTH = 10
# Number of rows sampled from each dataset before it is written to CSV;
# a falsy value (0 / None) skips the sampling step.
ENTRY_NUMBER = 5000
# Seed passed as `random_state` to DataFrame.sample.
random_state = 114514

## Creating Features

In [33]:
# Feature extraction functions
def sentence_length(text):
    """Count the sentences in ``text`` and the average words per sentence.

    Args:
        text: Value to analyse; it is converted with ``str()`` first.

    Returns:
        A ``(number_of_sentences, average_words_per_sentence)`` tuple, where
        words are whitespace-separated tokens. Blank text, or text in which
        the tokenizer finds no sentence, yields ``(0, 0.0)``.
    """
    text = str(text)
    # Blank text has no sentences. Short-circuiting here (and checking the
    # tokenizer result below) avoids dividing by zero when averaging.
    sentences = nltk.sent_tokenize(text) if text.strip() else []
    if not sentences:
        return 0, 0.0
    total_words = sum(len(sentence.split()) for sentence in sentences)
    return len(sentences), total_words / len(sentences)

def repetitivewords(text):
    """Score how often the words of ``text`` share WordNet synonyms.

    Every token is looked up in WordNet and each resulting synset is reduced
    to the set of its lemma names. The score is the number of synset pairs
    whose lemma-name sets overlap, divided by the number of tokens. Pairs are
    formed across all synsets, including several synsets of the same token.

    Args:
        text: Value to analyse; it is converted with ``str()`` and lower-cased.

    Returns:
        The overlap count per token as a float; ``0.0`` when the text
        contains no tokens.
    """
    text = str(text)
    tokens = nltk.word_tokenize(text.lower()) if text.strip() else []
    if not tokens:
        # No tokens means nothing can repeat; also avoids dividing by zero.
        return 0.0
    # Build each lemma-name set once instead of on every pair comparison.
    lemma_sets = [
        {lemma.name() for lemma in synset.lemmas()}
        for token in tokens
        for synset in w.synsets(token)
    ]
    repeat = sum(
        1
        for first, second in itertools.combinations(lemma_sets, 2)
        if not first.isdisjoint(second)
    )
    return repeat / len(tokens)

def entropy(text):
    """Return the base-2 Shannon entropy of the token distribution of ``text``.

    The text is converted with ``str()``, lower-cased and tokenized with
    ``nltk.word_tokenize``; each distinct token contributes
    ``-p * log2(p)`` where ``p`` is its relative frequency. Text without
    tokens yields ``0.0``.
    """
    words = nltk.word_tokenize(str(text).lower())
    word_count = len(words)
    frequencies = Counter(words)
    bits = 0.0
    # Accumulate term by term, in Counter order, using math.log(p, 2).
    for occurrences in frequencies.values():
        p = occurrences / word_count
        if p > 0:
            bits -= p * (math.log(p, 2))
    return bits

def avg_punctuation(text):
    """Return the average number of punctuation tokens per sentence.

    The function previously declared no parameter and therefore failed on its
    first line; it now takes ``text`` like the other feature extractors.

    Args:
        text: Value to analyse; it is converted with ``str()`` first.

    Returns:
        Punctuation-token count divided by sentence count, as a float.
        ``0.0`` when the text is blank or contains no sentences.
    """
    text = str(text)
    if not text.strip():
        return 0.0
    num_sent = len(nltk.sent_tokenize(text))
    if num_sent == 0:
        # Nothing to average over; avoids dividing by zero.
        return 0.0
    tokens = nltk.word_tokenize(text.lower())
    punctuation = set(string.punctuation)
    # A token counts when every character is punctuation, so multi-character
    # marks such as "..." are included (a plain `t in string.punctuation`
    # is a substring test and misses them).
    # Open question from the original author: what about the ' in "there's"
    # or "I'm", quotation marks, or numbered lists?
    punc_count = sum(
        1 for t in tokens if t and all(ch in punctuation for ch in t)
    )
    return punc_count / num_sent

### 1. gpt-3.5-turbo

In [34]:
def munge_turbo(files, set_features = False):
    """Build the gpt-3.5-turbo dataset from text files and save it as turbo.csv.

    Each line of each file is stripped; empty lines and lines with fewer than
    ``MIN_SENT_LENGTH`` whitespace-separated words are dropped. Every kept
    line becomes a row with ``generated`` set to 1.

    Args:
        files: Iterable of paths to text files.
        set_features: When true, also add the ``sent_length``,
            ``avg_sent_length``, ``repetitive_words`` and ``text_entropy``
            columns computed from each row's text.

    Returns:
        The resulting DataFrame. When ``ENTRY_NUMBER`` is truthy it holds a
        seeded random sample of at most ``ENTRY_NUMBER`` rows.

    Raises:
        ValueError: If ``files`` is empty.
    """
    dfs = []
    for file in files:
        # The context manager closes the handle as soon as the file is read.
        # NOTE(review): the file is decoded with the platform default encoding.
        with open(file) as handle:
            stripped = [line.strip() for line in handle]
        text_list = [
            t for t in stripped
            if t != "" and len(t.split()) >= MIN_SENT_LENGTH
        ]
        df = pd.DataFrame({'text': text_list,
                           'generated': 1})
        if set_features:
            # Build both columns from a list of pairs; unpacking zip(*...)
            # fails when a file contributes no rows.
            stats = [sentence_length(t) for t in text_list]
            df['sent_length'] = [count for count, _ in stats]
            df['avg_sent_length'] = [average for _, average in stats]
            df['repetitive_words'] = df['text'].apply(repetitivewords)
            df['text_entropy'] = df['text'].apply(entropy)
        dfs.append(df)
    if not dfs:
        raise ValueError("munge_turbo: no input files were given")
    gpt = pd.concat(dfs, ignore_index=True)
    if ENTRY_NUMBER:
        # Cap the sample size: sampling more rows than exist raises.
        gpt = gpt.sample(min(ENTRY_NUMBER, len(gpt)), random_state=random_state)
    gpt.to_csv('turbo.csv', index=False)
    return gpt
# Build turbo.csv from every .txt file under ../turbo_generator/data.
# As the cell's last expression, the returned frame is displayed below.
munge_turbo(glob.glob('../turbo_generator/data/*.txt'))

Unnamed: 0,text,generated
1316,One of the film’s chief strengths is its perfo...,1
9,The two crashes that claimed the lives of 346 ...,1
1462,"In conclusion, the recent video showing the se...",1
3289,"With such overwhelming devastation, it can be ...",1
3254,"In the 1970s, Kirstein turned his attention to...",1
...,...,...
5117,The escalation of tensions between the US and ...,1
5543,The use of language in a court of law is cruci...,1
3657,Among the nominees for Best Original Screenpla...,1
784,"Iran has been a hot topic in the news lately, ...",1


In [35]:
def munge_gpt3(files, min_words=15):
    """Build the scraped GPT-3 dataset from text files and save it as scraped_gpt3.csv.

    Each line of each file is stripped; empty lines and lines with fewer than
    ``min_words`` whitespace-separated words are dropped. Every kept line
    becomes a row with ``generated`` set to 1.

    Args:
        files: Iterable of paths to text files.
        min_words: Minimum word count for a line to be kept. The default of
            15 is the threshold this function has always used.

    Returns:
        The resulting DataFrame. When ``ENTRY_NUMBER`` is truthy it holds a
        seeded random sample of at most ``ENTRY_NUMBER`` rows.

    Raises:
        ValueError: If ``files`` is empty.
    """
    dfs = []
    for file in files:
        # The context manager closes the handle as soon as the file is read.
        # NOTE(review): the file is decoded with the platform default encoding.
        with open(file) as handle:
            stripped = [line.strip() for line in handle]
        text_list = [
            t for t in stripped
            if t != "" and len(t.split()) >= min_words
        ]
        dfs.append(pd.DataFrame({
            'text': text_list,
            'generated': 1,
        }))

    if not dfs:
        raise ValueError("munge_gpt3: no input files were given")
    gpt = pd.concat(dfs, ignore_index=True)
    if ENTRY_NUMBER:
        # Cap the sample size: sampling more rows than exist raises.
        gpt = gpt.sample(min(ENTRY_NUMBER, len(gpt)), random_state=random_state)
    gpt.to_csv('scraped_gpt3.csv', index=False)
    return gpt
# Build scraped_gpt3.csv from every .txt file under ../gpt_scraper/data.
# As the cell's last expression, the returned frame is displayed below.
munge_gpt3(glob.glob('../gpt_scraper/data/*.txt'))

Unnamed: 0,text,generated
13413,Another issue was the controversy surrounding ...,1
9426,The move was met with mixed reactions from the...,1
7257,"But as an outsider looking in, I can’t help bu...",1
845,"The answer, according to many Star Trek enthus...",1
3271,"In conclusion, the Five Star Movement’s risk o...",1
...,...,...
16509,"Today, the remnants of the Holy Land’s railway...",1
7185,"Once we find our pack, it’s important to nurtu...",1
9650,One of the standout dishes at Chao Thai Kitche...,1
7620,Da Toscano is the brainchild of Chef Michael T...,1


In [36]:
def munge_gpt(keyword, chunk = True):
    """Build a dataset from ``gpt_data/<keyword>*.csv`` and save it as ``<keyword>.csv``.

    The ``text`` column of every matching CSV is read, missing values are
    dropped and each entry is stripped. Every resulting text becomes a row
    with ``generated`` set to 1.

    Args:
        keyword: Filename prefix of the CSV files to read; also the stem of
            the output file.
        chunk: When true, split each text on newlines and keep only non-empty
            lines with at least ``MIN_SENT_LENGTH`` whitespace-separated
            words. When false, keep each text whole and unfiltered.

    Returns:
        The resulting DataFrame. When ``ENTRY_NUMBER`` is truthy it holds a
        seeded random sample of at most ``ENTRY_NUMBER`` rows.

    Raises:
        ValueError: If no file matches the pattern.
    """
    dfs = []
    # Sort the matches: glob order depends on the file system, and the seeded
    # sample below picks rows by position.
    # NOTE(review): the prefix pattern for e.g. 'small-117M' also matches
    # files starting with 'small-117M-k40' if such names exist - confirm the
    # file naming keeps the variants apart.
    files = sorted(glob.glob(f'gpt_data/{keyword}*.csv'))
    for file in files:
        raw_df = pd.read_csv(file)
        # Drop missing texts first; str(NaN) would otherwise become the
        # literal text "nan".
        text_list = [str(t).strip() for t in raw_df['text'].dropna()]
        if chunk:
            lines = itertools.chain.from_iterable(t.split('\n') for t in text_list)
            text_list = [
                t for t in lines
                if t != "" and len(t.split()) >= MIN_SENT_LENGTH
            ]
        dfs.append(pd.DataFrame({'text': text_list,
                                 'generated': 1}))

    if not dfs:
        raise ValueError(f"munge_gpt: no files match 'gpt_data/{keyword}*.csv'")
    gpt = pd.concat(dfs, ignore_index=True)
    if ENTRY_NUMBER:
        # Cap the sample size: sampling more rows than exist raises.
        gpt = gpt.sample(min(ENTRY_NUMBER, len(gpt)), random_state=random_state)
    gpt.to_csv(f'{keyword}.csv', index=False)
    return gpt

In [37]:
# Write one <keyword>.csv per keyword from the matching files in gpt_data/.
# Only the last call's return value is displayed as the cell output.
# NOTE(review): the keywords look like model-size / "-k40" variant names -
# confirm against the files in gpt_data/.
munge_gpt('small-117M-k40')
munge_gpt('small-117M')
munge_gpt('medium-345M-k40')
munge_gpt('medium-345M')
munge_gpt('large-762M-k40')
munge_gpt('large-762M')
munge_gpt('xl-1542M-k40')
munge_gpt('xl-1542M')

Unnamed: 0,text,generated
165307,The Federal Home and Community-Based Services ...,1
1969,Snake & Pistol The monkey mask is in a hole to...,1
170212,I am still waiting to hear back on two of the ...,1
183081,"We have not been able to solve this case, but ...",1
72960,McKenna is getting more benefits than just wit...,1
...,...,...
41502,This section concerns content related to Warcr...,1
131318,The U.S. stationing of an anti-missile defense...,1
105242,The 2099 panel is one of the highlights of thi...,1
45921,Jackson County officials are interested in own...,1


In [38]:
# Number of files from blogs/*.xml that get_blog_dfs loads; 0 loads none.
BLOG_FILE_NUM = 0 # we will be using scraped data only
def parse_blog_xml(file):
    """Extract the text of every ``<post>`` element in a blog XML file.

    Args:
        file: Path (or other source accepted by ``lxml.html.parse``) of the
            document to read.

    Returns:
        One string per ``<post>`` element, with every ``urlLink`` marker
        removed and surrounding whitespace stripped. A post without text
        yields an empty string.
    """
    tree = lxml.html.parse(file)
    # An element with no text has `.text` set to None; treat it as empty
    # instead of failing on `.replace`.
    return [
        (post.text or '').replace('urlLink', '').strip()
        for post in tree.findall('.//post')
    ]

def get_blog_dfs(fake=False):
    """Load the first ``BLOG_FILE_NUM`` blog XML files as labelled DataFrames.

    Args:
        fake: When false, every post is labelled ``generated = 0``. When
            true, each post gets a random 0/1 label with equal probability.

    Returns:
        A list with one DataFrame (columns ``text`` and ``generated``) per
        loaded file; empty when ``BLOG_FILE_NUM`` is 0 or no file matches.
    """
    # A dedicated generator seeded from the notebook's seed makes the random
    # labels repeatable from run to run.
    rng = random.Random(random_state)
    # Sort before slicing: glob order depends on the file system, so the
    # "first N files" would otherwise differ between machines.
    blog_files = sorted(glob.glob('blogs/*.xml'))[:BLOG_FILE_NUM]
    dfs = []
    for file in blog_files:
        posts = parse_blog_xml(file)
        generated = [1 if rng.random() < 0.5 else 0 for _ in posts] if fake else 0
        dfs.append(pd.DataFrame({'text': posts,
                                 'generated': generated}))
    return dfs

In [39]:
def process_human(fake=False):
    """Build the human-written dataset from ../nyt_scraper/data/*.txt.

    For each file the first line is skipped as the title. The remaining lines
    are stripped, and empty lines or lines with fewer than
    ``MIN_SENT_LENGTH`` whitespace-separated words are dropped.

    Args:
        fake: When false, every row is labelled ``generated = 0`` and the
            result is written to ``human.csv``. When true, each row gets a
            random 0/1 label with equal probability and the result is written
            to ``fake.csv``.

    Returns:
        The resulting DataFrame. When ``ENTRY_NUMBER`` is truthy it holds a
        seeded random sample of at most ``ENTRY_NUMBER`` rows.

    Raises:
        ValueError: If no input file is found.
    """
    # A dedicated generator seeded from the notebook's seed makes the random
    # labels repeatable from run to run.
    rng = random.Random(random_state)
    dfs = []
    # Sort the matches: glob order depends on the file system, and both the
    # label sequence and the seeded sample depend on row order.
    nyt_scraped_files = sorted(glob.glob('../nyt_scraper/data/*.txt'))
    for file in nyt_scraped_files:
        # The context manager closes the handle as soon as the file is read.
        # NOTE(review): the file is decoded with the platform default encoding.
        with open(file) as handle:
            stripped = [line.strip() for line in handle]
        body = stripped[1:] # skip titles
        # Same threshold as before (10), now read from the shared constant.
        text_list = [
            t for t in body
            if t != "" and len(t.split()) >= MIN_SENT_LENGTH
        ]
        generated = [1 if rng.random() < 0.5 else 0 for _ in text_list] if fake else 0
        dfs.append(pd.DataFrame({'text': text_list,
                                 'generated': generated}))

    if not dfs:
        raise ValueError("process_human: no files match '../nyt_scraper/data/*.txt'")
    human = pd.concat(dfs, ignore_index=True)
    outfile = 'fake.csv' if fake else 'human.csv'
    if ENTRY_NUMBER:
        # Cap the sample size: sampling more rows than exist raises.
        human = human.sample(min(ENTRY_NUMBER, len(human)), random_state=random_state)
    human.to_csv(outfile, index=False)
    return human

In [40]:
# Writes human.csv, with every row labelled generated = 0.
human = process_human(False)
# Writes fake.csv, built from the same source files but with random 0/1 labels.
# NOTE(review): this rebinds `human` to the randomly-labelled frame, so the
# frame from the previous line is only available through human.csv - confirm
# that is intended.
human = process_human(True)