In [73]:
import pandas as pd
import glob
import lxml.html
import random
import itertools
import math
import nltk
import string
from collections import Counter
from nltk.corpus import wordnet as w
# Minimum number of whitespace-separated words a line must contain to be kept by munge_turbo.
MIN_SENT_LENGTH = 5

## Creating Features

In [74]:
# Feature extraction functions
def sentence_length(text):
    """Count the sentences in ``text`` and the average number of words per sentence.

    ``text`` is coerced with ``str()``, split into sentences with
    ``nltk.sent_tokenize``, and each sentence's words are counted by splitting
    on whitespace.

    Args:
        text: Value to analyse; anything accepted by ``str()``.

    Returns:
        tuple: ``(number_of_sentences, average_words_per_sentence)``.
        ``(0, 0.0)`` is returned when the tokenizer finds no sentences.
    """
    sentences = nltk.sent_tokenize(str(text))
    sentence_count = len(sentences)
    # Guard: an empty text yields no sentences, which previously caused a
    # ZeroDivisionError when computing the average.
    if sentence_count == 0:
        return 0, 0.0
    word_total = sum(len(sentence.split()) for sentence in sentences)
    return sentence_count, word_total / sentence_count

def repetitivewords(text):
    """Score how often the words of ``text`` share WordNet lemmas with one another.

    Every token of the lower-cased text is looked up in WordNet (``w.synsets``).
    Each resulting synset is reduced to the set of its lemma names, and every
    unordered pair of synsets that has at least one lemma name in common counts
    as one repetition. Synsets of a repeated token are looked up again, so they
    pair with each other too.

    Args:
        text: Value to analyse; anything accepted by ``str()``.

    Returns:
        float: number of overlapping synset pairs divided by the number of
        tokens, or ``0.0`` when the text has no tokens.
    """
    tokens = nltk.word_tokenize(str(text).lower())
    # Guard: previously an empty text divided by zero.
    if not tokens:
        return 0.0

    # Build each lemma-name set once instead of re-creating two sets for
    # every pair inside the quadratic comparison below.
    lemma_sets = [
        {lemma.name() for lemma in synset.lemmas()}
        for token in tokens
        for synset in w.synsets(token)
    ]
    overlapping_pairs = sum(
        1
        for first, second in itertools.combinations(lemma_sets, 2)
        if not first.isdisjoint(second)
    )
    return overlapping_pairs / len(tokens)

def entropy(text):
    """Return the base-2 Shannon entropy of the token distribution of ``text``.

    The text is coerced with ``str()``, lower-cased and tokenized with
    ``nltk.word_tokenize``; each distinct token's relative frequency is used
    as its probability. A text without tokens yields ``0.0``.
    """
    words = nltk.word_tokenize(str(text).lower())
    word_total = len(words)
    # Relative frequency of every distinct token, in first-seen order.
    probabilities = [freq / word_total for freq in Counter(words).values()]

    bits = 0.0
    for p in probabilities:
        if p > 0:
            bits -= p * (math.log(p, 2))
    return bits

def avg_punctuation(text):
    """Return the average number of punctuation tokens per sentence in ``text``.

    The previous version declared no parameter yet read ``text``, so every
    call raised ``UnboundLocalError``; ``text`` is now an explicit argument,
    matching the other feature functions.

    A token counts as punctuation when it is made up solely of characters from
    ``string.punctuation``, so a multi-character token such as ``...`` counts
    once. Tokens containing any other character are not counted.
    NOTE(review): how apostrophes, quotation marks and numbered lists are
    split depends on the tokenizer — confirm the counts match expectations.

    Args:
        text: Value to analyse; anything accepted by ``str()``.

    Returns:
        float: punctuation tokens divided by the number of sentences, or
        ``0.0`` when the tokenizer finds no sentences.
    """
    text = str(text)
    sentence_count = len(nltk.sent_tokenize(text))
    # Guard against dividing by zero on empty text.
    if sentence_count == 0:
        return 0.0
    tokens = nltk.word_tokenize(text.lower())
    punctuation_chars = set(string.punctuation)
    punctuation_tokens = sum(
        1
        for token in tokens
        if token and all(ch in punctuation_chars for ch in token)
    )
    return punctuation_tokens / sentence_count

### 1. gpt-3.5-turbo

In [90]:
def munge_turbo(files, set_features = False):
    """Collect generated text lines from ``files`` into a labelled DataFrame.

    Every line of every file is stripped; non-empty lines with at least
    ``MIN_SENT_LENGTH`` whitespace-separated words are kept and labelled
    ``generated = 1``. The result is written to ``turbo.csv`` in the current
    working directory and returned.

    Args:
        files: Iterable of paths to text files.
        set_features: When true, also add the ``sent_length``,
            ``avg_sent_length``, ``repetitive_words`` and ``text_entropy``
            columns computed by the feature functions.

    Returns:
        pandas.DataFrame: one row per kept line, with a fresh integer index.

    Raises:
        ValueError: if ``files`` is empty.
    """
    files = list(files)
    if not files:
        # Fail with a clear message instead of pandas' generic concat error,
        # and before turbo.csv is touched.
        raise ValueError("munge_turbo: no input files were given")

    texts = []
    for path in files:
        # The context manager closes each file; the old code leaked handles.
        with open(path) as handle:
            for raw_line in handle:
                candidate = raw_line.strip()
                if candidate != "" and len(candidate.split()) >= MIN_SENT_LENGTH:
                    texts.append(candidate)

    gpt = pd.DataFrame({'text': texts, 'generated': 1})
    if set_features:
        # Building the columns from plain lists also works when no line
        # qualified; unpacking zip(*...) of an empty result used to raise.
        length_stats = [sentence_length(t) for t in texts]
        gpt['sent_length'] = [count for count, _ in length_stats]
        gpt['avg_sent_length'] = [average for _, average in length_stats]
        gpt['repetitive_words'] = [repetitivewords(t) for t in texts]
        gpt['text_entropy'] = [entropy(t) for t in texts]

    gpt.to_csv('turbo.csv', index=False)
    return gpt
# Process every .txt file in the turbo generator's data directory (no feature columns); this writes turbo.csv.
munge_turbo(glob.glob('../turbo_generator/data/*.txt'))

Unnamed: 0,text,generated
0,"As Hong Kong's political turmoil continues, it...",1
1,"In June 2019, the Hong Kong government propose...",1
2,"The international community has spoken out, bu...",1
3,"According to The New York Times, wealthy Hong ...",1
4,This phenomenon of wealthy residents fleeing t...,1
...,...,...
6270,"Firstly, Suleimani was a key figure in Iran's ...",1
6271,"Secondly, Suleimani was known to be a close co...",1
6272,"However, some analysts argue that the U.S. may...",1
6273,"Furthermore, there are concerns that the U.S. ...",1


In [51]:
def munge_gpt3(files, min_words=15):
    """Collect generated text lines from ``files`` and compute their features.

    Every line of every file is stripped; non-empty lines with at least
    ``min_words`` whitespace-separated words are kept and labelled
    ``generated = 1``. The ``sent_length``, ``avg_sent_length``,
    ``repetitive_words`` and ``text_entropy`` columns are then filled in by
    the feature functions. The result is written to ``gpt3.csv`` in the
    current working directory and returned.

    Args:
        files: Iterable of paths to text files.
        min_words: Minimum word count for a line to be kept (default 15, the
            value that was previously hard-coded).

    Returns:
        pandas.DataFrame: one row per kept line, with a fresh integer index.

    Raises:
        ValueError: if ``files`` is empty.
    """
    files = list(files)
    if not files:
        # Fail with a clear message instead of pandas' generic concat error,
        # and before gpt3.csv is touched.
        raise ValueError("munge_gpt3: no input files were given")

    texts = []
    for path in files:
        # The context manager closes each file; the old code leaked handles.
        with open(path) as handle:
            for raw_line in handle:
                candidate = raw_line.strip()
                if candidate != "" and len(candidate.split()) >= min_words:
                    texts.append(candidate)

    gpt = pd.DataFrame({'text': texts, 'generated': 1})
    # Building the columns from plain lists also works for a file in which no
    # line qualified; unpacking zip(*...) of an empty result used to raise.
    length_stats = [sentence_length(t) for t in texts]
    gpt['sent_length'] = [count for count, _ in length_stats]
    gpt['avg_sent_length'] = [average for _, average in length_stats]
    gpt['repetitive_words'] = [repetitivewords(t) for t in texts]
    gpt['text_entropy'] = [entropy(t) for t in texts]

    gpt.to_csv('gpt3.csv', index=False)
    return gpt

In [37]:
def concatenate_gpt(files, chunk = True):
    """Merge the ``text`` column of several CSV files into one labelled frame.

    Each value of the ``text`` column is converted with ``str()`` and
    stripped. With ``chunk`` enabled, every text is split on newlines and only
    non-empty pieces of at least 10 whitespace-separated words are kept; with
    ``chunk`` disabled the stripped texts are used unchanged. All rows are
    labelled ``generated = 1``, the combined frame is written to ``gpt.csv``
    and returned.
    """
    frames = []
    for csv_path in files:
        source = pd.read_csv(csv_path)
        passages = [str(entry).strip() for entry in source['text']]
        if chunk:
            # Flatten the per-text line lists, then drop short pieces.
            pieces = []
            for passage in passages:
                pieces.extend(passage.split('\n'))
            passages = [p for p in pieces if p != "" and len(p.split()) >= 10]
        frames.append(pd.DataFrame({'text': passages,
                                    'generated': 1}))

    combined = pd.concat(frames, ignore_index=True)
    combined.to_csv('gpt.csv', index=False)
    return combined

In [38]:
# Chunk the texts of the matching CSV into lines of at least 10 words; this writes gpt.csv.
concatenate_gpt(glob.glob('gpt_data/xl-1542M-k40.test.csv'))

Unnamed: 0,text,generated
0,When I first heard about what a lot of people ...,1
1,And so I am quite surprised to hear of this ne...,1
2,This is a blatant attempt to limit trans* righ...,1
3,"First, let's talk about religious liberty. And...",1
4,"Now, you may be saying, 'But that doesn't seem...",1
...,...,...
54178,"Of course, the inequality gap is not just in a...",1
54179,The growth of wealth inequality has become a g...,1
54180,"In short, wealth inequality within and between...",1
54181,This is where public welfare measures to ensur...,1


In [9]:
def parse_blog_xml(file):
    """Extract the text of every ``<post>`` element found in ``file``.

    The file is parsed with ``lxml.html.parse``. For each post the literal
    marker ``urlLink`` is removed and surrounding whitespace is stripped.

    Args:
        file: Path or file object accepted by ``lxml.html.parse``.

    Returns:
        list[str]: one cleaned string per ``<post>`` element, in document
        order. A post without text yields ``''``.
    """
    tree = lxml.html.parse(file)
    post_els = tree.findall('.//post')
    # ``.text`` is None for an element with no leading text; fall back to ''
    # so such a post no longer raises AttributeError.
    return [(post.text or '').replace('urlLink', '').strip() for post in post_els]

In [20]:
# Number of blog XML files to include; currently referenced only by the commented-out blog section of process_human.
BLOG_FILE_NUM = 0 # change this to add blog data
def process_human(fake=False, seed=None):
    """Build the human-written dataset from the scraped NYT text files.

    Every ``../nyt_scraper/data/*.txt`` file is read, its first line (the
    title) is skipped, and the remaining stripped lines are kept when they are
    non-empty and have at least 10 whitespace-separated words.

    Args:
        fake: When false, every row is labelled ``generated = 0`` and the
            result is written to ``human.csv``. When true, each row gets a
            random 0/1 label (probability 0.5 each) and the result is written
            to ``fake.csv``.
        seed: Optional seed for the random labels used when ``fake`` is true.
            ``None`` (the default) keeps using the module-level ``random``
            generator, as before.

    Returns:
        pandas.DataFrame: columns ``text`` and ``generated`` with a fresh
        integer index.

    Raises:
        ValueError: if no input file was found.
    """
    # A private generator makes the fake labels reproducible on request
    # without disturbing the global random state.
    rng = random.Random(seed) if seed is not None else random
    dfs = []

    # Disabled source: NYT abstracts from CSV files.
    # files = glob.glob('nyt_data/*.csv')
    # for file in files:
    #     raw_df = pd.read_csv(file)
    #     text_list = [str(t).strip() for t in raw_df['abstract']]
    #     generated = [1 if rng.random() < 0.5 else 0 for _ in text_list] if fake else 0
    #     df = pd.DataFrame({'text': text_list,
    #                    'generated': generated})
    #     dfs.append(df)

    nyt_scraped_files = glob.glob('../nyt_scraper/data/*.txt')
    for file in nyt_scraped_files:
        # The context manager closes each file; the old code leaked handles.
        with open(file) as handle:
            text_list = [line.strip() for line in handle]
        text_list = text_list[1:] # skip titles
        text_list = [t for t in text_list if t != "" and len(t.split()) >= 10]
        generated = [1 if rng.random() < 0.5 else 0 for _ in text_list] if fake else 0
        dfs.append(pd.DataFrame({'text': text_list,
                                 'generated': generated}))

    # Disabled source: blog posts parsed from XML.
    # blog_files = glob.glob('blogs/*.xml')[:BLOG_FILE_NUM]
    # for file in blog_files:
    #     posts = parse_blog_xml(file)
    #     generated = [1 if rng.random() < 0.5 else 0 for _ in posts] if fake else 0
    #     df = pd.DataFrame({'text': posts,
    #                    'generated': generated})
    #     dfs.append(df)

    if not dfs:
        # Fail with a clear message instead of pandas' generic concat error.
        raise ValueError("process_human: no input files matched '../nyt_scraper/data/*.txt'")

    human = pd.concat(dfs, ignore_index=True)
    outfile = 'fake.csv' if fake else 'human.csv'
    human.to_csv(outfile, index=False)
    return human

In [21]:
# process_human(True)
# Build the human-written dataset with every row labelled generated=0; the call also writes human.csv.
human = process_human(False)

In [39]:
# Display the frame returned by process_human(False).
human

Unnamed: 0,text,generated
0,WASHINGTON — President Trump held what he call...,0
1,"“I have many issues,” Mr. Trump said in announ...",0
2,"To mark the occasion, several high-ranking adm...",0
3,"The executive order, according to White House,...",0
4,"According to Polaris, a nonprofit organization...",0
...,...,...
94264,"The program, The Globe wrote, was “a penetrati...",0
94265,Ms. Jarvis began filming in China in August 19...,0
94266,Ms. Jarvis left NBC in 1976 and founded her ow...,0
94267,"In addition to “Junon and Avos,” which Ms. Jar...",0
