# Pre-Processing for Decision Tree

In [1]:
import nltk
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
nltk.download('punkt')
import statistics
import pandas as pd
import re
import os, glob

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\lihon\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
# Count the tokens that NLTK's word tokenizer finds in a string.
def words_num(paragraph):
    """Return the number of tokens ``word_tokenize`` produces for *paragraph*."""
    return len(word_tokenize(paragraph))

In [3]:
# Analyse the sentence structure of a given string.
def sentence_num(paragraph):
    """Summarise the sentence-length structure of *paragraph*.

    Returns a 5-tuple, in this order:
        1. number of sentences found by ``sent_tokenize``
        2. number of sentences with fewer than 11 tokens
        3. number of sentences with more than 34 tokens
        4. sum of the absolute token-count differences between consecutive
           sentences, divided by the number of sentences
        5. population standard deviation of the per-sentence token counts

    Text that contains no sentences (e.g. an empty file) yields all zeros
    rather than raising ZeroDivisionError / statistics.StatisticsError.
    """
    sentences = sent_tokenize(paragraph)
    if not sentences:
        # Nothing to measure: avoid dividing by zero and calling pstdev on
        # an empty list.
        return 0, 0, 0, 0.0, 0.0

    lengths = [words_num(sentence) for sentence in sentences]
    short_count = sum(1 for length in lengths if length < 11)
    long_count = sum(1 for length in lengths if length > 34)

    # Pair every sentence length with the next one to measure the jumps.
    consecutive_diffs = [abs(nxt - cur) for cur, nxt in zip(lengths, lengths[1:])]

    return (
        len(sentences),
        short_count,
        long_count,
        sum(consecutive_diffs) / len(sentences),
        statistics.pstdev(lengths),
    )

In [4]:
# Empty feature table used for model training; the column order below is the
# order in which add_to_df fills in each row.
df = pd.DataFrame(
    columns=[
        # paragraph size
        "sentence_per_paragraph",
        "words_per_paragraph",
        # punctuation flags
        ")present",
        "-present",
        ";or:present",
        "?present",
        # sentence-length statistics
        "standard_dev_in_sentence_len",
        "length_diff_in_consective_sentence",
        "sentence<11",
        "sentence>34",
        # keyword flags
        "contains_although",
        "contains_However",
        "contains_but",
        "contains_because",
        "contains_this",
        "contains_others_or_researchers",
        # character-level flags
        "contains_num",
        "contains_more_capitals",
        "contains_et",
        # label
        "class",
    ]
)

In [5]:
# Extract all features needed for model training from one text and store them.
def add_to_df(sentences, fake_or_real):
    """Append one feature row for the text *sentences* to the global ``df``.

    Args:
        sentences: the full text of one document, as a single string.
        fake_or_real: class label written to the last ("class") column; the
            callers in this file pass 0 for AI-generated text and 1 for
            human-written text.

    The row is assigned positionally, so the values below must stay in the
    same order as the columns of ``df``.
    """
    words = words_num(sentences)
    sent, low, high, consec, dev = sentence_num(sentences)
    lowered = sentences.lower()

    # "More capitals" means: more than two capital letters per full stop.
    # Raw strings keep the regex escapes from being read as string escapes.
    capital_count = len(re.findall(r"[A-Z]", sentences))
    period_count = len(re.findall(r"\.", sentences))

    punctuation_flags = [
        ")" in sentences,
        "-" in sentences,
        ";" in sentences or ":" in sentences,
        "?" in sentences,
    ]

    # NOTE: these are plain substring checks on the lower-cased text, so a
    # keyword also matches inside a longer word (e.g. "et" in "between").
    keyword_flags = [
        "although" in lowered,
        "however" in lowered,
        "but" in lowered,
        "because" in lowered,
        "this" in lowered,
        "others" in lowered or "researchers" in lowered,
    ]

    row = (
        [sent, words]
        + punctuation_flags
        + [dev, consec, low, high]
        + keyword_flags
        + [
            bool(re.search(r"\d", sentences)),
            capital_count > 2 * period_count,
            "et" in lowered,
            fake_or_real,
        ]
    )
    # Using the current row count as the label appends a new row at the end.
    df.loc[len(df.index)] = row

In [6]:
# Directories holding the training data set, resolved against the current
# working directory. os.path.join uses the platform's own separator; a
# hard-coded "\\" only forms a valid directory path on Windows.
path_fake = os.path.join(os.getcwd(), "fake_arts")
path_real = os.path.join(os.getcwd(), "real_arts")

In [7]:
# Extract all features for AI-generated content (class label 0).
# glob returns matches in arbitrary order, so sort them to keep the row order
# of the resulting table reproducible between runs.
for txt_path in sorted(glob.glob(os.path.join(path_fake, "*.txt"))):
    # glob already yields paths prefixed with path_fake, so open them as-is.
    # NOTE(review): cp437 is presumably used because it can decode any byte
    # without raising - confirm it matches the files' real encoding.
    with open(txt_path, 'r', encoding="cp437") as f:
        add_to_df(f.read(), 0)

In [8]:
# Extract all features for human-written content (class label 1).
# glob returns matches in arbitrary order, so sort them to keep the row order
# of the resulting table reproducible between runs.
for txt_path in sorted(glob.glob(os.path.join(path_real, "*.txt"))):
    # glob already yields paths prefixed with path_real, so open them as-is.
    # NOTE(review): cp437 is presumably used because it can decode any byte
    # without raising - confirm it matches the files' real encoding.
    with open(txt_path, 'r', encoding="cp437") as f:
        add_to_df(f.read(), 1)

In [9]:
# Export the collected feature table as a CSV file (row index not written).
df.to_csv(
    "testdata.csv",
    index=False,
    encoding='utf-8',
)