**In this repository, we preprocess the generated text data and concatenate all of it into a single CSV file before using it for modelling.**

In [4]:
import pandas as pd

# Preprocess Function

In this step, all punctuation is removed from the text. Python's `string` library provides a predefined set of punctuation characters: ``!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~``

In [1]:
#library that contains punctuation
import string
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [2]:
#defining the function to remove punctuation
import re
def remove_punctuation(text):
    """Return ``text`` with every ASCII punctuation character removed.

    Only characters listed in ``string.punctuation`` are affected. They are
    dropped rather than replaced, so the characters on either side of a
    removed one end up adjacent (``"don't"`` becomes ``"dont"``).
    """
    kept_chars = []
    for ch in text:
        if ch in string.punctuation:
            continue
        kept_chars.append(ch)
    return "".join(kept_chars)
    
def deEmojify(text):
    """Return ``text`` with emoji characters deleted.

    Only code points inside the four ranges listed below are matched; a run
    of consecutive matches is removed as a whole and nothing is inserted in
    its place.
    """
    emoji_ranges = (
        "\U0001F600-\U0001F64F"  # emoticons
        "\U0001F300-\U0001F5FF"  # symbols & pictographs
        "\U0001F680-\U0001F6FF"  # transport & map symbols
        "\U0001F1E0-\U0001F1FF"  # flags (iOS)
    )
    return re.sub("[" + emoji_ranges + "]+", "", text, flags=re.UNICODE)


def preprocessing(text):
    """Normalise one text into lowercase, space-separated alphanumeric tokens.

    Steps, in order:
      1. delete anything that starts with ``http`` up to the next whitespace;
      2. delete ASCII punctuation (no space is inserted, so ``"don't"``
         becomes ``"dont"``);
      3. delete emoji matched by ``deEmojify``;
      4. replace every remaining character outside ``a-z``, ``A-Z`` and
         ``0-9`` with a space;
      5. lowercase and collapse all whitespace runs to single spaces, which
         also trims both ends.
    """
    without_links = re.sub(r'http\S+', '', text)
    without_emoji = deEmojify(remove_punctuation(without_links))
    # Newlines count as non-alphanumeric, so this substitution already turns
    # them into spaces; no separate newline handling is needed afterwards.
    alnum_only = re.sub(r'[^a-zA-Z0-9]', ' ', without_emoji)
    return " ".join(alnum_only.lower().split())
         

# GPT2 json

In [None]:
# Load the GPT-2 samples from their JSON export into a DataFrame
df1 = pd.read_json(path_or_buf='GPT2shakespeare.json')

In [None]:
# Add a cleaned copy of each text as a new column, then preview the first rows
df1['clean_msg'] = df1['text'].apply(preprocessing)
df1.head()

Unnamed: 0,text,Label,clean_msg
0,and I will live all my life in a state of mou...,GPT2,and I will live all my life in a state of mou...
1,"When you're in a game, you don't have to worr...",GPT2,When youre in a game you dont have to worry a...
2,This is the only scenario where he's had any ...,GPT2,This is the only scenario where hes had any s...
3,"but it was enough, for I had already seen it....",GPT2,but it was enough for I had already seen it A...
4,He would have a good deal of money to spend o...,GPT2,He would have a good deal of money to spend o...


In [None]:
# Reorder the columns so the cleaned text comes first
df1 = df1.reindex(columns=['clean_msg', 'Label', 'text'])
# Keep the cleaned text and its label; the raw text column is left out
df_gpt2 = df1.drop(columns=['text'])

In [None]:
# Write the cleaned GPT-2 frame to CSV; the row index is not written
df_gpt2.to_csv(path_or_buf='GPT2shakespeare.csv', encoding='utf-8', index=False)

# GPT_neo

In [None]:
# Load the GPT-Neo samples from their JSON export and display the frame
df2 = pd.read_json(path_or_buf='GPT_neo_jokes.json')
df2

Unnamed: 0,text,Label
0,"but I ask you, how shall I ever know?\n\nHow ...",GPTNeo
1,to the scene by a team of experts. With their...,GPTNeo
2,"never go to war,” she said.\n\n“I don’t think...",GPTNeo
3,a 'honest man'. I'm here to tell you that for...,GPTNeo
4,"\n That they could not stand in their way, w...",GPTNeo
...,...,...
495,", we should do the same. In the first place, h...",GPTNeo
496,", I can't sleep.\n\n– I'm going to see the gir...",GPTNeo
497,"\n\n""I can't tell you how much I want to drink...",GPTNeo
498,love for the moment was made to be. The searc...,GPTNeo


In [None]:
# Add a cleaned copy of each text as a new column, then preview the first rows
df2['clean_msg'] = df2['text'].apply(preprocessing)
df2.head()


Unnamed: 0,text,Label,clean_msg
0,"but I ask you, how shall I ever know?\n\nHow ...",GPTNeo,but i ask you how shall i ever know how can i ...
1,to the scene by a team of experts. With their...,GPTNeo,to the scene by a team of experts with their o...
2,"never go to war,” she said.\n\n“I don’t think...",GPTNeo,never go to war she said i don t think he had ...
3,a 'honest man'. I'm here to tell you that for...,GPTNeo,a honest man im here to tell you that for the ...
4,"\n That they could not stand in their way, w...",GPTNeo,that they could not stand in their way were th...


In [None]:
# Reorder the columns so the cleaned text comes first
df2 = df2.reindex(columns=['clean_msg', 'Label', 'text'])
# Keep the cleaned text and its label; the raw text column is left out
df_gptneo = df2.drop(columns=['text'])

In [None]:
# Write the cleaned GPT-Neo frame to CSV; the row index is not written
df_gptneo.to_csv(path_or_buf='GPT2_neo_shakespeare.csv', encoding='utf-8', index=False)

# LSTM

In [None]:
# Load the LSTM samples from CSV and display the frame
df_lstm = pd.read_csv(filepath_or_buffer='LSTM_Generation.csv')
df_lstm

In [6]:
# Build the cleaned frame in one chain: add the cleaned text as 'Text',
# label every row as LSTM, then leave out the uncleaned 'text' column.
df_lstm = (
    df_lstm
    .assign(Text=df_lstm['text'].apply(preprocessing), Label='LSTM')
    .drop(columns=['text'])
)

In [11]:
# Show one cleaned sample (row label 3) as an example
df_lstm.loc[3, 'Text']

'which thou hast often heard of and it is known to many in our land by the name of pitch this pitch as ancient writers do report doth defile so doth the company thou keepest for harry now i do not speak to thee in drink but in tears not in pleasure but in passion not in words only but in woes also and yet there is a virtuous man whom i have often noted in thy company but i know not his name what manner of man an it like your majesty a goodly portly man i faith and a corpulent of a cheerful look a pleasing eye and a most noble carriage and as i think his age some fifty or byr lady inclining to three score and now i remember me his name is falstaff if that man should be lewdly given he deceiveth me for harry i see virtue in his looks if then the tree may be known by the fruit as the fruit by the tree then peremptorily i speak it there is virtue in that falstaff him keep with the rest banish and tell me now thou naughty varlet tell me where hast thou been this month dost thou speak like a

In [12]:
# Write the cleaned, labelled LSTM frame to CSV; the row index is not written
df_lstm.to_csv(path_or_buf='Lstm.csv', index=False)

# Data concatenation

In [13]:
# Read the human corpus together with the three cleaned CSVs written earlier
# in this notebook. Relative paths are used so the reads resolve against the
# same working directory those files were written to, instead of only working
# when that directory happens to be /content.
df_human = pd.read_csv('human_Shakespeare.csv')
df_gpt2 = pd.read_csv('GPT2shakespeare.csv')
df_gptneo = pd.read_csv('GPT2_neo_shakespeare.csv')
df_lstm = pd.read_csv('Lstm.csv')

In [14]:
# Rename each generated frame's content column to the shared name 'text' so
# the columns line up when the frames are stacked.
df_gpt2 = df_gpt2.rename(columns={'clean_msg': "text"})
df_gptneo = df_gptneo.rename(columns={'clean_msg': "text"})
df_lstm = df_lstm.rename(columns={'Text': "text"})
# Stack the four sources row-wise into a single frame.
# NOTE(review): df_human is not passed through preprocessing() anywhere in this
# notebook, and the displayed Human rows still carry capitals and punctuation
# while the generated rows do not — confirm this difference is intended.
Dataset = pd.concat([df_human, df_lstm, df_gpt2, df_gptneo])

In [15]:
# Shuffle the rows and renumber the index from 0. A fixed random_state makes
# the row order — and therefore every file written from this frame —
# reproducible across runs.
Dataset = Dataset.sample(frac=1, random_state=42).reset_index(drop=True)

In [17]:
# Write the text and label columns to two separate files, one row per line
# with no header and no index (the format required by the textGCN model).
# NOTE(review): to_csv quotes any value containing a comma or quote, so rows
# that still carry punctuation would be written wrapped in quotes — confirm
# the downstream reader expects that.
Dataset['text'].to_csv('humanAI_sentence.txt', index=False, header=False)
Dataset['Label'].to_csv('humanAI.txt', index=False, header=False)

In [18]:
# Save the combined frame as a single CSV; the row index is not written
Dataset.to_csv(path_or_buf='humanvsAI.csv', index=False)

In [16]:
# Display the combined frame of text/label rows from all four sources
Dataset

Unnamed: 0,text,Label
0,And yet methinks your grace hath not done well...,Human
1,exeunt prince henry and lancaster ill follow a...,LSTM
2,cause i could not give up but i was too tired...,GPT2
3,her hair i dont know said the old lady but i h...,GPTNeo
4,in the alfred of the keeper is a man of great ...,GPTNeo
...,...,...
1995,the first thing i want to do is to get you out...,GPTNeo
1996,i am here the way the whole world is looking a...,GPTNeo
1997,and all in the night even the moonlight and al...,GPTNeo
1998,"Time was, I did him a desired office, Dear alm...",Human
