# Text Generation using GPT2 model

In [None]:
import tensorflow as tf
from transformers import GPT2LMHeadModel, GPT2Tokenizer

import pandas as pd
import re
%matplotlib inline

In [None]:
# Load the pretrained "gpt2-large" tokenizer and language-model head.
# The tokenizer's end-of-sequence token id is passed as the model's pad_token_id.
tokenizer = GPT2Tokenizer.from_pretrained("gpt2-large")
model = GPT2LMHeadModel.from_pretrained("gpt2-large", pad_token_id=tokenizer.eos_token_id)

In [None]:
# Import the supervised (labelled) dataset.
# The file is read as tab-separated with no header row; the two columns are
# named "label" and "message".

import pandas as pd
f_path = './dataset/SMSSpamCollection.csv'
dataset = pd.read_csv(f_path,sep='\t',names=["label","message"])

In [None]:
# Preview the first 10 rows of the supervised dataset
dataset.head(10)

In [None]:
# function for removing numbers and special characters

def text_preprocess(sen):
    """Reduce a message to letters separated by single spaces.

    The substitutions are applied in order:
      1. every character that is not an ASCII letter becomes a space;
      2. a single letter surrounded by whitespace is replaced by one space;
      3. each run of whitespace is collapsed to one space.

    A leading or trailing space that results from these steps is kept.

    Args:
        sen: The raw message text.

    Returns:
        The cleaned text.
    """
    substitutions = (
        ('[^a-zA-Z]', ' '),
        (r"\s+[a-zA-Z]\s+", ' '),
        (r'\s+', ' '),
    )
    for pattern, replacement in substitutions:
        sen = re.sub(pattern, replacement, sen)
    return sen

In [None]:
def text_generate(sentence, max_length=100):
    """Generate a GPT-2 continuation of ``sentence`` and return only the new text.

    Uses the module-level ``tokenizer`` and ``model``. Generation runs with
    5 beams, ``no_repeat_ngram_size=2`` and ``early_stopping=True``.

    Args:
        sentence: Prompt text to continue.
        max_length: Value passed to ``model.generate`` as ``max_length``; it
            bounds the total output length, prompt tokens included.
            Defaults to 100.

    Returns:
        The decoded continuation without the prompt. A single leading space
        (the usual separator between prompt and continuation) is removed.
    """
    input_ids = tokenizer.encode(sentence, return_tensors='pt')
    prompt_length = input_ids.shape[-1]

    # generate text until the output length (which includes the context length)
    # reaches max_length
    output = model.generate(
        input_ids,
        max_length=max_length,
        num_beams=5,
        no_repeat_ngram_size=2,
        early_stopping=True,
    )

    # Drop the prompt by token position instead of by character count: the
    # decoded text is not guaranteed to reproduce the prompt character for
    # character, and a continuation that does not start with a space would
    # otherwise lose its first character.
    generated = tokenizer.decode(output[0][prompt_length:], skip_special_tokens=True)

    if generated.startswith(' '):
        generated = generated[1:]
    return generated

In [None]:
# Message column of the supervised dataset; these texts are the prompts for generation
X = dataset["message"]  

## Removing numbers and special characters, then generating new sentences

In [None]:
# **** Takes about 90 hours ****

messages = list(X)
X_messages = []
for mes in messages:
    # Clean the original message, then keep the text GPT-2 generates from it.
    # Results are appended one by one so finished work survives an interruption.
    X_messages.append(text_generate(text_preprocess(mes)))

### Save the new synthetic dataset

In [None]:
import pandas as pd

# One generated message per row, in the same order as the source messages
df = pd.DataFrame(X_messages, columns=['message'])

df.head(10)
# Save under ./dataset/, which is where this notebook later reads
# SyntheticMessages.csv from; writing to the working directory left the
# loading cell pointing at a file that was never created.
df[['message']].to_csv('./dataset/SyntheticMessages.csv')


### Import the unsupervised synthetic dataset & basic supervised dataset

In [None]:
import pandas as pd
# Synthetic (generated, not yet labelled) messages
f_path = './dataset/SyntheticMessages.csv'
synthetic_dataset = pd.read_csv(f_path)

# Original labelled dataset: tab-separated, no header row, columns named here
f_path = './dataset/SMSSpamCollection.csv'
dataset = pd.read_csv(f_path,sep='\t',names=["label","message"])

In [None]:
# Preview the first 10 rows of the synthetic dataset
synthetic_dataset.head(10)

### Data Preprocessing

In [None]:
# removing numbers and special characters

def text_preprocess(sen):
    """Return ``sen`` with only letters left, separated by single spaces.

    Non-letter characters are turned into spaces, a single letter standing
    between whitespace is replaced by one space, and whitespace runs are
    collapsed to one space. A resulting leading or trailing space is kept.
    """
    letters_only = re.sub('[^a-zA-Z]', ' ', sen)
    without_single_letters = re.sub(r"\s+[a-zA-Z]\s+", ' ', letters_only)
    return re.sub(r'\s+', ' ', without_single_letters)

In [None]:
# Message column of the supervised dataset (input to the preprocessing below)
X = dataset["message"] 

In [None]:
# Clean every supervised message and collect the results in a plain list
messages = list(X)
X_messages = [text_preprocess(mes) for mes in messages]

In [None]:
# Converting Text to Numbers
from nltk.corpus import stopwords 
from sklearn.feature_extraction.text import TfidfVectorizer 

# Fit a TF-IDF vectorizer on the cleaned supervised messages, with English stop
# words removed and the vocabulary limited by max_features / min_df / max_df.
# NOTE(review): the pickled classifiers loaded later presumably expect features
# from a vectorizer fitted with exactly these settings on the same data - confirm
# against the training notebook.
# X is rebound here from the message column to the TF-IDF matrix.
tfidf_vec = TfidfVectorizer (max_features=2500, min_df=7, max_df=0.8, stop_words=stopwords.words('english'))
X = tfidf_vec.fit_transform(X_messages)

In [None]:
# Show the first cleaned message as a sanity check
X_messages[0]

## Predict labels for the synthetic dataset with all supervised models

In [None]:
# load trained models

import pickle


def _load_model(path):
    """Unpickle and return the object stored in the file at ``path``."""
    with open(path, 'rb') as f:
        return pickle.load(f)


# NOTE: unpickling can execute code embedded in the file, so these model
# files must come from a trusted source.
nb_model = _load_model('./trained_models/NaiveBayes_model')
rf_model = _load_model('./trained_models/randomForest_model')
svm_model = _load_model('./trained_models/SVM_model')
kn_model = _load_model('./trained_models/KNeighbors_model')
dt_model = _load_model('./trained_models/DecisionTree_model')
b_model = _load_model('./trained_models/Bagging_model')
ab_model = _load_model('./trained_models/AdaBoost_model')

In [None]:
# Convert the messages of synthetic dataset to list
# (despite its name, df is a plain Python list from here on, not a DataFrame)
df = list(synthetic_dataset['message'])

# Convert the label of basic dataset to list
label_expected = list(dataset['label'])

In [None]:
# Checks with 7 algorithms whether the SMS is spam or not (majority vote)

def isSpam(sms):
    """Return True when more of the loaded classifiers vote spam than ham.

    Each of the seven module-level models is asked to predict ``sms``. A
    prediction that compares equal to 'ham' counts as a ham vote; any other
    prediction counts as a spam vote.

    Args:
        sms: Input handed unchanged to every model's ``predict``; the calling
            cell passes the dense TF-IDF vector of one message.

    Returns:
        True if spam votes outnumber ham votes, otherwise False.
    """
    classifiers = (
        nb_model,
        rf_model,
        svm_model,
        kn_model,
        dt_model,
        b_model,
        ab_model,
    )

    ham_votes = 0
    for classifier in classifiers:
        if classifier.predict(sms) == 'ham':
            ham_votes += 1
    spam_votes = len(classifiers) - ham_votes

    return spam_votes > ham_votes
    

In [None]:
def isNaN(string):
    """Tell whether ``string`` is NaN, using the fact that NaN is unequal to itself.

    Ordinary strings and numbers compare equal to themselves, so the result
    is False for them and True for a float NaN.
    """
    differs_from_itself = string != string
    return differs_from_itself

In [None]:
# Classification for messages we created with the GPT2
dataset_with_label = []

# Walk the synthetic messages together with the labels of the original
# dataset, pairing them by position (assumes both lists are in the same order
# as the messages the synthetic texts were generated from).
# zip visits every pair, including the last one, which the previous
# range(0, len(df)-1) loop never reached.
for message, expected in zip(df, label_expected):
    if isNaN(message):
        # Missing message (NaN): nothing to classify
        continue

    sen = tfidf_vec.transform([message]).toarray()  # TF-IDF features as a dense array
    predicted = 'spam' if isSpam(sen) else 'ham'

    # Keep the message only when the ensemble vote agrees with the original label
    if predicted == expected:
        dataset_with_label.append([message, predicted])


In [None]:
# Convert the list of [message, label] rows for the synthetic data to a dataframe
df_with_label = pd.DataFrame (dataset_with_label, columns = ['message','label'])

In [None]:
# Save the new synthetic dataset with label.
# Written under ./dataset/ because that is where this notebook reads
# SyntheticMessages_WithLabel.csv back from; the bare file name put it in the
# working directory instead.
df_with_label.to_csv('./dataset/SyntheticMessages_WithLabel.csv')

In [None]:
# Synthetic dataset with labels (SyntheticMessages_WithLabel.csv);
# rebinds synthetic_dataset to this labelled version

import pandas as pd
f_path = './dataset/SyntheticMessages_WithLabel.csv'
synthetic_dataset = pd.read_csv(f_path)

In [None]:
# Preview the first 10 rows of the labelled synthetic dataset
synthetic_dataset.head(10)