# Functions!

In [75]:
import pandas as pd
import numpy as np
import string
import re
import nltk
from nltk.corpus import cmudict
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from collections import Counter
from textblob import TextBlob
from gensim.models import LsiModel
from gensim import corpora
from gensim.models.coherencemodel import CoherenceModel
import textstat


In [48]:
def basic_cleaning(text):
    """Normalise a raw text for feature extraction.

    Steps, in order: trim surrounding whitespace, lower-case, remove
    HTML-like tags, drop digits, and replace every ASCII punctuation
    character with a space.

    Args:
        text (str): Raw input text.

    Returns:
        str: The cleaned text. Runs of spaces left by removed punctuation
        are not collapsed.
    """
    cleaned = text.strip().lower()
    # Tags must be removed *before* punctuation: "<" and ">" are part of
    # string.punctuation, so stripping punctuation first would leave the tag
    # names behind as ordinary words. Requiring a letter right after "<"
    # keeps comparisons such as "a < b" from being treated as tags.
    cleaned = re.sub(r"</?[a-z][^<>]*>", " ", cleaned)
    cleaned = "".join(char for char in cleaned if not char.isdigit())
    # Map every punctuation character to a single space in one pass.
    punctuation_to_space = str.maketrans(
        string.punctuation, " " * len(string.punctuation)
    )
    return cleaned.translate(punctuation_to_space)

In [49]:
def cons_density(text):
    """Return the share of consonants among the alphabetic characters of ``text``.

    A letter is a vowel when its lower-cased form is one of ``a e i o u``;
    every other alphabetic character counts as a consonant.

    Args:
        text (str): Text to analyse.

    Returns:
        float: consonants / (vowels + consonants), rounded to 3 decimals,
        or ``0.0`` when the text contains no alphabetic character.
    """
    letters = sum(1 for char in text if char.isalpha())
    if letters == 0:
        # Nothing to measure; avoid dividing by zero on empty or
        # punctuation-only input.
        return 0.0
    vowels = sum(1 for char in text if char.isalpha() and char.lower() in "aeiou")
    return round((letters - vowels) / letters, 3)

In [69]:
# Lazily populated cache of the CMU pronouncing dictionary. Loading it is
# expensive, so it must happen once per kernel, not once per word.
_CMU_DICT = None


def _get_cmu_dict():
    """Return the CMU pronouncing dictionary, loading it on first use only."""
    global _CMU_DICT
    if _CMU_DICT is None:
        _CMU_DICT = cmudict.dict()
    return _CMU_DICT


def get_word_stress(word):
    """Sum the digits found in the first CMU pronunciation of ``word``.

    In the CMU dictionary the digits attached to phonemes are stress
    markers, so the sum serves as a stress score for the word.

    Args:
        word (str): Word to look up; the lookup is an exact key match.

    Returns:
        int: Sum of the digit characters in the first listed pronunciation,
        or ``0`` when the word is not in the dictionary.
    """
    pronunciations = _get_cmu_dict().get(word)
    if not pronunciations:
        return 0
    return sum(
        int(char)
        for phoneme in pronunciations[0]
        for char in phoneme
        if char.isdigit()
    )

def get_sentence_stress(sentence):
    """Return the total stress score of a sentence.

    The sentence is split on whitespace and the per-word scores from
    ``get_word_stress`` are added together.

    Args:
        sentence (str): Text to score.

    Returns:
        int: Sum of the word stress scores (``0`` for an empty sentence).
    """
    total = 0
    for token in sentence.split():
        total += get_word_stress(token)
    return total

In [51]:
def redundance(text):
    """Count the lemmas that are over-represented in ``text``.

    The text is tokenised, English stop words are dropped, and every
    remaining token is lemmatised (as a verb first, then as a noun when the
    verb form left it unchanged). A lemma is considered redundant when its
    number of occurrences exceeds three times the mean occurrence count of
    all lemmas in the text.

    Args:
        text (str): Text to analyse.

    Returns:
        int: Number of redundant lemmas; ``0`` when no token remains after
        stop-word removal.
    """
    tokens = word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    # Count occurrences per surface form so that repetition is preserved and
    # each distinct word is lemmatised only once.
    token_counts = Counter(w for w in tokens if w not in stop_words)
    if not token_counts:
        return 0

    lemmatizer = WordNetLemmatizer()
    lemma_counts = Counter()
    for word, occurrences in token_counts.items():
        lemma = lemmatizer.lemmatize(word, pos='v')
        if lemma == word:
            # The verb lemmatiser left the word untouched; try it as a noun.
            lemma = lemmatizer.lemmatize(word, pos='n')
        lemma_counts[lemma] += occurrences

    mean_freq = sum(lemma_counts.values()) / len(lemma_counts)

    return sum(1 for count in lemma_counts.values() if count > 3 * mean_freq)

In [52]:
def sentiment_polarity(text):
    """Return the magnitude of the TextBlob sentiment polarity of ``text``.

    The polarity is rounded to 3 decimals and its sign is discarded, so the
    result only says how strongly polarised the text is, not in which
    direction.
    """
    blob = TextBlob(text)
    rounded_polarity = round(blob.sentiment.polarity, 3)
    return abs(rounded_polarity)

In [53]:
# Words treated as typical of AI-generated text. Stored as a frozenset so that
# membership tests are constant-time and the collection is built only once.
# NOTE(review): 'beyoud' looks like a typo for 'beyond' — confirm before changing.
_COMMON_AI_WORDS = frozenset({
    "commendable", 'transhumanist', 'meticulous', 'elevate', 'hello', 'tapestry', 'leverage',
    'journey', 'headache', 'resonate', 'testament', 'explore', 'binary', 'delve',
    'enrich', 'seamless', 'multifaceted', 'sorry', 'foster', 'convey', 'beacon',
    'interplay', 'oh', 'navigate', 'form', 'adhere', 'cannot', 'landscape', 'remember',
    'paramount', 'comprehensive', 'placeholder', 'grammar', 'real', 'summary', 'symphony',
    'furthermore', 'relationship', 'ultimately', 'profound', 'art', 'supercharge', 'evolve',
    'beyoud', 'reimagine', 'vibrant', 'robust', 'pivotal', 'certainly', 'quinoa', 'orchestrate', 'align',
    'diverse', 'recommend', 'annals', 'note', 'employ', 'bustling', 'indeed', 'digital', 'enigma', 'outfit',
    'indelible', 'refrain', 'culture', 'treat', 'emerge', 'esteemed', 'weight', 'whimsical', 'bespoke',
    'highlight', 'antagonist', 'unlock', 'key', 'breakdown', 'tailor', 'misinformation', 'treasure', 'paradigm', 'captivate',
    'song', 'underscore', 'calculate', 'especially', 'climate', 'hedging', 'inclusive', 'exercise', 'ai', 'embrace',
    'level', 'nuance', 'career', 'dynamic', 'accent', 'ethos', 'cheap', 'firstly', 'online', 'goodbye',
})


def word_choice(text):
    """Count the whitespace-separated words of ``text`` found in the AI word list.

    Matching is exact and case-sensitive, so the text should already be
    lower-cased and stripped of punctuation. Every occurrence counts,
    including repeats of the same word.

    Args:
        text (str): Text to scan.

    Returns:
        int: Number of matching word occurrences.
    """
    return sum(1 for word in text.split() if word in _COMMON_AI_WORDS)

In [54]:
def coherence(text):
    """Return a gensim ``c_v`` coherence score for a single text.

    The text is tokenised and treated as a one-document corpus: a dictionary
    and a bag-of-words are built from it, an LSI (latent semantic indexing)
    model is fitted on that corpus, and the model is scored with the ``c_v``
    coherence measure against the same tokens.
    """
    doc_tokens = word_tokenize(text)
    vocab = corpora.Dictionary([doc_tokens])
    bow_corpus = [vocab.doc2bow(doc_tokens)]
    lsi = LsiModel(bow_corpus, id2word=vocab)

    # c_v is computed from the raw texts, hence `texts` rather than `corpus`.
    scorer = CoherenceModel(
        model=lsi,
        texts=[doc_tokens],
        dictionary=vocab,
        coherence='c_v',
    )
    return scorer.get_coherence()

In [55]:
def reading_ease(text):
    """Return the Flesch reading-ease score of ``text`` as computed by textstat."""
    return textstat.flesch_reading_ease(text)


def gunning_fog(text):
    """Return the Gunning fog index of ``text`` as computed by textstat."""
    return textstat.gunning_fog(text)

# Create Pipeline
We want to add columns, not transform them ==> no ColumnTransformer <br>
Function transformer?<br>
But first we need to get our preprocessed data...

In [56]:
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin

In [57]:
# Load the sampled dataset and keep the freshly read frame untouched; all
# later edits go to the `data` copy.
# NOTE(review): this is an absolute, machine-specific path — the notebook will
# not run elsewhere until it is made relative/configurable.
data_load = pd.read_csv(
    filepath_or_buffer="/home/romaric/code/nghia95/fake-data-detector/data/1k_sampled_dataset.csv"
)
data = data_load.copy()
data.head()

Unnamed: 0,text,source,prompt_id,text_length,word_count
0,The Philosophy and Ethics of Transhumanism\n\n...,GPT-3.5,1920,2558,394
1,Crime-tracking app Citizen is launching its ow...,Flan-T5-XXL,0,378,62
2,The court in Novorossiysk gave two of the danc...,GLM-130B,0,621,109
3,"then drops the drumsticks, poses, then walks o...",GPT-J,0,513,90
4,On tally went to the beach. She found a sand d...,GPT-J,0,4984,846


In [58]:
# Binary target: 0 for human-written rows, 1 for every other source.
data["AI_gen"] = (data["source"] != "Human").astype("int64")

In [59]:
# Number of leading rows used while prototyping the (slow) feature pipeline.
# Set to None to run on the full dataset.
N_SAMPLES = 3

# Slice X and y with the same bound so features and labels stay aligned.
X = data[["text"]].iloc[:N_SAMPLES].copy()
y = data["AI_gen"].iloc[:N_SAMPLES]

In [83]:
# Preview the frame that will be fed to the feature pipeline.
X.head()

Unnamed: 0,text
0,The Philosophy and Ethics of Transhumanism\n\n...
1,Crime-tracking app Citizen is launching its ow...
2,The court in Novorossiysk gave two of the danc...


In [None]:
# class TextPreprocessor(BaseEstimator, TransformerMixin):
#     "Applies basic cleaning to text."
#     def fit(self, X, y=None):
#         return self

#     def transform(self, X):
#         return pd.DataFrame({"preprocessed": X["text"].apply(basic_cleaning)})

# class ConsDensity(BaseEstimator, TransformerMixin):
#     "Extracts consonant density from preprocessed text."
#     def fit(self, X, y=None):
#         return self

#     def transform(self, X):
#         return pd.DataFrame({"cons_density": X["preprocessed"].apply(cons_density)})

# class Stress(BaseEstimator, TransformerMixin):
#     "Extracts sentence stress values."
#     def fit(self, X, y=None):
#         return self

#     def transform(self, X):
#         return pd.DataFrame({"stress_value": X["preprocessed"].apply(get_sentence_stress)})

# class Sentiment(BaseEstimator, TransformerMixin):
#     "Extracts sentiment score."
#     def fit(self, X, y=None):
#         return self

#     def transform(self, X):
#         return pd.DataFrame({"sentiment_score": X["preprocessed"].apply(sentiment_polarity)})

# class Redundance(BaseEstimator, TransformerMixin):
#     "Extracts redundancy score from text."
#     def fit(self, X, y=None):
#         return self

#     def transform(self, X):
#         return pd.DataFrame({"redundance": X["preprocessed"].apply(redundance)})

# class UnusualWord(BaseEstimator, TransformerMixin):
#     "Extract the number of unusual word from text"
#     def fit(self, X, y = None):
#         return self
#     def transform(self,X):
#         return pd.DataFrame({"unusual_words_count": X["preprocessed"].apply(word_choice)})

# class Coherence(BaseEstimator, TransformerMixin):
#     "Return the measured coherence of a text"
#     def fit(self, X, y = None):
#         return self
#     def transform(self,X):
#         return pd.DataFrame({"coherence_score": X["preprocessed"].apply(coherence)})

# class ReadingEase(BaseEstimator, TransformerMixin):
#     "Return the measured reading ease score of a text"
#     def fit(self, X, y = None):
#         return self
#     def transform(self,X):
#         return pd.DataFrame({"reading_ease": X["text"].apply(reading_ease)})

# class GunningFog(BaseEstimator, TransformerMixin):
#     "Return the gunning fog score of a text"
#     def fit(self, X, y = None):
#         return self
#     def transform(self,X):
#         return pd.DataFrame({"gunningfog": X["text"].apply(gunning_fog)})


In [76]:
class TextPreprocessor(BaseEstimator, TransformerMixin):
    """Add a ``preprocessed`` column holding the cleaned version of ``text``.

    The incoming columns (including ``text``) are kept, because some
    downstream feature extractors read the raw text while others read the
    cleaned one.
    """

    def fit(self, X, y=None):
        """Stateless transformer: nothing to learn."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with ``basic_cleaning`` applied to ``X["text"]``.

        Args:
            X (pandas.DataFrame): Frame with a ``text`` column.

        Returns:
            pandas.DataFrame: ``X`` plus a ``preprocessed`` column, same index.
        """
        # Series.apply calls the function exactly once per row and also
        # works on an empty frame, unlike np.vectorize.
        return X.assign(preprocessed=X["text"].apply(basic_cleaning))

class ConsDensity(BaseEstimator, TransformerMixin):
    """Extract the consonant density of the preprocessed text."""

    def fit(self, X, y=None):
        """Stateless transformer: nothing to learn."""
        return self

    def transform(self, X):
        """Return a one-column frame ``cons_density`` computed from ``X["preprocessed"]``.

        The result keeps the index of ``X``.
        """
        # Series.apply evaluates each row exactly once and handles empty
        # input; np.vectorize evaluates the first row twice and infers the
        # output dtype from that first result only.
        return pd.DataFrame({"cons_density": X["preprocessed"].apply(cons_density)})

class Stress(BaseEstimator, TransformerMixin):
    """Extract the sentence stress value of the preprocessed text."""

    def fit(self, X, y=None):
        """Stateless transformer: nothing to learn."""
        return self

    def transform(self, X):
        """Return a one-column frame ``stress_value`` computed from ``X["preprocessed"]``.

        The result keeps the index of ``X``.
        """
        # Series.apply evaluates each row exactly once and handles empty
        # input; np.vectorize evaluates the first row twice and infers the
        # output dtype from that first result only.
        return pd.DataFrame({"stress_value": X["preprocessed"].apply(get_sentence_stress)})

class Sentiment(BaseEstimator, TransformerMixin):
    """Extract the sentiment score of the preprocessed text."""

    def fit(self, X, y=None):
        """Stateless transformer: nothing to learn."""
        return self

    def transform(self, X):
        """Return a one-column frame ``sentiment_score`` computed from ``X["preprocessed"]``.

        The result keeps the index of ``X``.
        """
        # Series.apply evaluates each row exactly once and handles empty
        # input; np.vectorize evaluates the first row twice and infers the
        # output dtype from that first result only.
        return pd.DataFrame({"sentiment_score": X["preprocessed"].apply(sentiment_polarity)})

class Redundance(BaseEstimator, TransformerMixin):
    """Extract the redundancy score of the preprocessed text."""

    def fit(self, X, y=None):
        """Stateless transformer: nothing to learn."""
        return self

    def transform(self, X):
        """Return a one-column frame ``redundance`` computed from ``X["preprocessed"]``.

        The result keeps the index of ``X``.
        """
        # Series.apply evaluates each row exactly once and handles empty
        # input; np.vectorize evaluates the first row twice and infers the
        # output dtype from that first result only.
        return pd.DataFrame({"redundance": X["preprocessed"].apply(redundance)})

class UnusualWord(BaseEstimator, TransformerMixin):
    """Extract the number of listed "unusual" words in the preprocessed text."""

    def fit(self, X, y=None):
        """Stateless transformer: nothing to learn."""
        return self

    def transform(self, X):
        """Return a one-column frame ``unusual_words_count`` computed from ``X["preprocessed"]``.

        The result keeps the index of ``X``.
        """
        # Series.apply evaluates each row exactly once and handles empty
        # input; np.vectorize evaluates the first row twice and infers the
        # output dtype from that first result only.
        return pd.DataFrame({"unusual_words_count": X["preprocessed"].apply(word_choice)})

class Coherence(BaseEstimator, TransformerMixin):
    """Extract the coherence score of the preprocessed text."""

    def fit(self, X, y=None):
        """Stateless transformer: nothing to learn."""
        return self

    def transform(self, X):
        """Return a one-column frame ``coherence_score`` computed from ``X["preprocessed"]``.

        The result keeps the index of ``X``.
        """
        # Series.apply evaluates each row exactly once, which matters here
        # because `coherence` fits a model per text; np.vectorize would run
        # it twice on the first row and fail on empty input.
        return pd.DataFrame({"coherence_score": X["preprocessed"].apply(coherence)})

class ReadingEase(BaseEstimator, TransformerMixin):
    """Extract the reading-ease score of the raw text.

    Reads the ``text`` column (not ``preprocessed``), so the incoming frame
    must still carry the raw text.
    """

    def fit(self, X, y=None):
        """Stateless transformer: nothing to learn."""
        return self

    def transform(self, X):
        """Return a one-column frame ``reading_ease`` computed from ``X["text"]``.

        The result keeps the index of ``X``.
        """
        # Series.apply evaluates each row exactly once and handles empty
        # input; np.vectorize evaluates the first row twice and infers the
        # output dtype from that first result only.
        return pd.DataFrame({"reading_ease": X["text"].apply(reading_ease)})

class GunningFog(BaseEstimator, TransformerMixin):
    """Extract the Gunning fog score of the raw text.

    Reads the ``text`` column (not ``preprocessed``), so the incoming frame
    must still carry the raw text.
    """

    def fit(self, X, y=None):
        """Stateless transformer: nothing to learn."""
        return self

    def transform(self, X):
        """Return a one-column frame ``gunningfog`` computed from ``X["text"]``.

        The result keeps the index of ``X``.
        """
        # Series.apply evaluates each row exactly once and handles empty
        # input; np.vectorize evaluates the first row twice and infers the
        # output dtype from that first result only.
        return pd.DataFrame({"gunningfog": X["text"].apply(gunning_fog)})


In [80]:
# Two stages: clean the text, then run every feature extractor side by side
# and stack their outputs column-wise (in the order listed below).
pipeline = Pipeline(
    steps=[
        ("preprocessor", TextPreprocessor()),
        (
            "features",
            FeatureUnion(
                transformer_list=[
                    ("cons_density", ConsDensity()),
                    ("stress_value", Stress()),
                    ("sentiment_score", Sentiment()),
                    ("redundance", Redundance()),
                    ("unusualword", UnusualWord()),
                    ("coherence", Coherence()),
                    ("readingease", ReadingEase()),
                    ("gunningfog", GunningFog()),
                ]
            ),
        ),
    ]
)

In [78]:
# Display the pipeline structure (rich repr in the notebook).
pipeline

In [82]:
# Column names of the stacked feature matrix, in the same order as the
# transformers listed in the pipeline's FeatureUnion.
FEATURE_COLUMNS = [
    "cons_density",
    "stress_value",
    "sentiment_score",
    "redundance",
    "unusual_words_count",
    "coherence_score",
    "reading_ease",
    "gunningfog",
]

X_transformed = pipeline.fit_transform(X)

# FeatureUnion stacks its outputs into a plain NumPy array, which pd.concat
# rejects; wrap it in a DataFrame that shares X's index first.
features = pd.DataFrame(
    np.asarray(X_transformed), columns=FEATURE_COLUMNS, index=X.index
)

# Rebuild X from its raw `text` column only, so re-running this cell does not
# append a second copy of the feature columns.
X = pd.concat([X[["text"]], features], axis=1)
X.head()

KeyboardInterrupt: 