# Functions!

In [28]:
import pandas as pd
import numpy as np
import string
import re
import nltk
nltk.data.path.append("/home/romaric/code/nghia95/fake-data-detector/notebooks/roma_NTLK_Data_Cache")
from nltk.corpus import cmudict
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from collections import Counter
from textblob import TextBlob
from gensim.models import LsiModel
from gensim import corpora
from gensim.models.coherencemodel import CoherenceModel
import textstat


In [29]:

cmu_dict = cmudict.dict()  # This should load from the cache
print(cmu_dict["hello"])

[['HH', 'AH0', 'L', 'OW1'], ['HH', 'EH0', 'L', 'OW1']]


In [30]:
def word_count(text):
    if not isinstance(text, str):  # Convert to string if it's not
       text = str(text)
    return len(text.split())

In [31]:
def basic_cleaning(text):
    if not isinstance(text, str):  # Convert to string if it's not
       text = str(text)
    # Remove whitespace
    prepoc_text = text.strip()
    # Lowercasing
    prepoc_text = prepoc_text.lower()
    # remove digits
    prepoc_text = "".join(char for char in prepoc_text if not char.isdigit())
    # remove punctuation
    for punctuation in string.punctuation:
        prepoc_text = prepoc_text.replace(punctuation," ")
    # remove regex
    prepoc_text = re.sub('<[^<]+?',"",prepoc_text)

    return prepoc_text

In [32]:
def cons_density(text):

    consonnant = sum(1 for char in text if char.isalpha() and char not in "aeiou")
    vowel = sum(1 for char in text if char.isalpha() and char in "aeiou")
    total_letters = vowel + consonnant
    return round((consonnant/(vowel + consonnant)),3) if total_letters > 0 else 0

In [33]:
cmu_dict = cmudict.dict()

def get_word_stress(word):
    if word in cmu_dict:
        return sum(int(char) for syllable in cmu_dict[word][0] for char in syllable if char.isdigit())
    return 0

def get_sentence_stress(sentence):
    words = sentence.split()
    stress_values = [get_word_stress(word) for word in words]
    return sum(stress_values)

In [34]:
def redundance(text):
    # give a redundance score, considering the lenght of each text, if a lemmatized words appears more than three times the mean, it is considered redundant.

    tokens = word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    clean_tokens = [w for w in tokens if w not in stop_words]

    lemmatizer = WordNetLemmatizer()
    lemmatized_tokens = [lemmatizer.lemmatize(w) for w in clean_tokens]

    word_counts = Counter(lemmatized_tokens)
    mean_freq = sum(word_counts.values()) / len(word_counts) if len(word_counts)!= 0 else 0

    if mean_freq != 0:
        score = sum(1 for word, count in word_counts.items() if count > 2.5 * mean_freq)
    else:
        score = 0

    return score

In [35]:
def sentiment_polarity(text):
    sent_pol = TextBlob(text).sentiment.polarity
    return abs(round(sent_pol,3))

In [36]:
def word_choice(text):
    common_ai_words =["commendable",'transhumanist', 'meticulous', 'elevate','hello', 'tapestry','leverage',
                  'journey', 'headache','resonate','testament','explore', 'binary','delve',
                  'enrich', 'seamless','multifaceted', 'sorry','foster', 'convey', 'beacon',
                  'interplay', 'oh', 'navigate','form','adhere','cannot', 'landscape','remember',
                  'paramount', 'comprehensive', 'placeholder','grammar','real','summary','symphony',
                  'furthermore','relationship','ultimately','profound','art','supercharge','evolve',
                  'beyoud','reimagine','vibrant', 'robust','pivotal','certainly','quinoa','orchestrate','align',
                  'diverse','recommend','annals','note','employ','bustling','indeed','digital','enigma', 'outfit',
                  'indelible','refrain','culture','treat','emerge','meticulous','esteemed','weight','whimsical','bespoke',
                  'highlight','antagonist','unlock','key','breakdown','tailor','misinformation','treasure','paradigm','captivate',
                  'song','underscore','calculate','especially','climate','hedging','inclusive','exercise','ai','embrace',
                  'level','nuance','career','dynamic','accent','ethos','cheap','firstly','online','goodbye'
                  ]
    text = text.translate(str.maketrans('', '', string.punctuation)).lower()
    word_count = 0
    for word in text.split():
        if word in common_ai_words:
            word_count += 1

    return word_count

In [37]:
def coherence(text):
    # uses gensim to measure coherence, use the lsi model(latent semantic indexing, coherence c_v because we provide the text)
    tokens = word_tokenize(text)
    if not tokens:
        coherence_score = 0
    else:
        dictionary = corpora.Dictionary([tokens])
        corpus_gensim = [dictionary.doc2bow(tokens)]
        lsa_model = LsiModel(corpus_gensim, id2word=dictionary)

        coherence_model = CoherenceModel(
            model=lsa_model,
            texts=[tokens],
            dictionary=dictionary,
            coherence='c_v'
        )
        coherence_score = coherence_model.get_coherence()
    return coherence_score

In [38]:
def reading_ease(text):
    reading_ease= textstat.flesch_reading_ease(text)
    return reading_ease


def gunning_fog(text):
    gunning_fog = textstat.gunning_fog(text)
    return gunning_fog

In [39]:
text = "Hello is the cat cat cat name, it is commendable. The cat eat the cat fish that was in the bowl of the cat, the cat is a bad cat!"
print(f'word count :{word_count(text)}')
print(f'cleaned :{basic_cleaning(text)}')
print(f'consonnance density :{cons_density(text)}')
print(f'stress value :{get_sentence_stress(text)}')
print(f'redundance :{redundance(text)}')
print(f'sentiment :{sentiment_polarity(text)}')
print(f'unusual word count :{word_choice(text)}')
print(f'coherence :{coherence(text)}')
print(f'reading ease :{reading_ease(text)}')
print(f'gunning fog :{gunning_fog(text)}')

word count :30
cleaned :hello is the cat cat cat name  it is commendable  the cat eat the cat fish that was in the bowl of the cat  the cat is a bad cat 
consonnance density :0.625
stress value :17
redundance :1
sentiment :0.875
unusual word count :2
coherence :0.9999999999999998
reading ease :98.55
gunning fog :7.33


# Create Pipeline
We want to add columns, not transform them ==> no ColumnTransformer <br>
Function transformer?<br>
But firt we need to get our preprocessed data...

In [40]:
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import MinMaxScaler, FunctionTransformer

In [41]:
data_load = pd.read_csv("/home/romaric/code/nghia95/fake-data-detector/data/1k_sampled_dataset.csv")
data = data_load.copy()
data.head()

Unnamed: 0,text,source,prompt_id,text_length,word_count
0,The Philosophy and Ethics of Transhumanism\n\n...,GPT-3.5,1920,2558,394
1,Crime-tracking app Citizen is launching its ow...,Flan-T5-XXL,0,378,62
2,The court in Novorossiysk gave two of the danc...,GLM-130B,0,621,109
3,"then drops the drumsticks, poses, then walks o...",GPT-J,0,513,90
4,On tally went to the beach. She found a sand d...,GPT-J,0,4984,846


In [42]:
data["AI_gen"] = data["source"].apply(lambda x: 0 if x == "Human" else 1)

In [93]:
data.tail()

Unnamed: 0,text,source,prompt_id,text_length,word_count,AI_gen
995,Please write a response to ONE of the prompts ...,Human,0,4259,790,0
996,Snap's next-gen Spectacles will be able to lay...,Human,0,437,68,0
997,Visual Place Recognition (VPR) is the ability ...,Flan-T5-XL,0,382,58,1
998,Addressing Ex-Felon Disenfranchisement Essay\n...,Human,0,32647,4997,0
999,"The antiques, valued at between PS2,000 and PS...",GPT-J,0,822,142,1


In [43]:
len(data.index)

1000

In [44]:
X=pd.DataFrame(data["text"])
y=data["AI_gen"]

## Test No parallelism

In [81]:
class InputHandler(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        if isinstance(X, str):
            X = [X]
        if isinstance(X, list):
            X = pd.DataFrame({"text": X})
        elif isinstance(X, pd.DataFrame):
            if "text" not in X.columns:
                raise ValueError("Input DataFrame must have a 'text' column")
        else:
            X = pd.DataFrame({"text": list(X)})
        return X

class HowManyWords(BaseEstimator, TransformerMixin):
    def fit(self,X,y=None):
        return self

    def get_feature_names_out(self, input_features=None):
        return ["word_count"]

    def transform(self, X):
        if isinstance(X, pd.DataFrame):
            X = X["text"]
        word_c = X.apply(word_count)
        return pd.DataFrame({"word_count": word_c})

class TextPreprocessor(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def get_feature_names_out(self, input_features=None):
        return ["preprocessed"]

    def transform(self, X):
        if isinstance(X, pd.DataFrame):
            X = X["text"]
        cleaned = X.apply(basic_cleaning)
        return pd.DataFrame({"preprocessed": cleaned})

class ConsDensity(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def get_feature_names_out(self, input_features=None):
        return ["cons_density"]

    def transform(self, X):
        return X["preprocessed"].apply(cons_density).values.reshape(-1, 1)

class Stress(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def get_feature_names_out(self, input_features=None):
        return ["stress_value"]

    def transform(self, X):
        return X["preprocessed"].apply(get_sentence_stress).values.reshape(-1, 1)

class Sentiment(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def get_feature_names_out(self, input_features=None):
        return ["sentiment_score"]

    def transform(self, X):
        return X["preprocessed"].apply(sentiment_polarity).values.reshape(-1, 1)

class Redundance(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def get_feature_names_out(self, input_features=None):
        return ["redundance"]

    def transform(self, X):
        return X["preprocessed"].apply(redundance).values.reshape(-1, 1)

class UnusualWord(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def get_feature_names_out(self, input_features=None):
        return ["unusual_words"]

    def transform(self, X):
        return X["preprocessed"].apply(word_choice).values.reshape(-1, 1)

class Coherence(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def get_feature_names_out(self, input_features=None):
        return ["coherence"]

    def transform(self, X):
        return X["preprocessed"].apply(coherence).values.reshape(-1, 1)

class ReadingEase(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def get_feature_names_out(self, input_features=None):
        return ["reading_ease"]

    def transform(self, X):
        return X["text"].apply(reading_ease).values.reshape(-1, 1)

class GunningFog(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def get_feature_names_out(self, input_features=None):
        return ["gunning_fog"]

    def transform(self, X):
        return X["text"].apply(gunning_fog).values.reshape(-1, 1)


In [82]:
log_scaler = FunctionTransformer(lambda x: np.log1p(x), validate=True)

pipeline = Pipeline([
    ("input_handler", InputHandler()),
    ("union", FeatureUnion([
        ("preprocessed_features", Pipeline([
            ("preprocessor", TextPreprocessor()),
            ("features", FeatureUnion([
                ("cons_density", ConsDensity()),
                ("stress_value", Pipeline([
                    ("extract", Stress()),
                    ("scaler", MinMaxScaler())
                ])),
                ("sentiment_score", Sentiment()),
                ("redundance", Pipeline([
                    ("extract", Redundance()),
                    ("log_scaling", log_scaler)
                ])),
                ("unusualword", Pipeline([
                    ("extract", UnusualWord()),
                    ("log_scaling", log_scaler)
                ])),
                ("coherence", Coherence())
            ]))
        ])),
        ("original_text_features", Pipeline([
            ("features", FeatureUnion([
                ("wordcount", Pipeline([
                    ("extract", HowManyWords()),
                    ("scaler", MinMaxScaler())
                ])),
                ("readingease", Pipeline([
                    ("extract", ReadingEase()),
                    ("scaler", MinMaxScaler())
                ])),
                ("gunningfog", Pipeline([
                    ("extract", GunningFog()),
                    ("scaler", MinMaxScaler())
                ]))
            ]))
        ]))
    ]))
])


feature_names = [
    "cons_density", "stress_value", "sentiment_score",
    "redundance", "unusual_words", "coherence",
    "word_count", "reading_ease", "gunning_fog"
]

In [83]:
pipeline

In [49]:
X_processed = pipeline.fit_transform(X)
X_processed_df = pd.DataFrame(X_processed, columns=feature_names)

In [50]:
X_processed_df

Unnamed: 0,cons_density,stress_value,sentiment_score,redundance,unusual_words,coherence,word_count,reading_ease,gunning_fog
0,0.599,0.025421,0.066,1.609438,0.693147,0.251201,0.023936,0.607630,0.153073
1,0.613,0.002982,0.167,1.098612,0.000000,1.000000,0.002400,0.727166,0.065798
2,0.622,0.005113,0.087,0.693147,0.000000,1.000000,0.005449,0.801858,0.130602
3,0.626,0.005184,0.184,0.000000,0.000000,1.000000,0.004216,0.821002,0.082930
4,0.625,0.060640,0.024,2.564949,1.609438,0.196696,0.053256,0.796435,0.072874
...,...,...,...,...,...,...,...,...,...
995,0.619,0.053824,0.188,2.833213,1.945910,0.292397,0.049624,0.840147,0.067784
996,0.626,0.002840,0.212,0.693147,0.000000,1.000000,0.002789,0.705145,0.136934
997,0.607,0.002840,0.024,0.000000,0.000000,1.000000,0.002141,0.567926,0.227561
998,0.612,0.354896,0.088,4.094345,3.044522,0.228694,0.322522,0.670533,0.095717


In [51]:
X_processed_df.shape

(1000, 9)

## With parallelism (*not working at the moment*)

In [52]:
class InputHandler(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        if isinstance(X, str):
            X = [X]
        return pd.DataFrame({"text": X})

class TextPreprocessor(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def get_feature_names_out(self, input_features=None):
        return ["preprocessed"]

    def transform(self, X):
        if isinstance(X, pd.DataFrame):
            X = X["text"]
        cleaned = X.apply(basic_cleaning)
        return pd.DataFrame({"preprocessed": cleaned})

class ConsDensity(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def get_feature_names_out(self, input_features=None):
        return ["cons_density"]

    def transform(self, X):
        return X["preprocessed"].apply(cons_density).values.reshape(-1, 1)

class Stress(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def get_feature_names_out(self, input_features=None):
        return ["stress_value"]

    def transform(self, X):
        return X["preprocessed"].apply(get_sentence_stress).values.reshape(-1, 1)

class Sentiment(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def get_feature_names_out(self, input_features=None):
        return ["sentiment_score"]

    def transform(self, X):
        return X["preprocessed"].apply(sentiment_polarity).values.reshape(-1, 1)

class Redundance(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def get_feature_names_out(self, input_features=None):
        return ["redundance"]

    def transform(self, X):
        return X["preprocessed"].apply(redundance).values.reshape(-1, 1)

class UnusualWord(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def get_feature_names_out(self, input_features=None):
        return ["unusual_words"]

    def transform(self, X):
        return X["preprocessed"].apply(word_choice).values.reshape(-1, 1)

class Coherence(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def get_feature_names_out(self, input_features=None):
        return ["coherence"]

    def transform(self, X):
        return X["preprocessed"].apply(coherence).values.reshape(-1, 1)

class ReadingEase(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def get_feature_names_out(self, input_features=None):
        return ["reading_ease"]

    def transform(self, X):
        return X["text"].apply(reading_ease).values.reshape(-1, 1)

class GunningFog(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def get_feature_names_out(self, input_features=None):
        return ["gunning_fog"]

    def transform(self, X):
        return X["text"].apply(gunning_fog).values.reshape(-1, 1)


In [53]:
pipeline = Pipeline([
    ("input_handler", InputHandler()),
    ("union", FeatureUnion([
        ("preprocessed_features", Pipeline([
            ("preprocessor", TextPreprocessor()),
            ("features", FeatureUnion([
                ("cons_density", ConsDensity()),
                ("stress_value", Stress()),
                ("sentiment_score", Sentiment()),
                ("redundance", Redundance()),
                ("unusualword", UnusualWord()),
                ("coherence", Coherence())
            ]))
        ])),
        ("original_text_features", FeatureUnion([
            ("readingease", ReadingEase()),
            ("gunningfog", GunningFog())
        ]))
    ], n_jobs=-1))
])


feature_names = [
    "cons_density", "stress_value", "sentiment_score",
    "redundance", "unusual_words", "coherence",
    "reading_ease", "gunning_fog"
]

In [54]:
pipeline

In [55]:
X

Unnamed: 0,text
0,The Philosophy and Ethics of Transhumanism\n\n...
1,Crime-tracking app Citizen is launching its ow...
2,The court in Novorossiysk gave two of the danc...
3,"then drops the drumsticks, poses, then walks o..."
4,On tally went to the beach. She found a sand d...
...,...
995,Please write a response to ONE of the prompts ...
996,Snap's next-gen Spectacles will be able to lay...
997,Visual Place Recognition (VPR) is the ability ...
998,Addressing Ex-Felon Disenfranchisement Essay\n...


In [56]:
X_processed = pipeline.fit_transform(X)
X_processed_df = pd.DataFrame(X_processed, columns=feature_names)

ValueError: If using all scalar values, you must pass an index

In [None]:
processed_df.shape

(1, 8)

In [None]:
X_processed_df = pd.DataFrame(X_processed)
X_processed_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7
0,25.08,14.57,0.599,380.0,0.066,4.0,1.0,0.177397
1,50.43,7.54,0.613,64.0,0.167,2.0,0.0,1.0
2,66.27,12.76,0.622,94.0,0.087,1.0,0.0,1.0


In [None]:
X_processed_df.shape

(3, 8)

In [None]:
X_final = pd.concat([X, X_processed_df], axis=1)
X_final.shape

(3, 9)

# Model

In [57]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train,y_test = train_test_split(X_processed_df,y,train_size=0.7, random_state= 1, stratify= y)
print(X_train.shape,X_test.shape,y_train.shape,y_test.shape)

(700, 9) (300, 9) (700,) (300,)


In [58]:
pip install tensorflow


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [59]:
from tensorflow.keras import models, layers
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import Adam

In [60]:
def initialize_model():
    model = models.Sequential()
    model.add(layers.Dense(50, activation = "relu", input_dim = X_processed_df.shape[1]))
    model.add(layers.Dense(50, activation = "relu"))
    model.add(layers.Dense(30, activation = "relu"))
    model.add(layers.Dense(10, activation = "relu"))
    model.add(layers.Dense(1, activation = "sigmoid"))

    return model

In [61]:
model = initialize_model()

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [62]:
model.summary()

In [63]:
def compile_model(model):

    return  model.compile(loss='binary_crossentropy', optimizer="adam", metrics=['accuracy'])

In [64]:
es = EarlyStopping(patience = 20,restore_best_weights=True, monitor='val_loss')
compile_model(model)
history = model.fit(
    X_train, y_train,
    epochs=500,
    callbacks=[es],
    validation_split = 0.2
    )

Epoch 1/500
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 21ms/step - accuracy: 0.5168 - loss: 0.6910 - val_accuracy: 0.6786 - val_loss: 0.6114
Epoch 2/500
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - accuracy: 0.7263 - loss: 0.5885 - val_accuracy: 0.6857 - val_loss: 0.5771
Epoch 3/500
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.7459 - loss: 0.5290 - val_accuracy: 0.6857 - val_loss: 0.5737
Epoch 4/500
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - accuracy: 0.7242 - loss: 0.5266 - val_accuracy: 0.6857 - val_loss: 0.5700
Epoch 5/500
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - accuracy: 0.7212 - loss: 0.5269 - val_accuracy: 0.6857 - val_loss: 0.5531
Epoch 6/500
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - accuracy: 0.7299 - loss: 0.4961 - val_accuracy: 0.6643 - val_loss: 0.5683
Epoch 7/500
[1m18/18[0m [32m━

In [65]:
baseline = model.evaluate(X_test,y_test)

[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.7591 - loss: 0.5288 


In [66]:
precision = baseline[1]
precision

0.7699999809265137

In [127]:
from tensorflow.keras.models import save_model
from tensorflow.keras.models import load_model

In [128]:
# model.save(filepath=r"home\romaric\code\nghia95\fake-data-detector\notebooks\roma_models\baseline_model.keras")
# model = load_model("roma_models\baseline_model.keras")

In [124]:
X = "Romaric is super awesome!"

X_processed = pipeline.fit_transform(X)


In [125]:
prediction = model.predict(X_processed)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 49ms/step


In [126]:
prediction

array([[0.79552174]], dtype=float32)