# ClearML Pipeline - Engineer Features

<img src="https://mikewlange.github.io/ai-or-human/images/generate_features_sm.drawio.png" alt="Alt Text" >

<span style="display: inline-block;padding: 10px;background-color: #f4f3ee;border: 1px solid #FF1493;border-radius: 4px;margin-bottom: 10px;line-height: 1.5;color: #333;" class="tip">In the <b>Feature Engineering</b> section of this notebook, we focus on extracting attributes from the essay text data to enhance our model's interpretability and performance. By engineering features meticulously, we target capturing nuanced differences in textual characteristics, such as readability, semantic density, and syntactic patterns, distinguishing between AI-generated and human-written texts.</span>

We create the following:

1.  **Readability Scores**: Analyzing scores like `Flesch-Kincaid Grade Level`, `Gunning Fog Index`, etc., to identify unique readability patterns in AI-generated versus human-written essays.
    
2.  **Semantic Density**: Examining the concentration of meaning-bearing words in texts to understand the distribution differences in AI-generated text.
    
3.  **Semantic Flow Variability**: Investigating idea transitions between sentences, comparing human writing's variability with AI-generated text.
    
4.  **Psycholinguistic Features**: Using the Empath lexicon (an open, LIWC-style tool) to evaluate the psychological and emotional content of the essays.
    
5.  **Textual Entropy**: Measuring unpredictability or randomness in text, focusing on how AI-generated content's entropy differs from human writing.
    
6.  **Syntactic Tree Patterns**: Parsing essays into syntactic trees to analyze pattern frequencies, particularly the structural tendencies in language models.

**We work hard to avoid introducing discriminative content bias and stick to mathematical features only. Could the LIWC-style (Empath) features introduce bias? Possibly.**

In [2]:
# %pip install clearml -q
# %pip install textstat -q
# %pip install empath -q
# %pip install spacy -q
# %pip install benepar -q

%env CLEARML_WEB_HOST=https://app.clear.ml
%env CLEARML_API_HOST=https://api.clear.ml
%env CLEARML_FILES_HOST=https://files.clear.ml
%env CLEARML_API_ACCESS_KEY=
%env CLEARML_API_SECRET_KEY=

class CFG:
    """Notebook-wide configuration constants.

    NOTE(review): none of these attributes is read in the visible part of
    this file, so the per-attribute notes below are inferred from the names
    only and must be confirmed against the code that consumes them.
    """
    CLEAR_ML = True  # presumably enables ClearML integration — TODO confirm
    DATA_ETL_STRATEGY = 2  # presumably selects one of several ETL variants — TODO confirm
    TRAINING_DATA_COUNT = 2000  # presumably the number of training rows to use — TODO confirm
    DATA_SCALE_RATIO = 0.24  # meaning not visible here — TODO confirm
    TEST_RUN = True  # presumably switches to a reduced/quick run — TODO confirm
    CLEARML_OFFLINE_MODE = False  # presumably toggles ClearML offline mode — TODO confirm
    CLEARML_ON = False  # NOTE(review): overlaps with CLEAR_ML above; confirm which one is authoritative
    SCRATCH_PATH = 'scratch'  # relative directory name; a 'scratch/' image path is referenced in the notebook markdown
    ARTIFACTS_PATH = 'artifacts'  # presumably the output directory for artifacts — TODO confirm
    ENSAMBLE_STRATEGY = 1  # (sic) spelling kept because the name is part of the interface


env: CLEARML_WEB_HOST=https://app.clear.ml
env: CLEARML_API_HOST=https://api.clear.ml
env: CLEARML_FILES_HOST=https://files.clear.ml
env: CLEARML_API_ACCESS_KEY=
env: CLEARML_API_SECRET_KEY=


<img src="scratch/features_pipeline.png" alt="Alt Text" width="800">

In [2]:
'''Injest Data'''
from clearml import Task, PipelineDecorator
import os
import logging
import textstat
import pandas as pd
from clearml import PipelineDecorator, TaskTypes
from clearml.automation.controller import PipelineController

from clearml import Task

@PipelineDecorator.component(return_values=["df_essays"], name='Pull Training Data', cache=True, task_type=TaskTypes.data_processing)
def download_dataset_as_dataframe(new_dataset_name, dataset_project='LLM-detect-ai-gen-text/datasets', file_name="dataset.pkl"):
    """Fetch a ClearML dataset and load its pickled essays into a DataFrame.

    Args:
        new_dataset_name: Name of the ClearML dataset to fetch.
        dataset_project: ClearML project that holds the dataset.
        file_name: Preferred pickle file inside the dataset. If that file is
            not present, the first ``.pkl`` file (in sorted order) is used.

    Returns:
        pandas.DataFrame: Rows without missing values, with ``text`` and
        ``source`` cast to ``str`` and ``label`` cast to ``int``, re-indexed
        from 0.

    Raises:
        FileNotFoundError: If the dataset copy contains no ``.pkl`` file.
    """
    import os

    import pandas as pd
    from clearml import Dataset

    dataset = Dataset.get(dataset_project=dataset_project, dataset_name=new_dataset_name, only_completed=True)
    cached_folder = dataset.get_local_copy()

    # Honour the requested file name; previously the parameter was shadowed
    # by the loop variable and an arbitrary .pkl file was loaded instead.
    pickle_names = sorted(name for name in os.listdir(cached_folder) if name.endswith('.pkl'))
    if not pickle_names:
        raise FileNotFoundError(f"No .pkl file found in dataset '{new_dataset_name}' ({cached_folder})")
    chosen_name = file_name if file_name in pickle_names else pickle_names[0]
    file_path = os.path.join(cached_folder, chosen_name)

    # NOTE(security): unpickling executes arbitrary code; only load datasets
    # from a trusted ClearML project.
    dataframe = pd.read_pickle(file_path)

    # Drop incomplete rows BEFORE casting: astype(str) would turn a missing
    # value into the literal string 'nan' (hiding it from dropna), and
    # astype(int) raises on a missing label.
    dataframe = dataframe.dropna()
    dataframe['text'] = dataframe['text'].astype(str)
    dataframe['label'] = dataframe['label'].astype(int)
    dataframe['source'] = dataframe['source'].astype(str)

    df_essays = dataframe.reset_index(drop=True)
    return df_essays

# Load the dataset into a Pandas DataFrame

'''Clean Data'''
# Function to preprocess text
def pipeline_etl_clean_data(df):
    """Clean and lemmatize the ``text`` column of ``df``.

    Each essay goes through these steps, in order: strip markdown/HTML,
    collapse newlines and repeated whitespace, drop a leading ``Task N.``
    prefix, remove punctuation other than ``. ? ! ,``, remove
    ``UPPER_CASE`` placeholder tokens, then tokenize and lemmatize with
    NLTK. The individual text steps are best-effort: when one fails the
    error is logged and the text is passed on unchanged (tokenization
    failure yields an empty string).

    Args:
        df (pandas.DataFrame): Frame with a ``text`` column. The column is
            overwritten in place.

    Returns:
        pandas.DataFrame: A DataFrame built from ``df`` with cleaned text.

    Raises:
        Exception: Any unexpected error is logged and re-raised instead of
            silently returning ``None`` to the next pipeline step.
    """
    import logging
    import re

    import markdown
    import nltk
    import pandas as pd
    from bs4 import BeautifulSoup
    from nltk.stem import WordNetLemmatizer
    from nltk.tokenize import word_tokenize

    # Ensure the necessary NLTK packages are downloaded (once, guarded).
    try:
        nltk.download('punkt', quiet=True)
        nltk.download('wordnet', quiet=True)
    except Exception as e:
        logging.error(f"An error occurred while downloading NLTK packages: {e}")

    PUNCTUATION_TO_RETAIN = '.?!,'  # Punctuation characters to retain

    # Compile patterns and build the lemmatizer once rather than per row.
    newline_pattern = re.compile(r'[\n\r]+')
    task_prefix_pattern = re.compile(r'^(?:Task(?:\s*\d+)?\.?\s*)?')
    punctuation_pattern = re.compile(r'[^\w\s' + re.escape(PUNCTUATION_TO_RETAIN) + ']')
    placeholder_pattern = re.compile(r'[A-Z]+_[A-Z]+')
    lemmatizer = WordNetLemmatizer()

    def remove_markdown(text):
        html = markdown.markdown(text)
        return BeautifulSoup(html, features="html.parser").get_text()

    def best_effort(step_name, step, text):
        """Run one cleaning step; on failure log it and keep the text as is."""
        try:
            return step(text)
        except Exception as e:
            logging.error(f"Error in {step_name}: {e}")
            return text

    def process_text(text):
        """Tokenize and lemmatize; an empty token list signals failure."""
        try:
            return [lemmatizer.lemmatize(token) for token in word_tokenize(text)]
        except Exception as e:
            logging.error(f"Error in process_text: {e}")
            return []

    def clean(text):
        text = best_effort('remove_markdown', remove_markdown, text)
        text = best_effort('replace_newlines', lambda t: newline_pattern.sub(' ', t), text)
        text = best_effort('remove_extra_whitespace', lambda t: ' '.join(t.split()), text)
        text = best_effort('remove_task_on_prompt', lambda t: task_prefix_pattern.sub('', t), text)
        text = best_effort('remove_punctuation_except', lambda t: punctuation_pattern.sub('', t), text)
        # Whitespace is already normalised above, so no further newline
        # handling is needed; only the placeholder tokens remain to drop.
        text = placeholder_pattern.sub('', text)
        return ' '.join(process_text(text))

    try:
        # Column-wise apply also behaves correctly when the index contains
        # duplicate labels (e.g. after sampling with replacement).
        df['text'] = df['text'].apply(clean)
        df_essays = pd.DataFrame(df)
    except Exception as e:
        logging.error(f"Error in preprocess_text: {e}")
        raise
    return df_essays


'''GENERATE FEATURES'''
# =============================================================================
# =============================================================================

'''Readability Scores'''
# =============================================================================
# Configure the root logger at import time: INFO level, with timestamp and
# level name prefixed to every message.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

def apply_textstat_function(df, column_name, function_to_apply):
    """Apply ``function_to_apply`` to every essay and store the result.

    Args:
        df (pandas.DataFrame): Frame with a ``text`` column; modified in place.
        column_name (str): Name of the column that receives the results.
        function_to_apply (callable): Called once per ``text`` value.

    Returns:
        pandas.DataFrame: The same ``df`` object, with ``column_name`` set.

    Raises:
        Exception: Any error is logged and re-raised, matching
            ``process_readability_scores``, instead of returning ``None``.
    """
    import logging

    # Callables such as functools.partial objects have no __name__.
    function_name = getattr(function_to_apply, '__name__', repr(function_to_apply))
    try:
        df[column_name] = df['text'].apply(function_to_apply)
    except Exception as e:
        logging.error(e)
        raise
    logging.info(f"Function {function_name} applied to column {column_name}")
    return df

@PipelineDecorator.component(return_values=["df_readability_essays"], name='Readability Scores - Features Pipeline',
                             cache=True, task_type=TaskTypes.data_processing)
def process_readability_scores(df_essays):
    """Add one textstat readability column per metric to ``df_essays``.

    The frame is printed (info and shape) for debugging, then six scores
    are computed from the ``text`` column and stored in place.

    Args:
        df_essays (pandas.DataFrame): Frame with a ``text`` column.

    Returns:
        pandas.DataFrame: The same frame with the readability columns added.

    Raises:
        Exception: Any failure is logged and re-raised.
    """
    import logging
    import textstat

    try:
        print(df_essays.info())
        print(df_essays.shape)

        # Output column name -> textstat scorer, in the order the columns
        # are appended to the frame.
        scorers = {
            'flesch_kincaid_grade': textstat.flesch_kincaid_grade,
            'gunning_fog': textstat.gunning_fog,
            'coleman_liau_index': textstat.coleman_liau_index,
            'smog_index': textstat.smog_index,
            'ari': textstat.automated_readability_index,
            'dale_chall': textstat.dale_chall_readability_score,
        }
        for column, scorer in scorers.items():
            df_essays[column] = df_essays['text'].apply(scorer)

        df_readability_essays = df_essays
        return df_readability_essays
    except Exception as e:
        logging.error(f"Error in process_readability_scores: {e}")
        raise

    except Exception as e:
        logging.error(f"Error in process_readability_scores: {e}")
        raise


'''Semantic Density'''
# =============================================================================



@PipelineDecorator.component(return_values=["df_semantic_essays"], name='Semantic Density - Features Pipeline',
                             cache=True, task_type=TaskTypes.data_processing)
def process_semantic_density(df_essays):
    """Compute the share of meaning-bearing words in every essay.

    Each text is lower-cased, tokenized and POS-tagged with NLTK. The
    density is the number of noun/verb/adjective/adverb tokens divided by
    the number of kept tokens (alphabetic tokens plus punctuation tokens).

    Args:
        df_essays (pandas.DataFrame): Frame with a ``text`` column.

    Returns:
        pandas.DataFrame: A copy of the input with two extra columns:
        ``semantic_density`` (float) and ``text_tagged_nltk`` (the tagged
        text as ``word/TAG`` pairs). Every input row is preserved, including
        rows whose index labels are duplicated.
    """
    import logging
    import string

    import nltk
    import pandas as pd
    from nltk.tokenize import word_tokenize

    # Configure logging
    logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

    # Ensure that the necessary NLTK models are downloaded
    nltk.download('punkt', quiet=True)
    nltk.download('averaged_perceptron_tagger', quiet=True)
    nltk.download('stopwords', quiet=True)

    # Penn Treebank tags for nouns, verbs, adjectives and adverbs.
    mb_tags = frozenset({'NN', 'NNS', 'NNP', 'NNPS', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ',
                         'JJ', 'JJR', 'JJS', 'RB', 'RBR', 'RBS'})

    def tokenize_text(text):
        try:
            return word_tokenize(text.lower())
        except (TypeError, AttributeError) as e:
            # Non-string input (e.g. NaN) has no .lower(); treat it as empty.
            logging.error(f"Error tokenizing text: {e}")
            return []

    def tag_words(words):
        try:
            return nltk.pos_tag(words)
        except Exception as e:
            logging.error(f"Error tagging words: {e}")
            return []

    def analyse(text):
        """Return ``(density, tagged_text)`` for one essay."""
        tokens = tokenize_text(text)
        words = [token for token in tokens if token.isalpha() or token in string.punctuation]
        tagged = tag_words(words)
        meaningful = sum(1 for _, tag in tagged if tag in mb_tags)
        density = meaningful / len(words) if words else 0.0
        tagged_text = " ".join(word + "/" + tag for word, tag in tagged)
        return density, tagged_text

    results = [analyse(text) for text in df_essays['text']]

    # Assign positionally against the existing index. The earlier
    # dict-keyed-by-index rebuild silently dropped rows whenever index
    # labels repeated (as they do after sampling with replacement).
    df_semantic_essays = df_essays.copy()
    df_semantic_essays['semantic_density'] = pd.Series(
        [density for density, _ in results], index=df_semantic_essays.index, dtype='float64')
    df_semantic_essays['text_tagged_nltk'] = pd.Series(
        [tagged_text for _, tagged_text in results], index=df_semantic_essays.index, dtype='object')
    return df_semantic_essays



'''Semantic Flow Variability'''
# =============================================================================


@PipelineDecorator.component(return_values=["df_semantic_essays"], name='Semantic Flow Variability - Features Pipeline',
                             cache=True, task_type=TaskTypes.data_processing)
def process_semantic_flow_variability(df):
    """Calculate Semantic Flow Variability for each text entry.

    For every essay the text is split into sentences, each sentence is
    embedded with ``sentence-transformers/all-MiniLM-L6-v2``, and the
    cosine similarity between each pair of consecutive sentences is
    computed. The feature is the standard deviation of those similarities.

    Args:
        df (pandas.DataFrame): DataFrame containing a 'text' column.

    Returns:
        pandas.DataFrame: The input DataFrame with an additional column
        'semantic_flow_variability'. The value is 0 for texts with fewer
        than two sentences and NaN when the model could not be loaded or a
        row failed. If ``df`` is None or has no 'text' column it is
        returned unchanged.
    """
    import concurrent.futures
    import logging

    import nltk
    import numpy as np
    from sentence_transformers import SentenceTransformer

    # Validate first so the model is not loaded for unusable input.
    if df is None or 'text' not in df:
        logging.error("Invalid DataFrame or missing 'text' column.")
        return df

    # sent_tokenize needs the punkt resource; this step may run in its own
    # environment, so fetch it here like the semantic-density step does.
    nltk.download('punkt', quiet=True)

    # Load a pre-trained sentence transformer model
    model_MiniLM = 'sentence-transformers/all-MiniLM-L6-v2'
    try:
        model = SentenceTransformer(model_MiniLM)
    except Exception as e:
        logging.error(f"Error loading the sentence transformer model: {e}")
        # Report the consequence once instead of once per row.
        logging.error("Model not loaded. Cannot compute semantic flow variability.")
        df['semantic_flow_variability'] = np.nan
        return df

    def cosine_similarity(v1, v2):
        return np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2))

    def semantic_flow_variability(text):
        try:
            # Split the text into sentences
            sentences = nltk.sent_tokenize(text)
            if len(sentences) < 2:
                logging.info("Not enough sentences for variability calculation.")
                return 0

            # Generate embeddings, then move them to CPU as a numpy array
            sentence_embeddings = model.encode(
                sentences, convert_to_tensor=True, show_progress_bar=False)
            sentence_embeddings = sentence_embeddings.cpu().numpy()

            # Cosine similarity between each pair of consecutive sentences
            similarities = [cosine_similarity(current, following)
                            for current, following in zip(sentence_embeddings[:-1], sentence_embeddings[1:])]

            # Standard deviation of the similarities is the variability
            return float(np.std(similarities))
        except Exception as e:
            logging.error(f"Error calculating semantic flow variability: {e}")
            return np.nan

    # executor.map preserves input order, so results line up with the rows.
    with concurrent.futures.ThreadPoolExecutor() as executor:
        df['semantic_flow_variability'] = list(
            executor.map(semantic_flow_variability, df['text']))

    df_semantic_essays = df
    return df_semantic_essays


'''Psycholinguistic Features'''
# =============================================================================


@PipelineDecorator.component(return_values=["df_psyco_essays"], name='Psycholinguistic Features - Features Pipeline',
                             cache=True, task_type=TaskTypes.data_processing)
def apply_empath_analysis(df, text_column='text'):
    """Apply Empath analysis to a DataFrame column, one column per category.

    Every text is passed to ``Empath().analyze(..., normalize=True)``, which
    returns a dictionary of categories and scores. The dictionaries are
    expanded into separate columns and appended to the frame.

    Args:
        df (pandas.DataFrame): The DataFrame to analyze.
        text_column (str): The name of the column containing text to analyze.

    Returns:
        pandas.DataFrame: A new DataFrame made of the original columns
        followed by the Empath category columns. If expanding the results
        fails, the error is logged and the original ``df`` is returned
        unchanged (best-effort behaviour kept from the original).
    """
    import logging

    import pandas as pd
    from empath import Empath

    # Initialize logging
    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(__name__)

    lexicon = Empath()

    def empath_analysis(text):
        try:
            # NOTE(review): analyze() appears to return None for text with
            # no tokens when normalize=True; fall back to an empty result
            # so the row becomes NaNs — confirm against the Empath version
            # in use.
            return lexicon.analyze(text, normalize=True) or {}
        except Exception as e:
            logger.error(f"Error during Empath analysis: {e}")
            return {}

    try:
        scores = [empath_analysis(text) for text in df[text_column]]
        # Reuse df's own index so the two frames align row by row, even
        # when index labels repeat.
        empath_results = pd.DataFrame(scores, index=df.index)
        # pd.concat is a module-level function; DataFrame has no .concat
        # method, so the previous call always failed and no feature was
        # ever added.
        df_psyco_essays = pd.concat([df, empath_results], axis=1)
        return df_psyco_essays
    except Exception as e:
        logger.error(f"Error applying Empath analysis to DataFrame: {e}")
        return df


'''Textrual Entropy'''
# =============================================================================


@PipelineDecorator.component(return_values=["df_essays"], name='Textual Entropy - Features Pipeline',
                             cache=True, task_type=TaskTypes.data_processing)
def process_textual_entropy(df):
    """Add the character-level Shannon entropy of every essay.

    The entropy of a text is computed from the frequency distribution of
    its characters: each frequency is turned into a probability and the
    entropy is the negative sum of ``p * log2(p)``.

    Args:
        df (pandas.DataFrame): Frame with a ``text`` column; modified in
            place.

    Returns:
        pandas.DataFrame: The same frame with a ``textual_entropy`` column.
            Empty or non-string texts get an entropy of 0.
        None: If ``df`` is not a DataFrame or the calculation fails.
    """
    import logging
    from collections import Counter

    import numpy as np
    import pandas as pd

    # Configure logging
    logging.basicConfig(level=logging.INFO,
                        format="%(asctime)s - %(levelname)s - %(message)s")
    logger = logging.getLogger(__name__)

    def calc_entropy(text):
        # A single bad cell (e.g. NaN) must not fail the whole column.
        if not isinstance(text, str) or not text:
            return 0.0
        length = len(text)
        probs = [freq / length for freq in Counter(text).values()]
        # Every probability is > 0 because Counter only stores seen chars.
        return float(-sum(p * np.log2(p) for p in probs))

    if not isinstance(df, pd.DataFrame):
        logger.warning("Input is not a DataFrame.")
        return None

    try:
        df["textual_entropy"] = df["text"].apply(calc_entropy)
    except Exception as e:
        logger.error(f"Error calculating entropy: {e}")
        return None

    df_entropy = df
    return df_entropy


'''Syntactic Tree Patterns'''
# =============================================================================
# Configure logging


@PipelineDecorator.component(return_values=["df_essays"], name='Syntactic Tree Patterns - Features Pipeline',
                             cache=True, task_type=TaskTypes.data_processing)
def process_syntactic_tree_patterns(df_essays):
    """Extract syntactic tree and surface-text features for every essay.

    Each essay is parsed with spaCy (``en_core_web_lg`` with the benepar
    component added to the pipeline). For every sentence shorter than 512
    T5 tokens, the spaCy dependency tree rooted at ``sent.root`` is
    converted to an ``nltk.Tree`` and measured: depth, branching factor,
    node and leaf counts, and node labels. Token, sentence, entity and
    noun-chunk statistics are computed from the spaCy ``Doc``.

    NOTE(review): the trees measured here come from the dependency parse;
    the benepar constituency parse added to the pipeline is not read in
    this function. Confirm whether it is still required.

    Args:
        df_essays (pandas.DataFrame): DataFrame containing a 'text' column
            with essays; modified in place.

    Returns:
        pandas.DataFrame: The same DataFrame with one column per extracted
        feature. Rows that fail to process are logged and left without
        values. If the spaCy model cannot be loaded the frame is returned
        unchanged.
    """
    import logging
    import traceback
    from collections import Counter

    import benepar
    import numpy as np
    import spacy
    from nltk import Tree
    from transformers import T5TokenizerFast

    # Configure logging
    logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
    logger = logging.getLogger(__name__)

    tokenizer = T5TokenizerFast.from_pretrained('t5-base', model_max_length=512, validate_args=False)
    try:
        nlp = spacy.load('en_core_web_lg')
        if spacy.__version__.startswith('2'):
            benepar.download('benepar_en3')
            nlp.add_pipe(benepar.BeneparComponent("benepar_en3"))
        else:
            nlp.add_pipe("benepar", config={"model": "benepar_en3"})
    except Exception as e:
        logger.error(f"Failed to load spaCy model: {e}")
        return df_essays

    def spacy_to_nltk_tree(node):
        """Convert a spaCy token and its dependents to a Tree (or a leaf string)."""
        if node.n_lefts + node.n_rights > 0:
            return Tree(node.orth_, [spacy_to_nltk_tree(child) for child in node.children])
        return node.orth_

    def tree_depth(node):
        if not isinstance(node, Tree):
            return 0
        return 1 + max(tree_depth(child) for child in node)

    def tree_branching_factor(node):
        return len(node) if isinstance(node, Tree) else 0

    def count_nodes(node):
        if not isinstance(node, Tree):
            return 1
        return 1 + sum(count_nodes(child) for child in node)

    def count_leaves(node):
        if not isinstance(node, Tree):
            return 1
        return sum(count_leaves(child) for child in node)

    def production_rules(node):
        """Collect the labels of all internal nodes below (and including) node."""
        rules = []
        if isinstance(node, Tree):
            rules.append(node.label())
            for child in node:
                rules.extend(production_rules(child))
        return rules

    def subtree_height(tree, side):
        """Length of the chain obtained by always following the given side."""
        if not isinstance(tree, Tree) or not tree:
            return 0
        if side == 'left':
            return 1 + subtree_height(tree[0], side)
        elif side == 'right':
            return 1 + subtree_height(tree[-1], side)

    def std_or_zero(values):
        # np.std([]) yields NaN plus a RuntimeWarning; use 0 like the
        # other empty-input defaults below.
        return float(np.std(values)) if values else 0.0

    # Process each essay and extract features
    for index, row in df_essays.iterrows():
        text = row['text']
        try:
            doc = nlp(text)
            sentences = list(doc.sents)
            noun_chunks = list(doc.noun_chunks)

            # Single-token sentences convert to plain strings and are skipped.
            trees = [spacy_to_nltk_tree(sent.root) for sent in sentences
                     if len(tokenizer.tokenize(sent.text)) < 512]
            trees = [tree for tree in trees if isinstance(tree, Tree)]

            # Tree features
            depths = [tree_depth(tree) for tree in trees]
            branching_factors = [tree_branching_factor(tree) for tree in trees]
            nodes = [count_nodes(tree) for tree in trees]
            leaves = [count_leaves(tree) for tree in trees]
            rule_counts = Counter(rule for tree in trees for rule in production_rules(tree))
            left_heights = [subtree_height(tree, 'left') for tree in trees]

            # Text analysis features
            num_sentences = len(sentences)
            num_tokens = len(doc)
            num_entities = len(doc.ents)
            num_noun_chunks = len(noun_chunks)
            total_token_length = sum(len(token.text) for token in doc)
            total_entity_length = sum(len(ent.text) for ent in doc.ents)
            total_noun_chunk_length = sum(len(chunk.text) for chunk in noun_chunks)

            features = {
                'num_sentences': num_sentences,
                'num_tokens': num_tokens,
                'num_unique_lemmas': len({token.lemma_ for token in doc}),
                'average_token_length': total_token_length / num_tokens if num_tokens > 0 else 0,
                'average_sentence_length': num_tokens / num_sentences if num_sentences > 0 else 0,
                'num_entities': num_entities,
                'num_noun_chunks': num_noun_chunks,
                'num_pos_tags': len({token.pos_ for token in doc}),
                'num_distinct_entities': len({ent.text for ent in doc.ents}),
                'average_entity_length': total_entity_length / num_entities if num_entities > 0 else 0,
                'average_noun_chunk_length': total_noun_chunk_length / num_noun_chunks if num_noun_chunks > 0 else 0,
                'max_depth': max(depths) if depths else 0,
                'avg_branching_factor': np.mean(branching_factors) if branching_factors else 0,
                'total_nodes': sum(nodes),
                'total_leaves': sum(leaves),
                'unique_rules': len(rule_counts),
                'most_common_rule': rule_counts.most_common(1)[0][0] if rule_counts else None,
                'tree_complexity': sum(nodes) / sum(leaves) if leaves else 0,
                'depth_variability': std_or_zero(depths),
                'tree_height_variability': std_or_zero(left_heights),
            }

            # Assign calculated feature values to the DataFrame
            for feature_name, value in features.items():
                df_essays.at[index, feature_name] = value

        except Exception as e:
            logger.error(f"Error processing text: {e}")
            traceback.print_exc()

    return df_essays


@PipelineDecorator.component(name='Upload Generated Features Data',
                             cache=True, task_type=TaskTypes.data_processing)
def upload_dataset_from_dataframe(df, new_dataset_name, description="", tags=None, file_name="dataset.pkl"):
    """Pickle ``df`` and publish it as a new ClearML dataset.

    Args:
        df (pandas.DataFrame): Data to upload.
        new_dataset_name (str): Name of the ClearML dataset to create.
        description (str): Optional dataset description.
        tags (list[str] | None): Optional tags to attach to the dataset.
        file_name (str): Local file the frame is pickled to before upload.

    Raises:
        Exception: Any failure while pickling, creating, uploading or
            finalizing the dataset is logged and re-raised, so a failed
            upload is no longer reported (or cached) as a success.
    """
    import logging
    from pathlib import Path

    import pandas as pd
    from clearml import Dataset

    logging.basicConfig(level=logging.DEBUG)

    try:
        print(df.head())
        file_path = Path(file_name)
        pd.to_pickle(df, file_path)

        new_dataset = Dataset.create(dataset_project='LLM-detect-ai-gen-text-LIVE/dev/training_data',
                                     dataset_name=new_dataset_name)
        new_dataset.add_files(str(file_path))
        if description:
            new_dataset.set_description(description)
        if tags:
            new_dataset.add_tags(tags)
        new_dataset.upload()
        new_dataset.finalize()
    except Exception:
        logging.exception(f"Failed to upload dataset '{new_dataset_name}'")
        raise

    # Only reached when every step above succeeded.
    print(f"New dataset '{new_dataset_name}' uploaded and finalized with description and tags.")

# Create Pipeline
@PipelineDecorator.pipeline(name="Preprocessing Pipeline - Feature Generation", project="LLM-detect-ai-gen-text-LIVE/dev/features_pipeline")
def execute_features_pipeline(dataset_project='LLM-detect-ai-gen-text/datasets', file_name="training_sample.pkl"):
    """Run the feature-generation pipeline end to end.

    Downloads the ``training_sample`` dataset, draws a balanced sample,
    cleans it, runs every feature step in sequence (each step receives the
    previous step's output), then pickles the result locally and uploads it
    as a ClearML dataset.

    Args:
        dataset_project: ClearML project the source dataset is read from.
        file_name: Name of the pickle file inside the source dataset.
    """
    # NOTE(review): logging and numpy are not referenced in this body; kept
    # in case the pipeline packaging relies on them — confirm before removing.
    import logging
    import numpy as np
    import pandas as pd

    # Download the dataset to use manually. Can do this via a ClearML DataSet or a Local File
    df_essays = download_dataset_as_dataframe('training_sample', dataset_project, file_name)

    print("df_essays:\n" + df_essays.head().to_string())
    # DataFrame.info() prints its report itself and returns None, so it is
    # called directly rather than wrapped in print().
    df_essays.info()

    # Sample just for this test: 500 rows per label (with replacement, fixed
    # seed), then drop rows that have no text.
    df_essays = pd.concat([
        df_essays[df_essays['label'] == 1].sample(n=500, replace=True, random_state=42),
        df_essays[df_essays['label'] == 0].sample(n=500, replace=True, random_state=42)
    ]).dropna(subset=['text'])

    print("df_essays:\n" + df_essays.head().to_string())
    df_essays.info()

    df_essays = pipeline_etl_clean_data(df_essays)

    # Readability Scores
    # =============================================================================
    df_readability_essays = process_readability_scores(df_essays)

    # Semantic Density
    # =============================================================================
    df_semantic_essays = process_semantic_density(df_readability_essays)

    # Semantic Flow Variability
    # =============================================================================
    df_variability_essays = process_semantic_flow_variability(df_semantic_essays)

    # Psycholinguistic Features
    # =============================================================================
    df_psyco_essays = apply_empath_analysis(df_variability_essays)

    # Textual Entropy
    # =============================================================================
    df_entropy = process_textual_entropy(df_psyco_essays)

    # Syntactic Tree Patterns
    # =============================================================================
    # Feed the entropy step's output forward; passing df_psyco_essays here
    # discarded the result of process_textual_entropy.
    df_essays = process_syntactic_tree_patterns(df_entropy)

    features_path = f'{CFG.SCRATCH_PATH}/training_data/training_features_data.pkl'
    df_essays.to_pickle(features_path)

    # Save Features
    # =============================================================================
    upload_dataset_from_dataframe(df_essays, "training_feature_data",
                                  "Training Feature Data", ["features", "training", "cleaned"],
                                  features_path)
    print("process completed")
    
if __name__ == "__main__":
    # Source of the essays DataFrame the pipeline operates on.
    dataset_project = 'LLM-detect-ai-gen-text/datasets'
    file_name = "training_sample.pkl"

    # Switch the pipeline decorator to local execution, then launch the pipeline.
    PipelineDecorator.run_locally()
    execute_features_pipeline(dataset_project=dataset_project, file_name=file_name)
    print("process completed")


ClearML Task: created new task id=96b106ef90584c4c83ffb31e161904de
2024-01-03 12:52:05,858 - clearml.Task - INFO - Storing jupyter notebook directly as code
ClearML results page: https://app.clear.ml/projects/05628220ba2f4a13b629cdaf968e304d/experiments/96b106ef90584c4c83ffb31e161904de/output/log


2024-01-03 12:52:08,774 - INFO - No repository found, storing script code instead


ClearML pipeline page: https://app.clear.ml/pipelines/05628220ba2f4a13b629cdaf968e304d/experiments/96b106ef90584c4c83ffb31e161904de


2024-01-03 12:52:14,176 - INFO - No repository found, storing script code instead
2024-01-03 12:52:18,922 - INFO - No repository found, storing script code instead
2024-01-03 12:52:23,633 - INFO - No repository found, storing script code instead
2024-01-03 12:52:27,826 - INFO - No repository found, storing script code instead
2024-01-03 12:52:32,302 - INFO - No repository found, storing script code instead
2024-01-03 12:52:36,741 - INFO - No repository found, storing script code instead
2024-01-03 12:52:41,825 - INFO - No repository found, storing script code instead


Launching step [Pull Training Data]
df_essays:
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                         

## Extract Notebook Content

In [1]:
# import nbformat

# def extract_headers_and_text(notebook_path):
#     # Load the notebook
#     notebook = nbformat.read(notebook_path, as_version=4)

#     headers_and_text = []

#     # Iterate over the cells
#     for cell in notebook.cells:
#         # Check if the cell type is 'markdown'
#         if cell['cell_type'] == 'markdown':
#             # Extract the content
#             headers_and_text.append(cell['source'])

#     return headers_and_text

# Use the function
#headers_and_text = extract_headers_and_text('ai-or-human-full.ipynb')
# for text in headers_and_text:
#     print(text)


## For Speed

In [None]:
import pandas as pd
import logging
import textstat
import markdown
from bs4 import BeautifulSoup
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sentence_transformers import SentenceTransformer
from empath import Empath
import numpy as np
from collections import Counter
import spacy
import benepar
from nltk import Tree
from transformers import T5TokenizerFast
from tqdm import tqdm

class TextProcessingPipeline:
    """Single-process skeleton of the feature pipeline.

    Loads a pickled essays DataFrame, runs the cleaning and feature steps in
    order, and pickles the result. The individual processing methods called
    by ``run_pipeline`` are placeholders that are not defined in this class
    body; they must be added before the pipeline can run.

    Args:
        dataset_path: Path of the pickled DataFrame read by ``run_pipeline``.
        output_path: Path the processed DataFrame is pickled to. Defaults to
            ``'processed_data.pkl'``.
    """

    def __init__(self, dataset_path, output_path='processed_data.pkl'):
        self.dataset_path = dataset_path
        self.output_path = output_path
        self.df_essays = None
        self.initialize_nlp_tools()

    def initialize_nlp_tools(self):
        """Download the NLTK resources and load the spaCy/benepar/T5 tools."""
        nltk.download('punkt', quiet=True)
        nltk.download('wordnet', quiet=True)
        nltk.download('averaged_perceptron_tagger', quiet=True)
        nltk.download('stopwords', quiet=True)
        self.nlp = spacy.load('en_core_web_lg')
        # Attach the benepar component to the spaCy pipeline.
        self.nlp.add_pipe("benepar", config={"model": "benepar_en3"})
        self.tokenizer = T5TokenizerFast.from_pretrained('t5-base', model_max_length=512, validate_args=False)

    # Define other methods (e.g., remove_markdown, process_text, process_readability_scores, etc.)

    def run_pipeline(self):
        """Load the dataset, apply every step in order, and save the result.

        Each step receives the DataFrame produced by the previous one. The
        final DataFrame is kept on ``self.df_essays`` and pickled to
        ``self.output_path``.
        """
        # Load dataset
        self.df_essays = pd.read_pickle(self.dataset_path)

        # Clean data
        self.df_essays = self.pipeline_etl_clean_data(self.df_essays)

        # Generate features
        self.df_essays = self.process_readability_scores(self.df_essays)
        self.df_essays = self.process_semantic_density(self.df_essays)
        self.df_essays = self.process_semantic_flow_variability(self.df_essays)
        self.df_essays = self.apply_empath_analysis(self.df_essays)
        self.df_essays = self.process_textual_entropy(self.df_essays)
        self.df_essays = self.process_syntactic_tree_patterns(self.df_essays)

        # Save processed data
        self.df_essays.to_pickle(self.output_path)

    # Implement other processing methods here

if __name__ == "__main__":
    # Placeholder location; point this at the pickled essays DataFrame.
    dataset_pickle = 'path_to_your_dataset.pkl'
    pipeline = TextProcessingPipeline(dataset_pickle)
    pipeline.run_pipeline()
