# Connecting and Uploading

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
cd drive/MyDrive/Colab/LLMs Project

In [None]:
#loading:
import pandas as pd

Dataset      = pd.read_pickle("Dataset_generated_texts_gpt3_09.13.2025.pkl")
Dataset_lora = pd.read_pickle("Dataset_generated_texts_gpt3_lora_09.13.2025.pkl")

In [None]:
print(Dataset.shape)
Dataset.columns

In [None]:
print(Dataset_lora.shape)
Dataset_lora.columns

# Running CoreNLP

In [None]:
#loading:
import pandas as pd

generated_texts_gpt3      = pd.read_csv("generated_texts_gpt3.csv")
generated_texts_gpt3_lora = pd.read_csv("generated_texts_gpt3_lora.csv")

In [None]:
# Cleaning:

In [None]:
# 1) Rename "Text" -> "Taged_Text".
#    Guarded so that re-running this cell does not rename the already-cleaned
#    "Text" column as well, which would leave two "Taged_Text" columns behind.
if "Taged_Text" not in generated_texts_gpt3.columns:
    generated_texts_gpt3 = generated_texts_gpt3.rename(columns={"Text": "Taged_Text"})

# 2) Create "Label" from the leading <number>
generated_texts_gpt3["Label"] = (
    generated_texts_gpt3["Taged_Text"]
    .str.extract(r'^\s*<\s*(\d+)\s*>')        # grab the number at start like <0>
    .astype("Int64")                           # nullable integer dtype
)

# 3) Map label -> author name
author_map = {
    0: "Charles Dickens",
    1: "Mark Twain",
    2: "Herman Melville",
    3: "Jane Austen",
    4: "Louisa May Alcott",
}
generated_texts_gpt3["Author"] = generated_texts_gpt3["Label"].map(author_map)

# 4) Make a cleaned "Text" column:
#    - remove the leading <number> tag
#    - remove "<end>" and everything after it (case-insensitive)
#    - trim whitespace
#    The inline (?s) flag lets "." match newlines. Without it, an "<end>" that
#    is followed by further lines was never matched and stayed in the text.
generated_texts_gpt3["Text"] = (
    generated_texts_gpt3["Taged_Text"]
    .str.replace(r'^\s*<\s*\d+\s*>\s*', '', regex=True)
    .str.replace(r'(?s)\s*<end>.*', '', regex=True, case=False)
    .str.strip()
)

In [None]:
# 1) Rename "Text" -> "Taged_Text".
#    Guarded so that re-running this cell does not rename the already-cleaned
#    "Text" column as well, which would leave two "Taged_Text" columns behind.
if "Taged_Text" not in generated_texts_gpt3_lora.columns:
    generated_texts_gpt3_lora = generated_texts_gpt3_lora.rename(columns={"Text": "Taged_Text"})

# 2) Create "Label" from the leading <number>
generated_texts_gpt3_lora["Label"] = (
    generated_texts_gpt3_lora["Taged_Text"]
    .str.extract(r'^\s*<\s*(\d+)\s*>')        # grab the number at start like <0>
    .astype("Int64")                           # nullable integer dtype
)

# 3) Map label -> author name
author_map = {
    0: "Charles Dickens",
    1: "Mark Twain",
    2: "Herman Melville",
    3: "Jane Austen",
    4: "Louisa May Alcott",
}
generated_texts_gpt3_lora["Author"] = generated_texts_gpt3_lora["Label"].map(author_map)

# 4) Make a cleaned "Text" column:
#    - remove the leading <number> tag
#    - remove "<end>" and everything after it (case-insensitive)
#    - trim whitespace
#    The inline (?s) flag lets "." match newlines. Without it, an "<end>" that
#    is followed by further lines was never matched and stayed in the text.
generated_texts_gpt3_lora["Text"] = (
    generated_texts_gpt3_lora["Taged_Text"]
    .str.replace(r'^\s*<\s*\d+\s*>\s*', '', regex=True)
    .str.replace(r'(?s)\s*<end>.*', '', regex=True, case=False)
    .str.strip()
)

In [None]:
# https://colab.research.google.com/github/stanfordnlp/stanza/blob/master/demo/Stanza_CoreNLP_Interface.ipynb#scrollTo=WP4Dz6PIJHeL

In [None]:
!pip install stanza

In [None]:
import stanza

In [None]:
rm -rf corenlp

In [None]:
# Download the Stanford CoreNLP package with Stanza's installation command
corenlp_dir = './corenlp'
stanza.install_corenlp(dir=corenlp_dir)

In [None]:
# Set the CORENLP_HOME environment variable to point to the installation location
import os
os.environ["CORENLP_HOME"] = corenlp_dir

In [None]:
!ls $CORENLP_HOME

In [None]:
!export CORENLP_HOME=path_to_corenlp_dir

In [None]:
# Import client module
from stanza.server import CoreNLPClient

In [None]:
# Construct a CoreNLPClient with some basic annotators, a memory allocation of 4GB, and port number 9001
client = CoreNLPClient(
    timeout=70000,
    #annotators=['tokenize','ssplit', 'pos', 'lemma', 'ner'],
    annotators=['tokenize','ssplit','pos','lemma','ner', 'parse', 'depparse','coref'],
    memory='4G',
    endpoint='http://localhost:9001',
    be_quiet=True)
print(client)

In [None]:
# Start the background server and wait for some time
# Note that in practice this is totally optional, as by default the server will be started when the first annotation is performed
client.start()
import time; time.sleep(10)

In [None]:
# Print background processes and look for java
!ps -o pid,cmd | grep java

In [None]:
# CoreNLP on dataset:

#generated_texts_gpt3

text_sentences_gpt3 = generated_texts_gpt3['Text'].tolist()

# One entry per sample: the annotated document, or the sample's own index as a
# placeholder when annotation failed (placeholders are removed in later cells).
text_sentences_gpt3_pared = []
index_of_error_gpt3 = []

print("Total number of Samples:", len(generated_texts_gpt3))

for i, text in enumerate(text_sentences_gpt3):
    if i % 1000 == 0:
        print(i)
    try:
        document = client.annotate(text)
    except Exception:
        # Only ordinary errors are recorded; a bare `except:` would also swallow
        # KeyboardInterrupt and make this long loop impossible to stop.
        index_of_error_gpt3.append(i)
        text_sentences_gpt3_pared.append(i)
    else:
        text_sentences_gpt3_pared.append(document)

# NOTE(review): these two names are not used by any cell visible here; kept in
# case something outside this view relies on them.
text_sentences_lora_pared = []
index_of_error_lora = []

In [None]:
#generated_texts_gpt3_lora

text_sentences_gpt3_lora = generated_texts_gpt3_lora['Text'].tolist()

# One entry per sample: the annotated document, or the sample's own index as a
# placeholder when annotation failed (placeholders are removed in later cells).
text_sentences_gpt3_lora_pared = []
index_of_error_gpt3_lora = []

print("Total number of Samples:", len(generated_texts_gpt3_lora))

for i, text in enumerate(text_sentences_gpt3_lora):
    if i % 1000 == 0:
        print(i)
    try:
        document = client.annotate(text)
    except Exception:
        # Only ordinary errors are recorded; a bare `except:` would also swallow
        # KeyboardInterrupt and make this long loop impossible to stop.
        text_sentences_gpt3_lora_pared.append(i)
        index_of_error_gpt3_lora.append(i)
    else:
        text_sentences_gpt3_lora_pared.append(document)

In [None]:
# Removing errors:

print(index_of_error_gpt3)
print(index_of_error_gpt3_lora)

In [None]:
# Remove from Dataset:
generated_texts_gpt3 = generated_texts_gpt3.drop(index=index_of_error_gpt3)
generated_texts_gpt3.reset_index(drop=True, inplace=True)

generated_texts_gpt3_lora = generated_texts_gpt3_lora.drop(index=index_of_error_gpt3_lora)
generated_texts_gpt3_lora.reset_index(drop=True, inplace=True)

In [None]:
# Remove the elements with the bigger index first, so that deleting by position
# does not shift the entries that still have to be deleted.
# sorted(..., reverse=True) is used instead of list.reverse(): an in-place
# reverse flips the order back to ascending if this cell is run a second time.
index_of_error_gpt3 = sorted(index_of_error_gpt3, reverse=True)
index_of_error_gpt3_lora = sorted(index_of_error_gpt3_lora, reverse=True)

In [None]:
print(index_of_error_gpt3)
print(index_of_error_gpt3_lora)

In [None]:
for x in index_of_error_gpt3:
  del text_sentences_gpt3_pared[x]

In [None]:
for x in index_of_error_gpt3_lora:
  del text_sentences_gpt3_lora_pared[x]

In [None]:
# Adding CoreNLP to Dataset:

def _first_sentence_features(documents):
    """Collect the per-sample CoreNLP columns from annotated documents.

    Only the first sentence of every document is used. Each value is stored as
    a string: the parse tree, the list of POS tags, and the enhanced++
    dependency edges (later cells parse these strings back into Python objects).
    """
    first_sentences = [document.sentence[0] for document in documents]
    return {
        'parse_tree': [str(sent.parseTree) for sent in first_sentences],
        'pos':        [str([t.pos for t in sent.token]) for sent in first_sentences],
        # Stored as text straight away: writing the list-like protobuf container
        # into a single DataFrame cell depends on pandas' setitem rules for
        # iterables, and the next cells convert this column to str anyway.
        'dependency': [str(sent.enhancedPlusPlusDependencies.edge) for sent in first_sentences],
    }


# Whole columns are assigned at once instead of cell by cell with .loc; pandas
# raises if a list's length does not match the number of rows.
for _frame, _documents in (
    (generated_texts_gpt3, text_sentences_gpt3_pared),
    (generated_texts_gpt3_lora, text_sentences_gpt3_lora_pared),
):
    for _column, _values in _first_sentence_features(_documents).items():
        _frame[_column] = _values

In [None]:
generated_texts_gpt3.head(1)

In [None]:
generated_texts_gpt3_lora.head(1)

In [None]:
# "pos"           string --> list
# "dependency"    string --> list of dictionary

In [None]:
import ast

generated_texts_gpt3["pos"] = generated_texts_gpt3["pos"].apply(lambda x: ast.literal_eval(x))
generated_texts_gpt3_lora["pos"] = generated_texts_gpt3_lora["pos"].apply(lambda x: ast.literal_eval(x))

In [None]:
import re
import pandas as pd

def parse_dependency(dep_str):
    """
    Convert a dependency string into a list of dictionaries.

    The input is the text form of a list of edges, e.g.
    '[source: 2\\ntarget: 1\\ndep: "nsubj"\\n, source: 2\\n...]'.
    Every edge becomes one dict; values that read as booleans or integers are
    converted, everything else stays a string with surrounding quotes removed.

    Parameters
    ----------
    dep_str : str
        Text representation of the edges.

    Returns
    -------
    list of dict
        One dict per edge; an empty list for any non-string input.
    """
    # The type check replaces the former `pd.isna(dep_str) or ...` test:
    # pd.isna() on a list-like returns an array whose truth value is ambiguous,
    # so already-parsed (list) values raised instead of returning [].
    if not isinstance(dep_str, str):
        return []

    # Remove outer square brackets if present
    dep_str = dep_str.strip().lstrip("[").rstrip("]")

    # Split into blocks by ', source:' while keeping the first 'source:'
    parts = re.split(r',\s*source:', dep_str)
    parts = [p if p.strip().startswith("source:") else "source: " + p.strip() for p in parts]

    dep_list = []
    for block in parts:
        entry = {}
        # Match key: value (handles quoted strings)
        for key, raw_value in re.findall(r'(\w+):\s*("?[^"\n]+"?|\w+)', block):
            value = raw_value.strip('"')
            # Convert booleans and ints
            lowered = value.lower()
            if lowered == "true":
                value = True
            elif lowered == "false":
                value = False
            else:
                try:
                    value = int(value)
                except ValueError:
                    pass
            entry[key] = value
        if entry:
            dep_list.append(entry)

    return dep_list


In [None]:
generated_texts_gpt3["dependency"] = generated_texts_gpt3["dependency"].astype(str)   # or convert to dict/JSON if you have a better serializer
generated_texts_gpt3_lora["dependency"] = generated_texts_gpt3_lora["dependency"].astype(str)   # or convert to dict/JSON if you have a better serializer

In [None]:
generated_texts_gpt3["dependency"] = generated_texts_gpt3["dependency"].apply(lambda x: parse_dependency(x))
generated_texts_gpt3_lora["dependency"] = generated_texts_gpt3_lora["dependency"].apply(lambda x: parse_dependency(x))

In [None]:
# Saving

In [None]:
generated_texts_gpt3.to_pickle("generated_texts_gpt3_09.13.2025.pkl")
generated_texts_gpt3_lora.to_pickle("generated_texts_gpt3_lora_09.13.2025.pkl")

In [None]:
print("Saved (^_^)")

In [None]:
# Shut down the background CoreNLP server
client.stop()

time.sleep(10)
!ps -o pid,cmd | grep java

# Features (Paper 1)

In [None]:
Dataset = generated_texts_gpt3.copy()

In [None]:
Dataset_lora = generated_texts_gpt3_lora.copy()

In [None]:
# Character count and whitespace-separated word count of every cleaned text.
sentences_len = [len(sentence) for sentence in Dataset["Text"]]
sentences_words = [len(sentence.split()) for sentence in Dataset["Text"]]

# Add as new columns to the existing DataFrame
Dataset["Length"] = sentences_len
Dataset["Words"] = sentences_words

In [None]:
# Character count and whitespace-separated word count of every cleaned text.
sentences_len = [len(sentence) for sentence in Dataset_lora["Text"]]
sentences_words = [len(sentence.split()) for sentence in Dataset_lora["Text"]]

# Add as new columns to the existing DataFrame
Dataset_lora["Length"] = sentences_len
Dataset_lora["Words"] = sentences_words

In [None]:
Dataset.head(1)

In [None]:
Dataset_lora.head(1)

## Part of Speech

In [None]:
# Count POS-tag families per sample and turn them into per-word shares.
# Rows are addressed by the labels 0..len(Dataset)-1.
for i in range(len(Dataset)):

    # List of POS tags of this sample (the "pos" column).
    tokens = Dataset.loc[i, "pos"]

    # Each count is the number of tags starting with the given prefix.
    Dataset.at[i, 'verb_count']   = sum(1 for t in tokens if t.startswith('V'))
    Dataset.at[i, 'adverb_count'] = sum(1 for t in tokens if t.startswith('RB'))
    Dataset.at[i, 'noun_count']   = sum(1 for t in tokens if t.startswith('N'))
    Dataset.at[i, 'coord_count']  = sum(1 for t in tokens if t.startswith('CC'))
    Dataset.at[i, 'subord_count'] = sum(1 for t in tokens if t.startswith('IN'))

# Shares relative to the "Words" column, rounded to two decimals.
# NOTE(review): at this point "Words" is the whitespace-split count, while the
# tag counts come from the tokenised "pos" list, so the two bases can differ.
Dataset['verb_percent']        =  round(Dataset['verb_count']/Dataset['Words'], 2)
Dataset['adverb_percent']      =  round(Dataset['adverb_count']/Dataset['Words'], 2)
Dataset['noun_percent']        =  round(Dataset['noun_count']/Dataset['Words'], 2)
Dataset['coord_percent']  =  round(Dataset['coord_count']/Dataset['Words'], 2)
Dataset['subord_percent'] =  round(Dataset['subord_count']/Dataset['Words'], 2)

# Drop the loop variables from the notebook namespace.
# NOTE(review): this raises NameError if Dataset is empty (the loop never runs).
del i, tokens

In [None]:
# Count POS-tag families per sample and turn them into per-word shares.
# Rows are addressed by the labels 0..len(Dataset_lora)-1.
for i in range(len(Dataset_lora)):

    # List of POS tags of this sample (the "pos" column).
    tokens = Dataset_lora.loc[i, "pos"]

    # Each count is the number of tags starting with the given prefix.
    Dataset_lora.at[i, 'verb_count']   = sum(1 for t in tokens if t.startswith('V'))
    Dataset_lora.at[i, 'adverb_count'] = sum(1 for t in tokens if t.startswith('RB'))
    Dataset_lora.at[i, 'noun_count']   = sum(1 for t in tokens if t.startswith('N'))
    Dataset_lora.at[i, 'coord_count']  = sum(1 for t in tokens if t.startswith('CC'))
    Dataset_lora.at[i, 'subord_count'] = sum(1 for t in tokens if t.startswith('IN'))

# Shares relative to the "Words" column, rounded to two decimals.
# NOTE(review): at this point "Words" is the whitespace-split count, while the
# tag counts come from the tokenised "pos" list, so the two bases can differ.
Dataset_lora['verb_percent']        =  round(Dataset_lora['verb_count']/Dataset_lora['Words'], 2)
Dataset_lora['adverb_percent']      =  round(Dataset_lora['adverb_count']/Dataset_lora['Words'], 2)
Dataset_lora['noun_percent']        =  round(Dataset_lora['noun_count']/Dataset_lora['Words'], 2)
Dataset_lora['coord_percent']  =  round(Dataset_lora['coord_count']/Dataset_lora['Words'], 2)
Dataset_lora['subord_percent'] =  round(Dataset_lora['subord_count']/Dataset_lora['Words'], 2)

# Drop the loop variables from the notebook namespace.
# NOTE(review): this raises NameError if Dataset_lora is empty (the loop never runs).
del i, tokens

## Path and Depth

In [None]:
# Calculate the longest path between two leaves in the tree.

def longest_path(node):
    """
    Longest leaf-to-leaf path (in edges) inside the subtree rooted at `node`.

    Parameters
    ----------
    node : dict
        Tree node; its sub-nodes are listed under the "children" key.

    Returns
    -------
    (int, int)
        The longest path between two leaves anywhere in the subtree, and the
        depth of the subtree (edges from `node` down to its deepest leaf).
        A leaf returns (0, 0).
    """
    children = node.get("children")
    if not children:
        # Leaf node, no children
        return 0, 0

    best_below = 0      # longest path found entirely inside one child subtree
    heights = []        # edges from this node to the deepest leaf, per child

    for child in children:
        child_path, child_depth = longest_path(child)
        best_below = max(best_below, child_path)
        heights.append(child_depth + 1)

    heights.sort(reverse=True)

    # A path can only bend at this node if it has at least two children.
    # Previously a single child was paired with a phantom second branch of
    # depth 0 (+2 edges), which overstated the path through unary chains.
    through_here = heights[0] + heights[1] if len(heights) > 1 else 0

    return max(best_below, through_here), heights[0]


def tree_depth(node):
    """
    Number of nodes on the longest root-to-leaf chain of the tree.

    A node without (or with an empty) "children" entry counts as depth 1.
    """
    children = node.get("children")
    if not children:
        return 1

    deepest_child = 0
    for child in children:
        deepest_child = max(deepest_child, tree_depth(child))
    return deepest_child + 1


import re

def build_tree(data):
    """
    Parse the text form of a parse tree into nested dicts.

    Each "child {" line opens a new node and each "}" line closes the current
    one. Lines of the form `value: ...` or `score: ...` are stored as string
    attributes on the node that is currently open. Every node is a dict with a
    "children" list plus whatever attributes were found.

    Returns the first node opened at the top level; attributes written at the
    top level itself end up on a synthetic root and are not returned.
    """
    attribute = re.compile(r'(value|score): "?(.*?)"?\s*$')

    root = {"children": []}
    node = root
    parents = []        # nodes that are open above the current one

    for raw_line in data.split('\n'):
        text = raw_line.strip()

        if text == "}":
            # Current node is finished, go back to its parent.
            node = parents.pop()
            continue

        if text.startswith("child {"):
            fresh = {"children": []}
            node["children"].append(fresh)
            parents.append(node)
            node = fresh
            continue

        found = attribute.search(text)
        if found:
            node[found.group(1)] = found.group(2)

    return root["children"][0]

In [None]:
def longest_path_and_depth(node):
    """Return (longest leaf-to-leaf path, depth) for the parse tree given as text.

    `node` is converted with str(), parsed by build_tree, and measured with
    longest_path and tree_depth.
    """
    parsed = build_tree(str(node))
    path_length = longest_path(parsed)[0]
    return path_length, tree_depth(parsed)

In [None]:
Dataset['longest_path_parstree'], Dataset['depth_parstree'] = zip(*Dataset['parse_tree'].apply(longest_path_and_depth))

In [None]:
Dataset_lora['longest_path_parstree'], Dataset_lora['depth_parstree'] = zip(*Dataset_lora['parse_tree'].apply(longest_path_and_depth))

## Questions

In [None]:
def questions (node):
    """Flag question constituents in the text form of a parse tree.

    Returns (has_SQ, has_SBARQ) as 0/1 integers, depending on whether a node
    labelled "SQ" / "SBARQ" occurs anywhere in str(node).
    """
    text = str(node)
    return int('value: "SQ"' in text), int('value: "SBARQ"' in text)

In [None]:
Dataset['questions'], Dataset['questions_wh'] = zip(*Dataset['parse_tree'].apply(questions))

In [None]:
Dataset_lora['questions'], Dataset_lora['questions_wh'] = zip(*Dataset_lora['parse_tree'].apply(questions))

## Pronouns

In [None]:
def pronouns  (node):
    """Count personal pronouns in the text form of a parse tree (case-insensitive).

    Returns (first, second, third): occurrences of node values i/we, you, and
    he/she/it/they respectively.
    """
    text = node.lower()

    def occurrences(*words):
        # Number of nodes whose value is exactly one of `words`.
        return sum(text.count('value: "%s"' % word) for word in words)

    return (
        occurrences("i", "we"),
        occurrences("you"),
        occurrences("he", "she", "it", "they"),
    )

In [None]:
Dataset['PRP_first'], Dataset['PRP_second'], Dataset['PRP_third'] = zip(*Dataset['parse_tree'].apply(pronouns))

In [None]:
Dataset_lora['PRP_first'], Dataset_lora['PRP_second'], Dataset_lora['PRP_third'] = zip(*Dataset_lora['parse_tree'].apply(pronouns))

In [None]:
Dataset['PRP_first_percent']   =  round(Dataset['PRP_first']/Dataset['Words'], 2)
Dataset['PRP_second_percent']  =  round(Dataset['PRP_second']/Dataset['Words'], 2)
Dataset['PRP_third_percent']   =  round(Dataset['PRP_third']/Dataset['Words'], 2)

In [None]:
Dataset_lora['PRP_first_percent']   =  round(Dataset_lora['PRP_first']/Dataset_lora['Words'], 2)
Dataset_lora['PRP_second_percent']  =  round(Dataset_lora['PRP_second']/Dataset_lora['Words'], 2)
Dataset_lora['PRP_third_percent']   =  round(Dataset_lora['PRP_third']/Dataset_lora['Words'], 2)

## Phrases

In [None]:
def phrases (node):
    """Count noun-phrase and quantifier-phrase nodes in a parse tree string.

    Returns (number of "NP" node labels, number of "QP" node labels).
    """
    return tuple(node.count('value: "%s"' % label) for label in ("NP", "QP"))

In [None]:
Dataset['noun_phrase'], Dataset['quantifier_phrase'] = zip(*Dataset['parse_tree'].apply(phrases))

In [None]:
Dataset_lora['noun_phrase'], Dataset_lora['quantifier_phrase'] = zip(*Dataset_lora['parse_tree'].apply(phrases))

In [None]:
# Dropping quantifier_phrase
Dataset.drop(columns=['quantifier_phrase'], inplace=True)

In [None]:
# Dropping quantifier_phrase
Dataset_lora.drop(columns=['quantifier_phrase'], inplace=True)

# Features (Paper 2)

#### Number of Words

In [None]:
Dataset['Words'] = Dataset['pos'].apply(lambda x: len(x))

In [None]:
Dataset_lora['Words'] = Dataset_lora['pos'].apply(lambda x: len(x))

#### 1. Passive Sentences

In [None]:
import spacy

# Load the language model
nlp = spacy.load('en_core_web_sm')

In [None]:
import pandas as pd

def is_passive(sentence):
    """Return True if any token of the spaCy parse has the dependency "auxpass"."""
    return any(token.dep_ == "auxpass" for token in nlp(sentence))


def is_agentless_passive(sentence):
    """Return True for a passive sentence without an agent.

    That is: the spaCy parse contains an "auxpass" dependency but no "agent"
    dependency.
    """
    labels = {token.dep_ for token in nlp(sentence)}
    return "auxpass" in labels and "agent" not in labels

In [None]:
def passive_detector(sentence):
    """Classify a sentence by voice using the spaCy dependency labels.

    Returns
    -------
    str
        "Agentless Passive" - an "auxpass" dependency and no "agent" dependency
        "Passive"           - an "auxpass" dependency together with an "agent"
        "Active"            - no "auxpass" dependency
    """
    # The sentence is parsed once here. Going through is_agentless_passive()
    # and then is_passive() ran the spaCy pipeline twice for every passive
    # sentence that has an agent.
    has_auxpass = False
    has_agent = False
    for token in nlp(sentence):
        if token.dep_ == "auxpass":
            has_auxpass = True
        elif token.dep_ == "agent":
            has_agent = True

    if not has_auxpass:
        return "Active"
    return "Passive" if has_agent else "Agentless Passive"

In [None]:
# Apply the function to the 'Text' column
Dataset.loc[:, 'Passive'] = Dataset['Text'].apply(passive_detector)

In [None]:
# Apply the function to the 'Text' column
Dataset_lora.loc[:, 'Passive'] = Dataset_lora['Text'].apply(passive_detector)

In [None]:
Dataset.groupby('Label')['Passive'].value_counts()

In [None]:
Dataset_lora.groupby('Label')['Passive'].value_counts()

#### 2. Comparative & Superlative

In [None]:
import re

def parse_tree_structure(tree_string):
    """Convert the text form of a parse tree into nested dictionaries.

    The text is scanned for word tokens (runs of word characters) and braces.
    Word tokens are consumed in key/value pairs; "key {" opens a nested dict
    that is appended to the list stored under that key in the enclosing dict.
    A key still waiting for its value when "}" is reached is stored with None.
    Because only word characters form tokens, a value such as -1.5 is read as
    several tokens.
    """
    open_nodes = []         # (enclosing dict, key it was opened under)
    node = {}
    pending_key = None

    for match in re.finditer(r'(\w+)|[{}]', tree_string):
        tok = match.group()

        if tok == '{':
            open_nodes.append((node, pending_key))
            node, pending_key = {}, None
        elif tok == '}':
            if pending_key:
                node[pending_key] = None
            if open_nodes:
                parent, key_in_parent = open_nodes.pop()
                parent.setdefault(key_in_parent, []).append(node)
                node, pending_key = parent, None
        elif pending_key is None:
            pending_key = tok
        else:
            node[pending_key] = tok
            pending_key = None

    return node



def Comparative_Superlative(tree_string):
    """Label a parse tree by the first comparative/superlative tag found.

    The tree text is parsed with parse_tree_structure and walked with an
    explicit stack. The first node value met during that walk decides:

    Returns
    -------
    str
        "Comparative" for JJR / RBR, "Superlative" for JJS / RBS,
        "Not" if neither occurs.
    """
    # Parse the string representation of the tree into a dictionary
    pending = [parse_tree_structure(tree_string)]

    while pending:
        current = pending.pop()

        if not isinstance(current, dict):
            continue

        for key, value in current.items():
            if key == 'value':
                if value in ('JJR', 'RBR'):
                    return "Comparative"
                if value in ('JJS', 'RBS'):
                    return "Superlative"
            elif key == 'child' and isinstance(value, list):
                # Only real child lists are followed. The parser can also
                # produce a 'child' key holding None (a dangling key taken from
                # a leaf's text), and extending the stack with that raised
                # TypeError.
                pending.extend(value)

    return "Not"

In [None]:
# Apply the function to the 'parse_tree' column
Dataset.loc[:, 'Compare_Super'] = Dataset['parse_tree'].apply(Comparative_Superlative)

In [None]:
# Apply the function to the 'parse_tree' column
Dataset_lora.loc[:, 'Compare_Super'] = Dataset_lora['parse_tree'].apply(Comparative_Superlative)

In [None]:
Dataset.groupby('Label')['Compare_Super'].value_counts()

In [None]:
Dataset_lora.groupby('Label')['Compare_Super'].value_counts()

#### 3. Search for CONJP

In [None]:
import pandas as pd
import json

# Function to check if 'CONJP' exists in the 'parsstree' JSON
def contains_conjp(json_str):
    """Return True if the text 'CONJP' occurs anywhere in the parse tree string.

    This is a plain substring test; the argument is not decoded as JSON, so
    the former `except json.JSONDecodeError` branch could never run and has
    been removed.
    """
    return 'CONJP' in json_str


In [None]:
# Apply the function to the 'parse_tree' column
Dataset.loc[:, 'CONJP'] = Dataset['parse_tree'].apply(contains_conjp)

In [None]:
Dataset_lora.loc[:, 'CONJP'] = Dataset_lora['parse_tree'].apply(contains_conjp)

In [None]:
Dataset.groupby('Label')['CONJP'].value_counts()

In [None]:
Dataset_lora.groupby('Label')['CONJP'].value_counts()

#### 4. Imperative Sentences

In [None]:
#!pip install spacy

In [None]:
!python -m spacy download en_core_web_sm

In [None]:
import spacy
import ast

def imperative_detector(pos):
    """Heuristic imperative check on a list of POS tags.

    Returns True when the first tag is "VB" and the last tag is ".";
    False otherwise, including for an empty tag list (which previously raised
    IndexError).
    """
    if len(pos) == 0:
        return False
    return pos[0] == "VB" and pos[-1] == "."

In [None]:
# Apply the function to the 'pos' column
Dataset.loc[:, 'Imperative'] = Dataset['pos'].apply(imperative_detector)

In [None]:
Dataset_lora.loc[:, 'Imperative'] = Dataset_lora['pos'].apply(imperative_detector)

In [None]:
Dataset['Imperative'].value_counts()

In [None]:
Dataset_lora['Imperative'].value_counts()

#### 5. Nominal Subject

In [None]:
'''import spacy

def Nsubj(sentence):

    doc = client.annotate(sentence)
    edges = doc.sentence[0].enhancedPlusPlusDependencies.edge


    # 0 -> There is not a nsubj
    # 1 -> nsubj is pronoun
    # 2 -> nsubj is noun

    for edge in edges:
        if edge.dep == "nsubj":
            index = edge.target - 1
            if doc.sentence[0].token[index].pos == "PRP":
                return 1 #pronoun
            else:
                return 2 #noun
    return 0 #not'''
pass

In [None]:
'''# Apply the function to the 'sentence' column
Dataset.loc[:, 'Nsubj'] = Dataset['Sentence'].apply(Nsubj)'''
pass

In [None]:
import spacy

def Nsubj(deps_list, pos):
    """Classify the first nominal subject of a sample.

    Parameters
    ----------
    deps_list : list of dict
        Dependency edges; the "dep" and "target" entries are used, where
        "target" is the 1-based position of the dependent token.
    pos : list of str
        POS tags of the tokens.

    Returns
    -------
    int
        0 -> there is no nsubj edge (or the sample has a single token)
        1 -> the nsubj is a pronoun (tag "PRP")
        2 -> the nsubj is anything else (treated as a noun)
    """
    # The per-row debug print of len(deps_list) was removed; it wrote one line
    # for every sample when applied to a whole DataFrame.
    if len(pos) == 1:
        return 0

    for dep_info in deps_list:
        # .get(): an edge without a "dep" entry is skipped instead of raising.
        if dep_info.get("dep") == "nsubj":
            index = dep_info["target"] - 1
            return 1 if pos[index] == "PRP" else 2

    return 0

In [None]:
Dataset["Nsubj"] = Dataset.apply(lambda row: Nsubj(row["dependency"], row["pos"]), axis=1)
Dataset_lora["Nsubj"] = Dataset_lora.apply(lambda row: Nsubj(row["dependency"], row["pos"]), axis=1)

In [None]:
Dataset['Nsubj'] = Dataset['Nsubj'].astype(int)

In [None]:
Dataset_lora['Nsubj'] = Dataset_lora['Nsubj'].astype(int)

In [None]:
Dataset.groupby('Label')['Nsubj'].value_counts()

In [None]:
Dataset_lora.groupby('Label')['Nsubj'].value_counts()

#### Prepositional Phrase

In [None]:
import re

# Function to convert the custom tree structure to a dictionary

def parse_tree_structure(tree_string):
    # Converts the text form of a parse tree into nested dictionaries.
    # NOTE(review): this re-defines parse_tree_structure, which is already
    # defined earlier in the notebook (there with an inner parse_node helper);
    # this later definition replaces the earlier one once the cell has run.
    stack = []            # (enclosing dict, key it was opened under)
    current_node = {}
    current_key = None    # key that is still waiting for its value
    current_value = None

    # Tokens are runs of word characters or single braces; everything else
    # (quotes, colons, signs, dots) is skipped, so e.g. -1.5 yields "1" and "5".
    for token in re.finditer(r'(\w+)|[{}]', tree_string):
        if token.group() == '{':
            # "key {" opens a nested dict; remember where it belongs.
            stack.append((current_node, current_key))
            current_node = {}
            current_key = None
        elif token.group() == '}':
            # A key without a value at the end of a node is stored as None
            # (current_value is always None at this point).
            if current_key:
                current_node[current_key] = current_value
            if stack:
                # Append the finished dict to the list kept under its key
                # in the enclosing dict, then continue in the enclosing dict.
                parent_node, parent_key = stack.pop()
                if parent_key not in parent_node:
                    parent_node[parent_key] = []
                parent_node[parent_key].append(current_node)
                current_node = parent_node
                current_key = None
        else:
            # Word tokens alternate: first the key, then its value.
            if current_key is None:
                current_key = token.group()
            else:
                current_value = token.group()
                current_node[current_key] = current_value
                current_key = None
                current_value = None
    return current_node

In [None]:
def find_leaves_with_pp_ancestor(tree, parent_has_pp=False):
    """Collect the values of all leaves that sit at or below a 'PP' node.

    `tree` is a nested dict with an optional 'value' and an optional 'child'
    list. Leaves without a value, or with the value '342', are skipped.
    Returns the leaf values as a list, in traversal order.
    """
    label = tree.get('value')
    under_pp = parent_has_pp or label == 'PP'

    children = tree.get('child')
    if isinstance(children, list):
        collected = []
        for child in children:
            collected += find_leaves_with_pp_ancestor(child, under_pp)
        return collected

    # Leaf node: keep it only when some ancestor (or the leaf itself) is a PP.
    if under_pp and label is not None and label != '342':
        return [label]
    return []

In [None]:
def percentage_of_PP(tree_string,words):
    """Share of words that lie inside a prepositional phrase.

    Counts the leaves found by find_leaves_with_pp_ancestor in the parsed tree
    and divides by `words`; the result is rounded to two decimals.
    """
    pp_leaves = find_leaves_with_pp_ancestor(parse_tree_structure(tree_string))
    share = len(pp_leaves)/words
    return round(share, 2)

In [None]:
# Running on all of the samples:

# The two input columns are walked in parallel. The previous
# `Dataset.loc[i][...]` lookups built a complete row Series twice per sample
# and only worked with a 0..n-1 index.
Dataset['PP_Percent'] = pd.Series(
    [
        percentage_of_PP(tree_string, n_words)
        for tree_string, n_words in zip(Dataset['parse_tree'], Dataset['Words'])
    ],
    index=Dataset.index,
    dtype=float,
)

In [None]:
# Running on all of the samples:

# The two input columns are walked in parallel. The previous
# `Dataset_lora.loc[i][...]` lookups built a complete row Series twice per
# sample and only worked with a 0..n-1 index.
Dataset_lora['PP_Percent'] = pd.Series(
    [
        percentage_of_PP(tree_string, n_words)
        for tree_string, n_words in zip(Dataset_lora['parse_tree'], Dataset_lora['Words'])
    ],
    index=Dataset_lora.index,
    dtype=float,
)

In [None]:
# Checking the results:
smaler_than_0 = Dataset[Dataset['PP_Percent'] < 0].index
bigger_than_1 = Dataset[Dataset['PP_Percent'] > 1].index

print(len(smaler_than_0), len(bigger_than_1))

In [None]:
# Checking the results:
smaler_than_0 = Dataset_lora[Dataset_lora['PP_Percent'] < 0].index
bigger_than_1 = Dataset_lora[Dataset_lora['PP_Percent'] > 1].index

print(len(smaler_than_0), len(bigger_than_1))

In [None]:
# Function to find words based on the type of parent node
def find_leaves_with_pp_parent_type(tree, parent_tag=None, parent_of_pp=None):
    """Group the leaves below 'PP' nodes by the label of the PP's parent.

    For every leaf, the innermost enclosing 'PP' node decides the group: its
    parent label 'VP' or 'NP' selects the first or second list, any other
    (non-None) parent label the third. Leaves without a value, or with the
    value '342', are skipped.

    Returns (leaves_under_VP, leaves_under_NP, leaves_under_other).
    """
    label = tree.get('value')
    if label == 'PP':
        parent_of_pp = parent_tag

    under_vp, under_np, under_other = [], [], []

    children = tree.get('child')
    if isinstance(children, list):
        for child in children:
            vp_part, np_part, other_part = find_leaves_with_pp_parent_type(child, label, parent_of_pp)
            under_vp += vp_part
            under_np += np_part
            under_other += other_part
    elif label is not None and label != '342' and parent_of_pp is not None:
        # Leaf below some PP: file it under the PP's parent type.
        if parent_of_pp == 'VP':
            under_vp.append(label)
        elif parent_of_pp == 'NP':
            under_np.append(label)
        else:
            under_other.append(label)

    return under_vp, under_np, under_other

In [None]:
# Example usage:
sample = 0

leaves_vp, leaves_np, leaves_other = find_leaves_with_pp_parent_type(parse_tree_structure(Dataset.loc[sample]["parse_tree"]))

print("Leaves with PP parent under VP:", leaves_vp)
print("Leaves with PP parent under NP:", leaves_np)
print("Leaves with PP parent under other:", leaves_other)

In [None]:
def percentage_of_PP_for_VP_NP_Other(tree_string,words):
    """Shares of words inside PPs attached to a VP, an NP, or anything else.

    Uses the three leaf lists from find_leaves_with_pp_parent_type on the
    parsed tree; each list length is divided by `words` and rounded to two
    decimals. Returns the three shares as a tuple (VP, NP, other).
    """
    leaf_groups = find_leaves_with_pp_parent_type(parse_tree_structure(tree_string))
    return tuple(round(len(group)/words, 2) for group in leaf_groups)

In [None]:
percentage_of_PP_for_VP_NP_Other(Dataset.loc[sample]["parse_tree"],Dataset.loc[sample]["Words"])

In [None]:
percentage_of_PP_for_VP_NP_Other(Dataset_lora.loc[sample]["parse_tree"],Dataset_lora.loc[sample]["Words"])

In [None]:
# Running on all of the samples:

# One (VP, NP, other) triple per sample, computed in a single pass over the two
# input columns. The previous `Dataset.loc[i][...]` lookups built a complete
# row Series twice per sample and only worked with a 0..n-1 index.
_pp_shares = [
    percentage_of_PP_for_VP_NP_Other(tree_string, n_words)
    for tree_string, n_words in zip(Dataset['parse_tree'], Dataset['Words'])
]

for _position, _column in enumerate(['PP_VP_Percent', 'PP_NP_Percent', 'PP_O_Percent']):
    Dataset[_column] = pd.Series(
        [shares[_position] for shares in _pp_shares],
        index=Dataset.index,
        dtype=float,
    )

In [None]:
# Running on all of the samples:

# One (VP, NP, other) triple per sample, computed in a single pass over the two
# input columns. The previous `Dataset_lora.loc[i][...]` lookups built a
# complete row Series twice per sample and only worked with a 0..n-1 index.
_pp_shares_lora = [
    percentage_of_PP_for_VP_NP_Other(tree_string, n_words)
    for tree_string, n_words in zip(Dataset_lora['parse_tree'], Dataset_lora['Words'])
]

for _position, _column in enumerate(['PP_VP_Percent', 'PP_NP_Percent', 'PP_O_Percent']):
    Dataset_lora[_column] = pd.Series(
        [shares[_position] for shares in _pp_shares_lora],
        index=Dataset_lora.index,
        dtype=float,
    )

In [None]:
Dataset.loc[0,['PP_Percent', 'PP_VP_Percent', 'PP_NP_Percent', 'PP_O_Percent']]

In [None]:
Dataset_lora.loc[0,['PP_Percent', 'PP_VP_Percent', 'PP_NP_Percent', 'PP_O_Percent']]

In [None]:
"""# Make "Dependency" saveable (cahnge to string)

import pickle

bad_cols = []
for col in generated_texts_gpt3.columns:
    try:
        # test serializability of the whole column
        pickle.dumps(generated_texts_gpt3[col].tolist())
    except Exception as e:
        bad_cols.append(col)
bad_cols"""

pass

## Saving

In [None]:
Dataset.to_pickle("Dataset_generated_texts_gpt3_09.13.2025.pkl")
Dataset_lora.to_pickle("Dataset_generated_texts_gpt3_lora_09.13.2025.pkl")

# Analyzing Features

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
Dataset.columns

In [None]:
Dataset_lora.columns

In [None]:
columns_num =  ['Words',
                'depth_parstree',
                #'longest_word_length',
                'longest_path_parstree',
                'verb_percent',
                'adverb_percent',
                'noun_percent',
                'coord_percent',
                'subord_percent',
                'PRP_first_percent',
                'PRP_second_percent',
                'PRP_third_percent',
                'noun_phrase',
                'PP_Percent',
                'PP_VP_Percent',
                'PP_NP_Percent',
                'PP_O_Percent']

columns_cat =  ['questions',
                'questions_wh',
                'Passive',
                'Compare_Super',
                'CONJP',
                'Imperative',
                'Nsubj']

In [None]:
print("Numerical Data: \n")

# Fixed author order so that every figure uses the same colour per author.
author_order = [
    "Charles Dickens",
    "Jane Austen",
    "Louisa May Alcott",
    "Mark Twain",
    "Herman Melville"
]

# One density plot per numerical feature, split by author.
for col in columns_num:
    fig, ax = plt.subplots(figsize=(7, 5))
    sns.kdeplot(data=Dataset, x=col, hue='Author', fill=True,
                hue_order=author_order, ax=ax)
    ax.set_title('')
    ax.set_xlabel(col)
    ax.set_ylabel('Quantity')
    plt.show()

In [None]:
print("Numerical Data: \n")

# Fixed author order so that every figure uses the same colour per author.
author_order = [
    "Charles Dickens",
    "Jane Austen",
    "Louisa May Alcott",
    "Mark Twain",
    "Herman Melville"
]

# One density plot per numerical feature, split by author.
for col in columns_num:
    fig, ax = plt.subplots(figsize=(7, 5))
    sns.kdeplot(data=Dataset_lora, x=col, hue='Author', fill=True,
                hue_order=author_order, ax=ax)
    ax.set_title('')
    ax.set_xlabel(col)
    ax.set_ylabel('Quantity')
    plt.show()

In [None]:
"""print("\n Catgorical Data: \n")

for col in columns_cat:
    plt.figure(figsize=(7, 5))
    sns.histplot(data=Dataset, x=col, hue='Author', element='step', multiple='dodge')
    plt.title('')
    plt.xlabel(col)
    plt.ylabel('Quantity')
    plt.show()"""
pass