In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from textstat.textstat import textstatistics
from nltk.tokenize import sent_tokenize, word_tokenize
from gensim.models import Word2Vec, KeyedVectors
import nltk
import pickle

In [2]:
frame = pd.read_csv('data.csv')
frame

Unnamed: 0,text,source,prompt_id,text_length,word_count
0,"Federal law supersedes state law, and cannabis...",Bloom-7B,0,967,157
1,Miles feels restless after working all day. He...,Bloom-7B,0,5068,778
2,So first of I am danish. That means that I fol...,Bloom-7B,0,1602,267
3,In this paper we present a novel rule-based ap...,Bloom-7B,0,5469,848
4,"Most social progressives, love democracy, and ...",Bloom-7B,0,2379,380
...,...,...,...,...,...
788917,"\nIn the vast expanse of time, where the echoe...",YI-34B,1293,5523,870
788918,"\nThe phenomenon of brain drain, particularly ...",YI-34B,1154,4540,677
788919,\nThe Influence of Climate Change on Marine Ec...,YI-34B,2783,3889,598
788920,\nTitle: The Case for Limiting Car Usage: Navi...,YI-34B,41,3560,533


# Data Pre-processing and Feature Extraction

In [3]:
prompts_df = pd.read_csv('prompts.csv')
data_df = pd.read_csv('data.csv')
# Merging the DataFrames on 'prompt_id'
df_merged = pd.merge(data_df, prompts_df, left_on='prompt_id', right_on='Prompt ID', how='left')

# Optionally, if you don't need the 'Prompt ID' column anymore, you can drop it
df_merged.drop('prompt_id', axis=1, inplace=True)
df_merged.drop('Prompt ID', axis=1, inplace=True)
df_merged.head()

Unnamed: 0,text,source,text_length,word_count,Prompt
0,"Federal law supersedes state law, and cannabis...",Bloom-7B,967,157,Undefined
1,Miles feels restless after working all day. He...,Bloom-7B,5068,778,Undefined
2,So first of I am danish. That means that I fol...,Bloom-7B,1602,267,Undefined
3,In this paper we present a novel rule-based ap...,Bloom-7B,5469,848,Undefined
4,"Most social progressives, love democracy, and ...",Bloom-7B,2379,380,Undefined


In [4]:
df = df_merged
df.shape

(788922, 5)

# Feature Extraction using Word2Vec

In [5]:
# Ensure nltk resources are downloaded (needed for tokenization)
nltk.download('punkt')

def lexical_diversity(text):
    tokens = word_tokenize(text)
    return len(set(tokens)) / len(tokens) if tokens else 0

def average_sentence_length(text):
    sentences = sent_tokenize(text)
    tokens = word_tokenize(text)
    return len(tokens) / len(sentences) if sentences else 0

def compute_readability_score(text):
    return textstatistics().flesch_reading_ease(text)

def compute_text_length_metrics(text):
    chars = len(text)
    words = len(word_tokenize(text))
    sentences = len(sent_tokenize(text))
    return chars, words, sentences

def compute_prompt_features(prompt):
    prompt_length = len(prompt)
    return prompt_length

def extract_nlp_embeddings(text, model):
    tokens = word_tokenize(text.lower())
    tokens_in_model = [word for word in tokens if word in model.key_to_index]
    if not tokens_in_model:
        return np.zeros(model.vector_size)
    embeddings = np.mean([model.get_vector(word) for word in tokens_in_model], axis=0)
    return embeddings

def add_features_to_df(df, model, pca):
    df['lexical_diversity'] = df['text'].apply(lexical_diversity)
    df['avg_sentence_length'] = df['text'].apply(average_sentence_length)
    df['readability_score'] = df['text'].apply(compute_readability_score)
    
    text_length_metrics = df['text'].apply(compute_text_length_metrics)
    df['char_count'], df['word_count'], df['sentence_count'] = zip(*text_length_metrics)
    
    df['prompt_length'] = df['Prompt'].apply(compute_prompt_features)
    
    df['classification'] = df['source'].apply(lambda x: 0 if x == 'Human' else 1)
    
    embeddings = np.array([extract_nlp_embeddings(text, model) for text in df['text']])
    reduced_embeddings = pca.transform(embeddings)
    
    for i in range(reduced_embeddings.shape[1]):
        df[f'embedding_{i}'] = reduced_embeddings[:, i]
    
    return df

# Load the Word2Vec model
model = KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)

# Assuming df is your DataFrame containing the text data
# Extract embeddings for PCA fitting
embeddings = np.array([extract_nlp_embeddings(text, model) for text in df['text']])

# Fit PCA on the embeddings
pca = PCA(n_components=50)
pca.fit(embeddings)

# Save the fitted PCA model
with open('pca.pkl', 'wb') as file:
    pickle.dump(pca, file)

# Now, transform your dataset
df_transformed = add_features_to_df(df, model, pca)

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/cibhibaskar/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


# Save the final transformed data to a csv file.

In [6]:
df_transformed.to_csv('corpus1.csv')

In [7]:
final_df = pd.read_csv('corpus1.csv')
final_df.head()

Unnamed: 0.1,Unnamed: 0,text,source,text_length,word_count,Prompt,lexical_diversity,avg_sentence_length,readability_score,char_count,...,embedding_40,embedding_41,embedding_42,embedding_43,embedding_44,embedding_45,embedding_46,embedding_47,embedding_48,embedding_49
0,0,"Federal law supersedes state law, and cannabis...",Bloom-7B,967,181,Undefined,0.80663,25.857143,57.3,967,...,-0.029518,-0.016951,-0.032117,-0.044339,0.0293,-0.015795,-0.044024,0.020272,-0.027501,0.014064
1,1,Miles feels restless after working all day. He...,Bloom-7B,5068,924,Undefined,0.661255,23.1,53.21,5068,...,-0.006635,0.0369,0.017577,-0.005283,0.028798,0.000743,0.032611,0.042589,0.047794,-0.021118
2,2,So first of I am danish. That means that I fol...,Bloom-7B,1602,316,Undefined,0.718354,22.571429,61.97,1602,...,0.003265,0.007507,0.005782,-0.01057,0.009538,0.009079,-0.002349,-9.8e-05,-0.005189,0.001135
3,3,In this paper we present a novel rule-based ap...,Bloom-7B,5469,1015,Undefined,0.564532,40.6,27.86,5469,...,-0.024777,-0.00817,-0.016804,-0.005168,-0.006794,-0.002699,0.004904,-0.00505,-0.010434,-0.004464
4,4,"Most social progressives, love democracy, and ...",Bloom-7B,2379,437,Undefined,0.75286,23.0,61.67,2379,...,0.013447,-0.010881,0.001363,-0.017231,0.002769,-0.008431,0.011807,-0.013886,-0.011405,0.025637
