In [2]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer

from xgboost import XGBClassifier

In [3]:
# Data from https://huggingface.co/datasets/aadityaubhat/GPT-wiki-intro, applied this func to transform data
def make_into_dataset(df):
    mid = int(len(df) / 2)
    x  = df.iloc[:mid, :]['wiki_intro'].to_numpy()
    y = df.iloc[mid:, :]['generated_text'].to_numpy()

    new_df = {"text" : np.concatenate([x,y]), "ai_generated": np.concatenate([np.zeros(mid, dtype=int), np.ones(mid, dtype=int)])}
    new_df = pd.DataFrame(new_df).sample(frac=1).reset_index(drop=True)
    return new_df


In [4]:
df = pd.read_csv('../dataset/llm-classification-data.csv')
df

Unnamed: 0,text,ai_generated
0,Dr. Foss Westcott (23 October 186319 October 1...,0
1,The Blacksburg Electronic Village or BEV was c...,0
2,a retired French footballer who played as a m...,1
3,Japanese-language dictionary. It is based on ...,1
4,"drama film, produced and directed by K. Balac...",1
...,...,...
149995,"am Main – January 22, 1928 in Frankfurt am Ma...",1
149996,Shawnae Marie Dixon is an American professiona...,0
149997,an American professional basketball executive...,1
149998,Düsseldorf Baskets was a professional basketba...,0


In [23]:
def get_subset(df, x):
    class_0_data = df[df["ai_generated"] == 0]
    class_1_data = df[df["ai_generated"] == 1]

    min_samples = min(len(class_0_data), len(class_1_data), x)

    balanced_data = pd.concat([class_0_data.sample(min_samples), class_1_data.sample(min_samples)])
    balanced_data = balanced_data.sample(frac=1).reset_index(drop=True)

    return balanced_data

df = get_subset(df, 2000)
df

Unnamed: 0,text,ai_generated
0,Entwistle is an English surname. Notable peopl...,0
1,for buildings and structures in the UK. They ...,1
2,Henrietta Hutton (née Cooke) (1939–1963) was a...,0
3,"in 2001 by the advertising agency WPP, then k...",1
4,"Bartolomeu Perestrello (, in Italian Bartolome...",0
...,...,...
3995,Astley Abbotts is a village and civil parish i...,0
3996,Michael Brandon Lake is an American Christian ...,0
3997,Krishna Kumar Goyal (1932/1933 - 21 April 2013...,0
3998,is the Founder and President of the David Bri...,1


In [25]:
import re
from bs4 import BeautifulSoup
from tqdm import tqdm
from xgboost import train


def text_cleaning(text):
    '''
    Cleans text into a basic form for NLP. Operations include the following:-
    1. Remove special charecters like &, #, etc
    2. Removes extra spaces
    3. Removes embedded URL links
    4. Removes HTML tags
    5. Removes emojis
    
    text - Text piece to be cleaned.
    '''
    template = re.compile(r'https?://\S+|www\.\S+') #Removes website links
    text = template.sub(r'', text)
    
    soup = BeautifulSoup(text, 'lxml') #Removes HTML tags
    only_text = soup.get_text()
    text = only_text
    
    emoji_pattern = re.compile("["
                               u"\U0001F600-\U0001F64F"  # emoticons
                               u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                               u"\U0001F680-\U0001F6FF"  # transport & map symbols
                               u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                               u"\U00002702-\U000027B0"
                               u"\U000024C2-\U0001F251"
                               "]+", flags=re.UNICODE)
    text = emoji_pattern.sub(r'', text)
    
    text = re.sub(r"[^a-zA-Z\d]", " ", text) #Remove special Charecters
    text = re.sub(' +', ' ', text) #Remove Extra Spaces
    text = text.strip() # remove spaces at the beginning and at the end of string

    return text
tqdm.pandas()
df['text'] = df['text'].progress_apply(text_cleaning)

  soup = BeautifulSoup(text, 'lxml') #Removes HTML tags
100%|██████████| 4000/4000 [00:00<00:00, 6123.88it/s]


In [22]:
vec = TfidfVectorizer(min_df= 20,
                      max_df=0.05,
                      analyzer = 'char_wb',
                      ngram_range = (3,5),
                      max_features = 1000)
vec.fit(train['text'])
tfidf_matrix = vec.transform(train['text'])
tfidf_matrix

AttributeError: 'csr_matrix' object has no attribute 'lower'

In [9]:
max_corpus = 10000
X = np.pad(X, ((0, 0), (0, max_corpus - X.shape[1])), 'constant')

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, labels, test_size=0.2, random_state=42)

ValueError: index can't contain negative values

In [None]:
# Define the hyperparameters grid for XGBoost
param_grid = {
    'n_estimators': [100, 200],
    'learning_rate': [0.1, 0.2],
    'max_depth': [3, 5]
}

In [None]:
# Initialize the XGBoost classifier
xgb = XGBClassifier()

# Perform Grid Search with cross-validation
grid_search = GridSearchCV(xgb, param_grid, cv=3, scoring='accuracy')
grid_search.fit(X_train, y_train)

In [None]:
# Get the best parameters and model
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

# Predict on the test set using the best model
predictions = best_model.predict(X_test)
predictions

In [None]:
# Evaluate the best model
accuracy = accuracy_score(y_test, predictions)
roc = roc_auc_score(y_test, predictions)
print("Best Parameters:", best_params)
print("Accuracy:", accuracy)
print("ROC AUC:", roc)