In [2]:
import pandas as pd
import numpy as np
import re

import nltk
from nltk import word_tokenize, pos_tag
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from xgboost import XGBClassifier

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import accuracy_score, classification_report

#from wordcloud import WordCloud
#import matplotlib.pyplot as plt

try:
    from xgboost import XGBClassifier
except:
    import sys
    !{sys.executable} -m pip install xgboost
    print('Sikerült')


In [3]:
#nltk.download('punkt')
nltk.download('stopwords')
#nltk.download('wordnet')
#nltk.download('averaged_perceptron_tagger_eng')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

# Dataset

In [4]:
data = pd.read_csv('research_abstract_labeled.csv', sep=";", encoding='cp1252')
data.head()

Unnamed: 0,title,label,text,word_count
0,Study of coupling loss on bi-columnar BSCCO/Ag...,0,Coupling losses were studied in composite tape...,280
1,Study of coupling loss on bi-columnar BSCCO/Ag...,1,"In this study, we investigate the coupling los...",215
2,Weighted Solyanik estimates for the strong max...,0,Let $\mathsf M_{\mathsf S}$ denote the strong ...,332
3,Weighted Solyanik estimates for the strong max...,1,"In this paper, we investigate Weighted Solyani...",225
4,SOFIA-EXES Observations of Betelgeuse during t...,0,In 2019 October Betelgeuse began a decline in ...,268


# Preprocessing

In [5]:
# Itt kigyűjtöm a címeket, amelyikeknél vagy az AI vagy az ember szöveg tartalmaz dollárjelet
formula_titles = data[data.text.str.contains('$', regex=False, na=False)].drop_duplicates(subset=['title'], keep='first').title.tolist()

# Kifilterezem mind a két absztraktot ahol legalább az egyikben van dollárjel
filtered = data[data.title.isin(formula_titles) != True]

filtered.label.value_counts()

label
0    6400
1    6400
Name: count, dtype: int64

In [6]:
filtered.word_count.describe()

count    12800.000000
mean       191.880703
std         88.157998
min         29.000000
25%        120.000000
50%        182.000000
75%        260.000000
max        594.000000
Name: word_count, dtype: float64

In [None]:
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()


def get_wordnet_pos(tag):

    """
    Pos tagging

    """

    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN  


def preprocess_text(text: str):
    
    """
    cleaning, lemmatization and stopword removal

    """

    text = re.sub(r'[^\w\s]', '', text)

    text = re.sub(r'\n+', ' ', text).strip()

    tokens = word_tokenize(text.lower())
    
    pos_tags = pos_tag(tokens)

    lemmatized_tokens = [
        lemmatizer.lemmatize(word, get_wordnet_pos(tag))
        for word, tag in pos_tags
        if word not in stop_words
    ]


    return ' '.join(lemmatized_tokens)

# creating new var with clean texts
filtered['clean_text'] = filtered['text'].apply(preprocess_text)

#filtered.to_csv('clean_data.csv')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered['clean_text'] = filtered['text'].apply(preprocess_text)


In [None]:
# comparing raw and clean texts
for idx, row in filtered.head(100).iterrows():

    if row.label == 1:
        #print('Raw (AI):')
    
    else:
        #print('Raw (Human):')
      
    #print(row.text[0:200])
    #print('Clean:')   
    #print(row.clean_text[0:200])
    #print('________') 
    #print('')

Raw (Human):
In 2019 October Betelgeuse began a decline in V-band brightness that went beyond the minimum expected from its quasi-periodic ~420 day cycle, becoming the faintest in recorded photometric history. Obs
Clean:
2019 october betelgeuse begin decline vband brightness go beyond minimum expect quasiperiodic 420 day cycle become faint recorded photometric history observation obtain 2019 december vltsphere montarg
________

Raw (AI):
SOFIA-EXES Observations of Betelgeuse During the Great Dimming of 2019/2020

Betelgeuse, a red supergiant star in the constellation Orion, exhibited a significant dimming event starting in late 2019 t
Clean:
sofiaexes observation betelgeuse great dimming 20192020 betelgeuse red supergiant star constellation orion exhibit significant dimming event start late 2019 early 2020 phenomenon capture attention ama
________

Raw (Human):
Dust transport and deposition behind larger boulders on the comet 67P/Churyumov-Gerasimenko (67P/C-G) have been observed by th

In [9]:
filtered[filtered.apply(lambda row: row['text'].startswith(row['title']), axis=1)]['text'].iloc[8]

'Abelian regular coverings of the quaternion hypermap have been a topic of interest in graph theory and combinatorics for years. In this paper, we aim to provide a thorough analysis of the properties and behavior of these coverings. We introduce a new method to construct these coverings using projections onto a suitable plane. Through our analysis, we discover that the hypermaps subject to these coverings exhibit symmetries that are directly related to the covering group. Furthermore, we investigate the effect of various transformations on the quaternion hypermap, such as automorphisms and isomorphisms. Our findings reveal surprising relationships between different coverings and the underlying hypermap structure. Additionally, we discuss the connections between abelian regular coverings and other areas of mathematics, such as algebraic topology and group theory. The results presented in this paper contribute to a deeper understanding of the relationship between coverings and hypermaps 

# Model training

# Training

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score, StratifiedKFold, train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from xgboost import XGBClassifier
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics import accuracy_score, classification_report
import numpy as np
import copy

# vectorization configs
vectorizer_configs = [
    ('tfidf', (1, 1)),
    ('tfidf', (1, 2)),
    ('tfidf', (2, 2)),
    ('count', (1, 1)),
    ('count', (1, 2)),
    ('count', (2, 2)),
]

# Models
models = {
    'logreg': LogisticRegression(max_iter=1000),
    'nbayes': MultinomialNB(),
    'xgboost': XGBClassifier(eval_metric='logloss')
}

# Train-test split
X_train_texts, X_test_texts, y_train, y_test = train_test_split(
    filtered['clean_text'], filtered['label'], test_size=0.2, random_state=15, stratify=filtered['label']
)

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=15)

# Storage dictionary
trained_models = {}


for vec_type, ngram_range in vectorizer_configs:
    print(f'\n=== Vectorizer: {vec_type} | N-gram: {ngram_range} ===')

    VectorizerClass = TfidfVectorizer if vec_type == 'tfidf' else CountVectorizer
    

    # Models
    for model_name, model in models.items():
        print(f'\nModel: {model_name}')


        # vectorization
        vectorizer = VectorizerClass(ngram_range=ngram_range, max_df=0.8, min_df=20)

        pipeline = Pipeline([
            ('vectorizer', vectorizer),
            ('model', model)
        ])

        # CV
        cv_scores = cross_val_score(
            pipeline,
            X_train_texts,
            y_train,
            cv=cv,
            scoring='accuracy',
            n_jobs=-1
        )

        mean_cv = np.mean(cv_scores)
        std_cv = np.std(cv_scores)

        # Refitting on training split
        pipeline.fit(X_train_texts, y_train)
        y_pred = pipeline.predict(X_test_texts)

        test_acc = accuracy_score(y_test, y_pred)
        clf_report = classification_report(y_test, y_pred, output_dict=True)
        vocab_size = len(pipeline.named_steps['vectorizer'].vocabulary_)

        print(f"| CV Acc: {mean_cv:.3f} ± {std_cv:.3f} | Test Acc: {test_acc:.3f} | Features: {vocab_size}")

        # modellek mentése mert később kellenek egyéb predikciókhoz
        key = f"{model_name}_{vec_type}_{ngram_range}"
        trained_models[key] = {
            'pipeline': copy.deepcopy(pipeline),
            'vectorizer_type': vec_type,
            'ngram_range': ngram_range,
            'cv_mean_accuracy': mean_cv,
            'cv_std': std_cv,
            'test_accuracy': test_acc,
            'classification_report': clf_report,
            'vocab_size': vocab_size,
            'y_pred': y_pred,                       
            'y_true': y_test.values                 
        }
        

        del pipeline




=== Vectorizer: tfidf | N-gram: (1, 1) ===

Model: logreg
| CV Acc: 0.946 ± 0.004 | Test Acc: 0.943 | Features: 3885

Model: nbayes
| CV Acc: 0.901 ± 0.009 | Test Acc: 0.889 | Features: 3885

Model: xgboost
| CV Acc: 0.948 ± 0.004 | Test Acc: 0.946 | Features: 3885

=== Vectorizer: tfidf | N-gram: (1, 2) ===

Model: logreg
| CV Acc: 0.953 ± 0.004 | Test Acc: 0.955 | Features: 6860

Model: nbayes
| CV Acc: 0.939 ± 0.005 | Test Acc: 0.942 | Features: 6860

Model: xgboost
| CV Acc: 0.949 ± 0.003 | Test Acc: 0.951 | Features: 6860

=== Vectorizer: tfidf | N-gram: (2, 2) ===

Model: logreg
| CV Acc: 0.929 ± 0.006 | Test Acc: 0.923 | Features: 2975

Model: nbayes
| CV Acc: 0.887 ± 0.006 | Test Acc: 0.886 | Features: 2975

Model: xgboost
| CV Acc: 0.913 ± 0.004 | Test Acc: 0.917 | Features: 2975

=== Vectorizer: count | N-gram: (1, 1) ===

Model: logreg
| CV Acc: 0.942 ± 0.003 | Test Acc: 0.948 | Features: 3885

Model: nbayes
| CV Acc: 0.919 ± 0.006 | Test Acc: 0.906 | Features: 3885

Model:

In [11]:
for key in ['logreg', 'nbayes', 'xgboost']:

    print(key)
    print(f'accuracy: {trained_models[f'{key}_count_(1, 2)']['classification_report']['accuracy']}')
    print(f'{trained_models[f'{key}_count_(1, 2)']['classification_report']['1']}')
    print('')

logreg
accuracy: 0.958203125
{'precision': 0.9643705463182898, 'recall': 0.9515625, 'f1-score': 0.9579237121510028, 'support': 1280.0}

nbayes
accuracy: 0.94140625
{'precision': 0.9366306027820711, 'recall': 0.946875, 'f1-score': 0.9417249417249417, 'support': 1280.0}

xgboost
accuracy: 0.95546875
{'precision': 0.9619651347068146, 'recall': 0.9484375, 'f1-score': 0.955153422501967, 'support': 1280.0}



# comparing wrong predictions

In [22]:
# Filter only the models with CountVectorizer and (1,2) n-gram
target_models = ['logreg_count_(1, 2)', 'nbayes_count_(1, 2)', 'xgboost_count_(1, 2)']

# Extract predictions and true labels
preds = [trained_models[model_key]['y_pred'] for model_key in target_models]
true_labels = trained_models[target_models[0]]['y_true']  # same for all

# Convert to numpy array for vectorized comparison
preds_array = np.array(preds)  # shape: (3, num_samples)
true_array = np.array(true_labels)  # shape: (num_samples,)

# Transpose preds_array so that shape becomes (num_samples, 3)
preds_array = preds_array.T

# Now compare
correct_by_all = np.all(preds_array == true_array[:, None], axis=1)
wrong_by_all = np.all(preds_array != true_array[:, None], axis=1)
disagreement = ~(correct_by_all | wrong_by_all)

# Count each category
num_correct_all = np.sum(correct_by_all)
num_wrong_all = np.sum(wrong_by_all)
num_disagreement = np.sum(disagreement)

print(f" Correct by all three: {num_correct_all}")
print(f" Misclassified by all three: {num_wrong_all}")
print(f" Disagreement among models: {num_disagreement}")



 Correct by all three: 2322
 Misclassified by all three: 42
 Disagreement among models: 196


# Testing with texts generated by other LLM-s

In [16]:
llm_data = pd.read_csv("other_llm_test_data/llm_final_data_2.csv", sep=";", encoding='cp1252')

In [17]:
llm_data['clean_text'] = llm_data['text'].astype(str).apply(preprocess_text)

In [18]:
llm_data.clean_text

0      propargyl radical c3h3 represent critical reac...
1      study address methodological challenge estimat...
2      study investigate intermittency phenomenon two...
3      comprehensive study investigate complex phenom...
4      research present comprehensive computational m...
                             ...                        
115    study present exhaustive analysis charge occup...
116    report multiple crossing landau level double h...
117    study investigate role nonorthogonal wavefunct...
118    proliferation networked technology give rise u...
119    present first deep observation local interstel...
Name: clean_text, Length: 120, dtype: object

In [19]:
import pandas as pd
from sklearn.metrics import classification_report

model_keys = ['logreg_count_(1, 2)', 'nbayes_count_(1, 2)', 'xgboost_count_(1, 2)']

for key in model_keys:


    pipeline_llm = trained_models[key]['pipeline']


    llm_data['clean_text'] = llm_data['clean_text'].fillna("").astype(str)
    preds = pipeline_llm.predict(llm_data['clean_text'])
    probs = pipeline_llm.predict_proba(llm_data['clean_text'])

    llm_data[f'predicted_label_{key}'] = preds
    llm_data[f'prediction_prob_{key}'] = probs.max(axis=1) 


    #print(key)
    #print(classification_report(llm_data['label'], preds))


In [20]:
llm_data.groupby('author')['predicted_label_logreg_count_(1, 2)'].value_counts(normalize=True).unstack()

"predicted_label_logreg_count_(1, 2)",0,1
author,Unnamed: 1_level_1,Unnamed: 2_level_1
Claude 3.5 Haiku,,1.0
DeepSeek-V3,0.333333,0.666667
Gemini 2.0 Flash,0.033333,0.966667
llama3.2,0.066667,0.933333


In [21]:
llm_data.groupby('author')['predicted_label_nbayes_count_(1, 2)'].value_counts(normalize=True).unstack()

"predicted_label_nbayes_count_(1, 2)",0,1
author,Unnamed: 1_level_1,Unnamed: 2_level_1
Claude 3.5 Haiku,,1.0
DeepSeek-V3,0.633333,0.366667
Gemini 2.0 Flash,0.033333,0.966667
llama3.2,0.066667,0.933333


In [None]:
llm_data.groupby('author')['predicted_label_xgboost_count_(1, 2)'].value_counts(normalize=True).unstack()

"predicted_label_xgboost_count_(1, 2)",0,1
author,Unnamed: 1_level_1,Unnamed: 2_level_1
Claude 3.5 Haiku,,1.0
DeepSeek-V3,0.3,0.7
Gemini 2.0 Flash,0.166667,0.833333
llama3.2,0.1,0.9


In [None]:
from collections import Counter
import numpy as np

llms = llm_data.author.unique().tolist()
top_ngrams = pd.DataFrame()

for llm in llms:
    # Get the correct vectorizer from your trained pipeline
    vectorizer = trained_models[key]['pipeline'].named_steps['vectorizer']

    # Transform your data into document-term matrix
    X_vec = vectorizer.transform(llm_data[llm_data.author == llm]['clean_text'])

    # Sum frequencies across all documents
    token_counts = np.array(X_vec.sum(axis=0)).flatten()

    # Map feature indices to actual n-gram strings
    vocab = vectorizer.get_feature_names_out()
    ngram_freq = list(zip(vocab, token_counts))

    # Sort by frequency
    ngram_freq_sorted = sorted(ngram_freq, key=lambda x: x[1], reverse=True)

    top_ngrams[llm] = ngram_freq_sorted[:30]
    # Display top N
    top_n = 20
    #for ngram, count in ngram_freq_sorted[:top_n]:
        #print(f"{ngram}: {int(count)}")

In [None]:
top_ngrams.iloc[:30]

Unnamed: 0,Claude 3.5 Haiku,DeepSeek-V3,Gemini 2.0 Flash,llama3.2
0,"(quantum, 40)","(system, 28)","(galaxy, 30)","(galaxy, 36)"
1,"(complex, 33)","(provide, 26)","(provide, 24)","(demonstrate, 27)"
2,"(technique, 33)","(demonstrate, 25)","(gas, 23)","(study, 25)"
3,"(mechanism, 32)","(model, 25)","(investigate, 20)","(system, 25)"
4,"(provide, 31)","(result, 24)","(property, 20)","(result, 24)"
5,"(analysis, 30)","(show, 24)","(potential, 19)","(data, 21)"
6,"(system, 30)","(analysis, 19)","(study, 19)","(structure, 21)"
7,"(demonstrate, 29)","(galaxy, 19)","(analyze, 18)","(model, 20)"
8,"(dynamic, 29)","(framework, 18)","(distribution, 17)","(novel, 20)"
9,"(galaxy, 29)","(reveal, 18)","(molecular, 17)","(present, 20)"
