Main TFIDF idea and ngrams setup taken from: https://www.kaggle.com/code/hubert101/0-930-phrases-are-keys/notebook

Dataset
https://www.kaggle.com/datasets/narsil/daigt-misc


# Importing libraries

In [1]:
!pip install /kaggle/input/pyspellchecker3/pyspellchecker-0.7.2-py3-none-any.whl

Processing /kaggle/input/pyspellchecker3/pyspellchecker-0.7.2-py3-none-any.whl
Installing collected packages: pyspellchecker
Successfully installed pyspellchecker-0.7.2


In [2]:
import numpy as np
import pandas as pd
import string
from tqdm.notebook import tqdm
from sklearn.linear_model import LogisticRegression
from spellchecker import SpellChecker
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import SGDClassifier

# Importing files and Feature Engineering

In [3]:
# Fixed seed so the sampled rows (and therefore every downstream model) are
# identical across "Restart & Run All" executions.
SEED = 42

train = pd.read_csv("/kaggle/input/train-v2-drcat-02-2/train_v2_drcat_02.csv", sep=',')

# Extra label == 1 rows, drawn from the full file before it is filtered below.
train1 = train[train.label == 1].sample(8050, random_state=SEED)

# Keep only the rows flagged RDizzl3_seven, then append the sampled rows.
# NOTE(review): rows that are both label == 1 and RDizzl3_seven can appear twice
# after the concat - confirm that this oversampling is intended.
train = train[train.RDizzl3_seven == True].reset_index(drop=True)
# ignore_index=True gives the combined frame a clean 0..n-1 index instead of
# repeating the index labels of the two inputs.
train = pd.concat([train, train1], ignore_index=True)
print(train.label.value_counts())

# Newlines are deleted outright (not replaced by a space), identically for train and test.
train['text'] = train['text'].str.replace('\n', '')

test = pd.read_csv('/kaggle/input/llm-detect-ai-generated-text/test_essays.csv')
test['text'] = test['text'].str.replace('\n', '')

# All train and test texts in one Series.
df = pd.concat([train['text'], test['text']]).reset_index(drop=True)

label
0    14250
1    14250
Name: count, dtype: int64


## TFIDF

In [4]:
# TF-IDF features over 3- to 5-grams with sublinear term-frequency scaling;
# every other option is left at the library default.
vectorizer1 = TfidfVectorizer(
    ngram_range=(3, 5),
    sublinear_tf=True,
)
# Learn the vocabulary on the training texts and vectorize them in one pass.
X = vectorizer1.fit_transform(train['text'])

# Defining models

In [5]:
# The two base estimators that are combined by the voting classifier below.
sgd_model = SGDClassifier(
    max_iter=1000,
    tol=1e-3,
    loss="modified_huber",
)
lr_model = LogisticRegression(
    solver="liblinear",
)

# Voting Classifier

In [6]:
# Soft voting over the logistic-regression and SGD models.
base_estimators = [
    ('lr', lr_model),
    ('sgd', sgd_model),
]
ensemble = VotingClassifier(estimators=base_estimators, voting='soft')
ensemble.fit(X, train.label)

In [7]:
# Vectorize the test essays with the vocabulary fitted on train, then keep
# the second probability column (index 1) from the ensemble.
X_test = vectorizer1.transform(test['text'])
preds = ensemble.predict_proba(X_test)[:, 1]

## Spelling model

### Spelling mistakes and syllable count

In [8]:
spell = SpellChecker()
# Translation table that deletes every character listed in string.punctuation.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def count_spelling_mistakes(sentence):
    """Count the words of ``sentence`` that the spell checker does not know.

    Punctuation is stripped first, the remainder is split on whitespace, and
    the tokens are handed to ``spell.unknown``.

    Args:
        sentence: Text to check.

    Returns:
        The size of the collection returned by ``spell.unknown``.
    """
    tokens = sentence.translate(_PUNCTUATION_TABLE).split()
    return len(spell.unknown(tokens))

# Enable ``progress_apply`` on pandas objects so the slow spell check shows a progress bar.
tqdm.pandas()
count_mistakes = train['text'].progress_apply(count_spelling_mistakes)
# log1p-transform the counts and drop the original index so the feature
# lines up by position with the other feature columns.
mistakes = np.log1p(count_mistakes).reset_index(drop=True)

  0%|          | 0/28500 [00:00<?, ?it/s]

In [9]:
def syllable_count(word):
    """Estimate the number of syllables in ``word`` with a vowel-group heuristic.

    Each maximal run of the letters ``aeiouy`` counts as one syllable, a
    trailing ``e`` removes one, and any non-empty word has at least one.

    Args:
        word: A single token; case is ignored.

    Returns:
        The estimated syllable count: 0 for an empty string, otherwise >= 1.
    """
    word = word.lower()
    if not word:
        # Nothing to count; previously this case raised IndexError on word[0].
        return 0

    vowels = "aeiouy"
    count = 0
    previous_was_vowel = False
    for char in word:
        is_vowel = char in vowels
        # Only the first vowel of a consecutive run starts a new syllable.
        if is_vowel and not previous_was_vowel:
            count += 1
        previous_was_vowel = is_vowel

    if word.endswith("e"):
        # A trailing "e" is treated as not adding a syllable.
        count -= 1
    # Every non-empty word is reported with at least one syllable.
    return max(count, 1)

# Returns the number of words in `text` that have at least n syllables.
# The idea is to find the complicated words, hoping that AI uses more complex words.
def nr_syllables_lr(n, text):
    """Count the whitespace-separated words of ``text`` with ``syllable_count(word) >= n``."""
    long_words = 0
    for token in text.split():
        if syllable_count(token) >= n:
            long_words += 1
    return long_words

# Fraction of the words in a text that have exactly 2 syllables.
def procent_small_syllables(text):
    """Return the share of whitespace-separated words in ``text`` with exactly two syllables.

    Args:
        text: The text to analyse.

    Returns:
        A float in [0, 1]; 0.0 when ``text`` contains no words (empty or
        whitespace-only), which previously raised ZeroDivisionError.
    """
    words = text.split()
    if not words:
        return 0.0
    two_syllable_words = sum(1 for word in words if syllable_count(word) == 2)
    return two_syllable_words / len(words)




In [10]:
# Enable ``progress_apply`` on pandas objects for the progress bar.
tqdm.pandas()

# Number of words per essay with at least 4 syllables; index dropped so the
# feature lines up by position with the other feature columns.
count_syllables = (
    train['text']
    .progress_apply(lambda text: nr_syllables_lr(4, text))
    .reset_index(drop=True)
)

# Share of exactly-two-syllable words per essay, with the index dropped likewise.
small_words = train['text'].apply(procent_small_syllables).reset_index(drop=True)

  0%|          | 0/28500 [00:00<?, ?it/s]

In [11]:
# Feature frame for the second (spelling/syllable) model, one column per feature.
train2 = pd.DataFrame(
    {
        'mistakes': mistakes,
        'largeWords': count_syllables,
        'smallWords': small_words,
    }
)

In [12]:
# Logistic regression on the three handcrafted features, fitted against the training labels.
lr2 = LogisticRegression()
lr2.fit(train2, train['label'])

In [13]:
# Build the test features exactly as the training features in `train2` were built.
# The training `mistakes` column is np.log1p of the raw count, so the same
# transform must be applied here; feeding raw counts would put the feature on
# a different scale from the one `lr2` was fitted on.
test_mistakes = np.log1p(test.text.apply(count_spelling_mistakes))
test_large_words = test.text.apply(lambda text: nr_syllables_lr(4, text))
test_small_words = test.text.apply(procent_small_syllables)

# Same column names and order as `train2`.
test2 = pd.DataFrame({
    'mistakes': test_mistakes,
    'largeWords': test_large_words,
    'smallWords': test_small_words,
})
# Keep the second probability column (index 1).
mistake_pred = lr2.predict_proba(test2)[:, 1]

## Combining the results

In [14]:
# Weighted blend of the TF-IDF ensemble and the spelling/syllable model.
TFIDF_WEIGHT = 0.67
SPELLING_WEIGHT = 0.33
final_pred = TFIDF_WEIGHT * preds + SPELLING_WEIGHT * mistake_pred

In [15]:
# Write the blended predictions, one row per test essay id.
submission = pd.DataFrame({'id': test['id'], 'generated': final_pred})
submission.to_csv('submission.csv', index=False)