### Allow permissions to be set on a file or directory

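This is a simple use of the Unix `chmod` command. It lets the user set the permissions of a file or directory using octal notation. In the example below, `-R` applies the change recursively and `777` grants read, write, and execute permission to everyone, so use it with care.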

```bash
sudo chmod -R 777 /path/to/file
```

In [1]:
import sys
import gc # garbage collector 

import pandas as pd 
from sklearn.model_selection import StratifiedKFold
"""
What is the difference between KFold, StratifiedKFold and StratifiedGroupKFold?
https://youtu.be/PF2wLKv2lsI?si=RNC5gu6J7Y_UIWR5
"""

import numpy as np 
from sklearn.metrics import roc_auc_score 
from sklearn.feature_extraction.text import TfidfVectorizer 

from tokenizers import (
    decoders,
    models,
    normalizers,
    pre_tokenizers,
    processors,
    trainers,
    Tokenizer,
)

from datasets import Dataset
from tqdm.auto import tqdm 
from transformers import PreTrainedTokenizerFast 
from sklearn.linear_model import SGDClassifier 
from sklearn.naive_bayes import MultinomialNB 
from sklearn.ensemble import VotingClassifier

None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.


In [2]:
# Competition files: original training essays, test essays and the sample
# submission file. `org_train` is loaded here but not referenced again in the
# visible cells.
org_train = pd.read_csv("../text_generation_detection/data/train_essays.csv")
test = pd.read_csv("../text_generation_detection/data/test_essays.csv")
sub = pd.read_csv("../text_generation_detection/data/sample_submission.csv")

# Training corpus that the later cells tokenize and fit on.
train = pd.read_csv("../text_generation_detection/data/train_v2_drcat_02.csv")

In [3]:
# Keep a single row per distinct essay text, then renumber the rows 0..n-1
# (the old index is discarded rather than kept as a column).
train = train.drop_duplicates(subset=['text']).reset_index(drop=True)

In [4]:
# Peek at the raw test essays as an array.
test['text'].values

array(['Aaa bbb ccc.', 'Bbb ccc ddd.', 'CCC ddd eee.'], dtype=object)

### Data processing

In [5]:
# Whether the tokenizer's normalizer should lowercase the text.
LOWERCASE = False
# Target vocabulary size handed to the BPE trainer.
# NOTE(review): 30522 looks like the BERT base vocabulary size; confirm this is intentional.
VOCAB_SIZE = 30522

##### TqdmWarning: this warning indicates that the `IProgress` module was not found and suggests updating Jupyter and ipywidgets.

To address this warning, you can follow these steps:

```bash
!pip install --upgrade jupyter
!pip install --upgrade ipywidgets
!pip install "ipywidgets>=7.6"
!pip install IProgress
```

In [6]:
# Create a Byte-Pair Encoding (BPE) tokenizer; "[UNK]" is its unknown token.
raw_tokenizer = Tokenizer(models.BPE(unk_token="[UNK]"))

# Normalization: always apply Unicode NFC, and lowercase only when LOWERCASE is set.
# The parentheses around the conditional are required: a conditional expression
# binds more loosely than `+`, so without them the whole concatenation becomes
# the "if" branch and NFC is silently dropped whenever LOWERCASE is False.
raw_tokenizer.normalizer = normalizers.Sequence(
    [normalizers.NFC()] + ([normalizers.Lowercase()] if LOWERCASE else [])
)
raw_tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel()

# Special tokens and the trainer that will learn the BPE vocabulary.
special_tokens = ["[UNK]", "[PAD]", "[CLS]", "[SEP]", "[MASK]"]
trainer = trainers.BpeTrainer(vocab_size=VOCAB_SIZE, special_tokens=special_tokens)

# Wrap the test essays' text column in a Hugging Face Dataset so it can be
# sliced in chunks when training the tokenizer.
dataset = Dataset.from_pandas(test[['text']])

# print(dataset)

def train_corp_iter(batch_size=1000):
    """
    Generator that yields the "text" column of the module-level `dataset`
    in consecutive chunks.

    Args:
        batch_size: Maximum number of rows per yielded chunk. Defaults to
            1000, the chunk size this function previously hard-coded.

    Yields:
        ``dataset[start:start + batch_size]["text"]`` for each chunk start;
        the last chunk may be shorter than ``batch_size``.
    """
    for start in range(0, len(dataset), batch_size):
        yield dataset[start:start + batch_size]["text"]

# Learn the BPE vocabulary from the chunked corpus.
# REMEMBER: `dataset` wraps the test essays, so the tokenizer is trained on the test set.
raw_tokenizer.train_from_iterator(train_corp_iter(), trainer=trainer)

# Expose the trained tokenizer through the transformers fast-tokenizer wrapper,
# registering the same special tokens it was trained with.
tokenizer = PreTrainedTokenizerFast(
    tokenizer_object=raw_tokenizer,
    unk_token="[UNK]",
    pad_token="[PAD]",
    cls_token="[CLS]",
    sep_token="[SEP]",
    mask_token="[MASK]",
)

# Tokenize the test essays with the new tokenizer (one token list per essay).
tokenized_texts_test = [
    tokenizer.tokenize(essay) for essay in tqdm(test['text'].tolist())
]

# Tokenize the training essays the same way.
tokenized_texts_train = [
    tokenizer.tokenize(essay) for essay in tqdm(train['text'].tolist())
]






  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/44868 [00:00<?, ?it/s]

In [7]:
# Display the wrapped tokenizer's repr (vocabulary size, special tokens, ...).
tokenizer

PreTrainedTokenizerFast(name_or_path='', vocab_size=35, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	3: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	4: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

In [8]:
# Inspect the token lists produced for the test essays (one list per essay).
tokenized_texts_test

[['ĠAaa', 'Ġbbb', 'Ġccc', '.'],
 ['ĠBbb', 'Ġccc', 'Ġddd', '.'],
 ['ĠCCC', 'Ġddd', 'Ġeee', '.']]

In [9]:
def dummy(text):
    """
    Identity function: return the argument unchanged.

    It is passed to TfidfVectorizer as both `tokenizer` and `preprocessor`
    so that the vectorizer consumes the already-tokenized documents as they
    are instead of processing them again.
    """
    return text

In [10]:
# Learn the n-gram vocabulary from the tokenized test essays only.
# The documents are already token lists, so `dummy` stands in for both the
# preprocessor and the tokenizer, and the regex token pattern is disabled.
vectorizer = TfidfVectorizer(
    analyzer='word',
    tokenizer=dummy,
    preprocessor=dummy,
    token_pattern=None,
    ngram_range=(3, 5),
    lowercase=False,
    sublinear_tf=True,
    strip_accents='unicode',
).fit(tokenized_texts_test)

# Mapping from each n-gram to its column index.
vocab = vectorizer.vocabulary_

print(vocab)

{'ĠAaa Ġbbb Ġccc': 0, 'Ġbbb Ġccc .': 6, 'ĠAaa Ġbbb Ġccc .': 1, 'ĠBbb Ġccc Ġddd': 2, 'Ġccc Ġddd .': 7, 'ĠBbb Ġccc Ġddd .': 3, 'ĠCCC Ġddd Ġeee': 4, 'Ġddd Ġeee .': 8, 'ĠCCC Ġddd Ġeee .': 5}


In [12]:
# Second vectorizer restricted to the n-grams collected in `vocab`: it is
# fitted on the training essays and then applied to the test essays, so both
# matrices share the same columns.
vectorizer = TfidfVectorizer(
    vocabulary=vocab,
    analyzer='word',
    tokenizer=dummy,
    preprocessor=dummy,
    token_pattern=None,
    ngram_range=(3, 5),
    lowercase=False,
    sublinear_tf=True,
    strip_accents='unicode',
)
tf_train = vectorizer.fit_transform(tokenized_texts_train)
tf_test = vectorizer.transform(tokenized_texts_test)

# The vectorizer is no longer needed: drop it and run the garbage collector
# to free memory.
del vectorizer
gc.collect()

568

In [20]:
# Training targets taken from the 'label' column of the de-duplicated training frame.
y_train = train['label'].values
y_train

array([0, 0, 0, ..., 1, 1, 1])

In [16]:
# Shape of the training TF-IDF matrix: (training essays, n-grams in the vocabulary).
tf_train.shape 

(44868, 9)

In [17]:
# Shape of the test TF-IDF matrix: (test essays, n-grams in the vocabulary).
tf_test.shape

(3, 9)

In [21]:
# Two linear text classifiers combined by weighted soft voting.
bayes_model = MultinomialNB(alpha=0.02)
# SGD shuffles the training data on every epoch; a fixed random_state makes the
# fit (and therefore the submission) reproducible across runs.
# The "modified_huber" loss is one of the SGD losses that provides
# predict_proba, which soft voting requires.
sgd_model = SGDClassifier(
    max_iter=8000,
    tol=1e-4,
    loss="modified_huber",
    random_state=42,
)
ensemble = VotingClassifier(
    estimators=[('sgd', sgd_model), ('nb', bayes_model)],
    weights=[0.7, 0.3],  # SGD probabilities count for 70%, naive Bayes for 30%
    voting='soft',
    n_jobs=-1,  # fit the estimators in parallel on all available cores
)
ensemble.fit(tf_train, y_train)

gc.collect()

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

537

In [24]:
# Column 1 of predict_proba is the probability of class 1 (essay is generated).
final_preds = ensemble.predict_proba(tf_test)[:, 1]

# Put the probabilities in the 'generated' column and write the submission file.
sub = sub.assign(generated=final_preds)
sub.to_csv('submission.csv', index=False)

sub

Unnamed: 0,id,generated
0,0000aaaa,0.381286
1,1111bbbb,0.381286
2,2222cccc,0.381286
