### Allows permissions to be set on a file or directory

This is a simple implementation of the `chmod` command in Unix. It allows the user to set the permissions of a file or directory using the octal notation.

```bash
sudo chmod -R 777 /path/to/file
```

In [1]:
import sys
import gc # garbage collector 

import pandas as pd 
from sklearn.model_selection import StratifiedKFold
"""
What is the difference between KFold, StratifiedKFold and StratifiedGroupKFold?
https://stackoverflow.com/questions/45969390/difference-between-kfold-stratifiedkfold-and-stratifiedgroupkfold-in-sklearn

StratifiedKFold is a variation of k-fold which returns stratified folds: each set contains approximately the same percentage of samples of each target class as the complete set.

StratifiedGroupKFold is a variation of k-fold which returns stratified folds while keeping groups non-overlapping: all samples of a given group land in the same fold, and the class proportions are preserved as far as that constraint allows.
"""
import numpy as np 
from sklearn.metrics import roc_auc_score 
from sklearn.feature_extraction.text import TfidfVectorizer 

from tokenizers import (
    decoders,
    models,
    normalizers,
    pre_tokenizers,
    processors,
    trainers,
    Tokenizer,
)

from datasets import Dataset
from tqdm.auto import tqdm 
from transformers import PreTrainedTokenizerFast 
from sklearn.linear_model import SGDClassifier 
from sklearn.naive_bayes import MultinomialNB 
from sklearn.ensemble import VotingClassifier

In [2]:
# Training data: `train` is read from train_v2_drcat_02.csv and
# `org_train` from train_essays.csv.
train = pd.read_csv('../text_generation_detection/data/train_v2_drcat_02.csv')
org_train = pd.read_csv('../text_generation_detection/data/train_essays.csv')

# Test essays and the sample submission file.
test = pd.read_csv('../text_generation_detection/data/test_essays.csv')
sub = pd.read_csv('../text_generation_detection/data/sample_submission.csv')

In [3]:
# Drop rows whose essay text repeats an earlier row, then renumber the
# index from 0 so positional and label-based access agree again.
train = train.drop_duplicates(subset=['text']).reset_index(drop=True)

In [4]:
# Display the raw essay texts of the test set as an array.
test.text.values

array(['Aaa bbb ccc.', 'Bbb ccc ddd.', 'CCC ddd eee.'], dtype=object)

### Data processing

In [5]:
# When True, a Lowercase normalizer is added to the tokenizer's normalizers.
LOWERCASE = False
# Vocabulary size passed to the BPE trainer.
VOCAB_SIZE = 30522

In [10]:
# Number of texts handled per chunk, both when feeding the tokenizer trainer
# and when tokenizing the training set.
CHUNK_SIZE = 1000

# Create a Byte-Pair Encoding tokenizer; unknown tokens map to "[UNK]".
raw_tokenizer = Tokenizer(models.BPE(unk_token="[UNK]"))

# Normalization: always apply NFC, and additionally lowercase when LOWERCASE
# is set. The parentheses around the conditional are required: without them
# the expression parses as `(NFC + Lowercase) if LOWERCASE else []`, which
# silently dropped NFC whenever LOWERCASE was False.
raw_tokenizer.normalizer = normalizers.Sequence(
    [normalizers.NFC()] + ([normalizers.Lowercase()] if LOWERCASE else [])
)
raw_tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel()

# Special tokens and the trainer instance.
special_tokens = ["[UNK]", "[PAD]", "[CLS]", "[SEP]", "[MASK]"]
trainer = trainers.BpeTrainer(vocab_size=VOCAB_SIZE, special_tokens=special_tokens)

# Hugging Face dataset holding only the `text` column of the test set.
dataset = Dataset.from_pandas(test[['text']])


def train_corp_iter():
    """Yield the dataset's texts in consecutive lists of up to CHUNK_SIZE items."""
    for start in range(0, len(dataset), CHUNK_SIZE):
        yield dataset[start:start + CHUNK_SIZE]["text"]


# NOTE: the tokenizer vocabulary is deliberately learned from the TEST set.
raw_tokenizer.train_from_iterator(train_corp_iter(), trainer=trainer)

# Wrap the trained tokenizer so it exposes the transformers tokenizer API.
tokenizer = PreTrainedTokenizerFast(
    tokenizer_object=raw_tokenizer,
    unk_token="[UNK]",
    pad_token="[PAD]",
    cls_token="[CLS]",
    sep_token="[SEP]",
    mask_token="[MASK]",
)

# Tokenize the test set with the new tokenizer (one progress tick per essay).
tokenized_texts_test = [
    tokenizer.tokenize(text) for text in tqdm(test['text'].tolist())
]

# Tokenize the train set chunk by chunk (one progress tick per chunk).
tokenized_texts_train = []
for start in tqdm(range(0, len(train['text']), CHUNK_SIZE)):
    batch_texts = train['text'].iloc[start:start + CHUNK_SIZE].tolist()
    tokenized_texts_train.extend([tokenizer.tokenize(text) for text in batch_texts])






  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/45 [00:00<?, ?it/s]