# Set Environment

In [1]:
%mkdir neshan
! pip install virtualenv > /dev/null
! virtualenv neshan  > /dev/null
! neshan/bin/pip install hazm > /dev/null

In [2]:
! neshan/bin/pip install transformers > /dev/null

In [3]:
! pip install accelerate==0.20.3  > /dev/null

In [4]:
! neshan/bin/pip install transformers[torch] > /dev/null

In [5]:
! neshan/bin/pip install accelerate -U > /dev/null

In [8]:
! pip install sentence-transformers  > /dev/null

In [21]:
import sys
sys.path.append("neshan/lib/python3.10/site-packages")


import hazm
import torch
import warnings
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import TrainingArguments, Trainer
from hazm import Normalizer, Stemmer, word_tokenize, stopwords_list
from sentence_transformers import SentenceTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from torch.utils.data import DataLoader, TensorDataset
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup

warnings.filterwarnings("ignore")

In [None]:
! neshan/bin/pip list

Package                  Version
------------------------ ----------
accelerate               0.23.0
certifi                  2023.7.22
charset-normalizer       3.2.0
click                    8.1.7
cmake                    3.27.4.1
fasttext-wheel           0.9.2
filelock                 3.12.4
fsspec                   2023.9.1
gensim                   4.3.2
hazm                     0.9.3
huggingface-hub          0.17.1
idna                     3.4
Jinja2                   3.1.2
joblib                   1.3.2
lit                      16.0.6
MarkupSafe               2.1.3
mpmath                   1.3.0
networkx                 3.1
nltk                     3.8.1
numpy                    1.26.0
nvidia-cublas-cu11       11.10.3.66
nvidia-cuda-cupti-cu11   11.7.101
nvidia-cuda-nvrtc-cu11   11.7.99
nvidia-cuda-runtime-cu11 11.7.99
nvidia-cudnn-cu11        8.5.0.96
nvidia-cufft-cu11        10.9.0.58
nvidia-curand-cu11       10.2.10.91
nvidia-cusolver-cu11     11.4.0.1
nvidia-cusparse-cu11     

# Read/Prepare Dataset

In [7]:
! unzip /content/drive/MyDrive/Data\ \Scientist/nlp/project_1_data/hamshahri_dataset.zip > /dev/null

## Using BERT as Feature Extractor

In [None]:
normalizer = Normalizer()
stemmer = Stemmer()
stopwords = stopwords_list()

def preprocess_text(text):
    text = normalizer.normalize(text)
    tokens = word_tokenize(text)
    tokens = [stemmer.stem(token) for token in tokens if token not in stopwords]
    return ' '.join(tokens)

In [None]:
df = pd.read_csv('/content/hamshahri_dataset/train.csv')
df.rename(columns={'CAT[2]/text()': 'category',
                   'TEXT[1]/text()': 'text', 'Unnamed: 0': 'id'}, inplace=True)
df.drop(columns=['id'], inplace=True)

df['processed_text'] = df['text'].apply(preprocess_text)

In [None]:
model_name = "bert-base-multilingual-cased"
model = SentenceTransformer(model_name)

embeddings = model.encode(df['processed_text'].tolist(), show_progress_bar=True)

X_train, X_test, y_train, y_test = train_test_split(embeddings, df['category'],
                                                    test_size=0.2,
                                                    random_state=42)

classifier = LogisticRegression()
classifier.fit(X_train, y_train)

y_pred = classifier.predict(X_test)

print(classification_report(y_test, y_pred))



Batches:   0%|          | 0/2005 [00:00<?, ?it/s]

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


                     precision    recall  f1-score   support

            Economy       0.92      0.92      0.92      3221
 Literature and Art       0.84      0.81      0.82       833
           Politics       0.85      0.89      0.87      3031
Science and Culture       0.80      0.81      0.81      1811
             Social       0.77      0.69      0.73      1453
              Sport       0.99      0.98      0.98      2479

           accuracy                           0.88     12828
          macro avg       0.86      0.85      0.85     12828
       weighted avg       0.88      0.88      0.88     12828



## Test on real test set!

In [None]:
test_df = pd.read_csv('/content/hamshahri_dataset/test.csv')
test_df.rename(columns={'CAT[2]/text()': 'category',
                   'TEXT[1]/text()': 'text', 'Unnamed: 0': 'id'}, inplace=True)
test_df.drop(columns=['id'], inplace=True)

test_df['processed_text'] = test_df['text'].apply(preprocess_text)

test_embeddings = model.encode(test_df['processed_text'].tolist(), s
                               how_progress_bar=True)

test_predictions = classifier.predict(test_embeddings)
print(classification_report(test_df['category'], test_predictions))

Batches:   0%|          | 0/502 [00:00<?, ?it/s]

                     precision    recall  f1-score   support

            Economy       0.92      0.92      0.92      3967
 Literature and Art       0.82      0.81      0.81      1037
           Politics       0.86      0.88      0.87      3788
Science and Culture       0.79      0.82      0.80      2336
             Social       0.75      0.67      0.71      1776
              Sport       0.99      0.98      0.98      3132

           accuracy                           0.87     16036
          macro avg       0.85      0.85      0.85     16036
       weighted avg       0.87      0.87      0.87     16036



# Fine-tune

In [15]:
df['category'].unique()

array(['Literature and Art', 'Sport', 'Social', 'Economy', 'Politics',
       'Science and Culture'], dtype=object)

In [35]:
df = pd.read_csv('/content/hamshahri_dataset/train.csv')
df.rename(columns={'CAT[2]/text()': 'category', 'TEXT[1]/text()': 'text',
                   'Unnamed: 0': 'id'}, inplace=True)

df.drop(columns=['id'], inplace=True)
df = df.sample(frac=0.2, random_state=1)     # Sampling? :)
df['category'] = df['category'].replace(['Literature and Art', 'Sport',
                                         'Social', 'Economy', 'Politics',
                                         'Science and Culture'],
                                          [0, 1, 2, 3, 4, 5])

X_train = df['text']
y_train = df['category']
tokenizer = BertTokenizer.from_pretrained("bert-base-multilingual-cased",
                                          do_lower_case=False)

X_train_encoded = tokenizer(list(X_train), truncation=True, padding=True,
                            return_tensors="pt", max_length=128)
y_train = torch.tensor(y_train.tolist())

train_dataset = TensorDataset(X_train_encoded["input_ids"],
                              X_train_encoded["attention_mask"], y_train)
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)

In [37]:
model = BertForSequenceClassification.from_pretrained("bert-base-multilingual-cased",
                                                      num_labels=len(df['category'].unique()))

optimizer = AdamW(model.parameters(), lr=2e-5)
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps=0,
                                            num_training_steps=len(train_loader) * 3)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

epochs = 5
for epoch in range(epochs):
    model.train()
    total_loss = 0

    for batch in tqdm(train_loader, desc=f"Epoch {epoch + 1}"):
        input_ids, attention_mask, labels = batch
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        labels = labels.to(device)

        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss

        loss.backward()
        total_loss += loss.item()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()

    print(f"\nEpoch {epoch + 1}, Loss: {total_loss}")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1: 100%|██████████| 802/802 [05:14<00:00,  2.55it/s]


Epoch 1, Loss: 403.0851493421942


Epoch 2: 100%|██████████| 802/802 [05:23<00:00,  2.48it/s]


Epoch 2, Loss: 210.9157286696136


Epoch 3: 100%|██████████| 802/802 [05:21<00:00,  2.50it/s]


Epoch 3, Loss: 143.4076062082313


Epoch 4: 100%|██████████| 802/802 [05:21<00:00,  2.50it/s]


Epoch 4, Loss: 109.94394757272676


Epoch 5: 100%|██████████| 802/802 [05:21<00:00,  2.50it/s]

Epoch 5, Loss: 111.75838274881244





In [27]:
! nvidia-smi

Sun Sep 17 14:39:33 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.105.17   Driver Version: 525.105.17   CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   75C    P0    33W /  70W |  13639MiB / 15360MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [24]:
torch.cuda.empty_cache()

# Now, let's test our model!

In [44]:
test_df = pd.read_csv('/content/hamshahri_dataset/test.csv')
test_df.rename(columns={'CAT[2]/text()': 'category',
                   'TEXT[1]/text()': 'text', 'Unnamed: 0': 'id'}, inplace=True)
test_df.drop(columns=['id'], inplace=True)
test_df['category'] = test_df['category'].replace(['Literature and Art', 'Sport', 'Social', 'Economy', 'Politics', 'Science and Culture'],[0, 1, 2, 3, 4, 5])
test_texts = test_df['text'].tolist()
y_test = test_df['category']
y_test = torch.tensor(y_test.tolist())

tokenizer = BertTokenizer.from_pretrained("bert-base-multilingual-cased", do_lower_case=False)
test_encoded = tokenizer(test_texts, truncation=True, padding=True, return_tensors="pt", max_length=128)
test_dataset = TensorDataset(test_encoded["input_ids"], test_encoded["attention_mask"], y_test)

In [45]:
test_batch_size = 4
test_loader = DataLoader(test_dataset, batch_size=test_batch_size)

model.eval()
predictions = []
true_labels = []

with torch.no_grad():
    for batch in tqdm(test_loader, desc="Inference"):
        input_ids, attention_mask, labels = batch
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)

        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        batch_predictions = torch.argmax(logits, dim=1).cpu().numpy()

        predictions.extend(batch_predictions)
        true_labels.extend(labels.cpu().numpy())

        torch.cuda.empty_cache()

print(classification_report(true_labels, predictions))

Inference: 100%|██████████| 4009/4009 [02:15<00:00, 29.68it/s]


              precision    recall  f1-score   support

           0       0.80      0.90      0.85      1037
           1       0.99      0.98      0.98      3132
           2       0.82      0.82      0.82      1776
           3       0.94      0.96      0.95      3967
           4       0.95      0.91      0.93      3788
           5       0.87      0.85      0.86      2336

    accuracy                           0.92     16036
   macro avg       0.90      0.90      0.90     16036
weighted avg       0.92      0.92      0.92     16036

