In [2]:
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import precision_score, recall_score, f1_score

In [3]:
from transformers import AutoTokenizer

tr_tokenizer = AutoTokenizer.from_pretrained("alibayram/tr_tokenizer")
tr_tokenizer.is_fast



True

In [4]:
tr_embeddings = []

for i in range(3):
    tr_embeddings.append(torch.load(f'tr_cosmos_embeddings_{i}.pt'))

tr_embeddings = torch.cat(tr_embeddings)

print(tr_embeddings.shape)

torch.Size([30158, 768])


  tr_embeddings.append(torch.load(f'tr_cosmos_embeddings_{i}.pt'))


In [25]:
embedding = nn.Embedding.from_pretrained(tr_embeddings)
embedding

Embedding(30158, 768)

In [55]:
embedding(torch.tensor([0, 1, 2, 3, 4])).shape

torch.Size([5, 768])

In [51]:
class TextClassifier(nn.Module):
    def __init__(self, embedding_dim, hidden_dim, num_classes):
        super(TextClassifier, self).__init__()
        self.embedding = nn.Embedding.from_pretrained(tr_embeddings)
        self.rnn = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, num_classes)
        
    def forward(self, x):
        embedded = self.embedding(x)
        # average = embedded.mean(dim=0)
        output, _ = self.rnn(embedded)
        last_hidden = output[:, -1, :]
        logits = self.fc(last_hidden)
        return logits

In [52]:
# Training parameters

num_classes = 2
embedding_dim = 768
hidden_dim = 256
num_epochs = 5
batch_size = 32
learning_rate = 0.001

# Model, loss function, and optimizer
model = TextClassifier(embedding_dim, hidden_dim, num_classes)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)


In [1]:
import pandas as pd
from codes.pre_processor import PreProcessor

df = pd.read_csv('new_df.csv')

preprocessor = PreProcessor(df)
df = preprocessor.preprocess()
df

  df.loc[df['doctor_speciality'].isin(speciality_mapping[key]), 'doctor_speciality'] = key
  df.loc[df['doctor_title'].isin(title_mapping[key]), 'doctor_title'] = key


Unnamed: 0,doctor_title,doctor_speciality,text
0,7,0,"Merhaba, Yaklaşık 4 aydır sağ omuzumda kolumu ..."
1,6,0,Hocam merhaba 29 yaşında erkek hasta hareketle...
2,6,0,"Meraba hocam sorum şu , bir erkek bayıltılıp a..."
3,6,1,Merhaba doktor hanim ben 3 yildir evliyim. Hic...
4,6,1,Merhaba.. Benim size sorum olacakti .. Ben 3 s...
...,...,...,...
124743,6,0,Merhaba L3 4 L4 5 duzeyınde anular buldıng ızl...
124744,6,1,"hocam merhaba , son kontrolüme göre gebelik ha..."
124745,6,0,Hocam 15 yildir basur sikintisi çeken biriyim ...
124746,6,1,hocam öncelikle meraba ortalama 3 yıl önce kad...


In [4]:
df['doctor_speciality'].value_counts().to_dict()

{1: 67286, 0: 57462}

In [34]:
df['text'] = df['question_content'] + " " + df['question_answer']
df = df.drop(['question_content', 'question_answer'], axis=1)
# change column name from doctor_speciality to label
df = df.rename(columns={'doctor_speciality': 'label'})
df

Unnamed: 0,doctor_title,label,text
0,7,0,"Merhaba, Yaklaşık 4 aydır sağ omuzumda kolumu ..."
1,6,0,Hocam merhaba 29 yaşında erkek hasta hareketle...
2,6,0,"Meraba hocam sorum şu , bir erkek bayıltılıp a..."
3,6,1,Merhaba doktor hanim ben 3 yildir evliyim. Hic...
4,6,1,Merhaba.. Benim size sorum olacakti .. Ben 3 s...
...,...,...,...
124743,6,0,Merhaba L3 4 L4 5 duzeyınde anular buldıng ızl...
124744,6,1,"hocam merhaba , son kontrolüme göre gebelik ha..."
124745,6,0,Hocam 15 yildir basur sikintisi çeken biriyim ...
124746,6,1,hocam öncelikle meraba ortalama 3 yıl önce kad...


In [40]:
tr_tokenizer.encode("Ali Bayram")

[1402, 78, 27876]

In [45]:
# Creating Dataset objects
class TextDataset(Dataset):
    def __init__(self, df, indices):
        self.df = df
        self.indices = indices
        
    def __len__(self):
        return len(self.indices)
    
    def __getitem__(self, idx):
        row = self.df.iloc[self.indices[idx]]
        print(row['text'])
        return torch.tensor(tr_tokenizer.encode(row['text']))


In [36]:
from sklearn.model_selection import train_test_split

# Splitting the indices for train and validation sets
train_indices, val_indices = train_test_split(range(len(df)), test_size=0.2, random_state=42)


In [47]:

train_dataset = TextDataset(df, train_indices)
val_dataset = TextDataset(df, val_indices)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
valid_loader = DataLoader(val_dataset, batch_size=batch_size)


In [54]:
tokens = tr_tokenizer.encode("Ali Bayram")
outputs = model(tokens)
outputs

TypeError: embedding(): argument 'indices' (position 2) must be Tensor, not list

In [39]:
# Iterate over the training data for the specified number of epochs
for epoch in range(num_epochs):
    model.train()
    total_loss = 0.0
    total_samples = 0
    for inputs in train_loader:
        optimizer.zero_grad()
        print(inputs)
        inputs = torch.LongTensor(inputs)
        targets = inputs.clone()
        outputs = model(inputs)
        loss = criterion(outputs.view(-1, num_classes), targets.view(-1))
        loss.backward()
        optimizer.step()

        total_loss += loss.item() * len(inputs)
        total_samples += len(inputs)

    # Evaluate on the validation set after every epoch
    model.eval()
    total_val_loss = 0.0
    total_val_samples = 0
    with torch.no_grad():
        for inputs in valid_loader:
            inputs = torch.LongTensor(inputs)
            targets = inputs.clone()
            outputs = model(inputs)
            val_loss = criterion(outputs.view(-1, num_classes), targets.view(-1))

            total_val_loss += val_loss.item() * len(inputs)
            total_val_samples += len(inputs)

    avg_loss = total_loss / total_samples
    avg_val_loss = total_val_loss / total_val_samples

    print(f"Epoch {epoch+1}/{num_epochs}, Train Loss: {avg_loss:.4f}, Val Loss: {avg_val_loss:.4f}")

RuntimeError: each element in list of batch should be of equal size