In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from transformers import GPT2Tokenizer, GPT2ForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
import torch
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

# Set the device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Load your dataset
file_path = '/content/preprocessed_combined_file (1).xlsx'
df = pd.read_excel(file_path)

# Split dataset into features and labels
X = df['Text']
y = df['Tag']

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Convert text data to numerical features using TF-IDF
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)  # Vectorize the test set correctly

# Apply SMOTE to balance the training set
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_tfidf, y_train)

# Convert TF-IDF matrix back to text (approximation)
feature_names = vectorizer.get_feature_names_out()
X_train_resampled_text = [" ".join([feature_names[i] for i in x.nonzero()[1]]) for x in X_train_resampled]
X_test_text = [" ".join([feature_names[i] for i in x.nonzero()[1]]) for x in X_test_tfidf]

# Convert to Hugging Face Dataset format
train_df_resampled = pd.DataFrame({'Text': X_train_resampled_text, 'Tag': y_train_resampled})
test_df = pd.DataFrame({'Text': X_test_text, 'Tag': y_test.reset_index(drop=True)})

train_dataset = Dataset.from_pandas(train_df_resampled)
test_dataset = Dataset.from_pandas(test_df)

# Tokenize the Dataset
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

# Add padding token
tokenizer.pad_token = tokenizer.eos_token

def tokenize_function(examples):
    return tokenizer(examples['Text'], padding="max_length", truncation=True, max_length=128)

train_dataset = train_dataset.map(tokenize_function, batched=True)
test_dataset = test_dataset.map(tokenize_function, batched=True)

# Remove unnecessary columns
train_dataset = train_dataset.remove_columns(["Text"])
test_dataset = test_dataset.remove_columns(["Text"])

# Rename the 'Tag' column to 'labels' in your datasets
train_dataset = train_dataset.rename_column('Tag', 'labels')
test_dataset = test_dataset.rename_column('Tag', 'labels')

# Set the format for PyTorch
train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
test_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

# Load GPT-2 model
model = GPT2ForSequenceClassification.from_pretrained('gpt2', num_labels=2, pad_token_id=tokenizer.eos_token_id)

# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',
    eval_strategy="epoch",  # Updated to eval_strategy
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
)

# Define the metrics
def compute_metrics(p):
    pred, labels = p
    pred = np.argmax(pred, axis=1)
    accuracy = accuracy_score(labels, pred)
    precision = precision_score(labels, pred)
    recall = recall_score(labels, pred)
    f1 = f1_score(labels, pred)
    return {'accuracy': accuracy, 'precision': precision, 'recall': recall, 'f1': f1}

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)

# Train the model
trainer.train()

# Evaluate the model on the test set
results = trainer.evaluate()

# Print evaluation metrics
for key, value in results.items():
    print(f"{key}: {value:.4f}")

# Save the model
model.save_pretrained("/content/gpt2-Text-classification")
tokenizer.save_pretrained("/content/gpt2-Text-classification")


Using device: cuda


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

Map:   0%|          | 0/13490 [00:00<?, ? examples/s]

Map:   0%|          | 0/2636 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.6121,4.824558,0.379363,0.365949,0.983176,0.533371
2,0.5621,4.582734,0.397572,0.368984,0.943218,0.530455
3,0.377,6.020714,0.394917,0.369741,0.961094,0.534034


eval_loss: 6.0207
eval_accuracy: 0.3949
eval_precision: 0.3697
eval_recall: 0.9611
eval_f1: 0.5340
eval_runtime: 18.9660
eval_samples_per_second: 138.9850
eval_steps_per_second: 17.4000
epoch: 3.0000


('/content/gpt2-Text-classification/tokenizer_config.json',
 '/content/gpt2-Text-classification/special_tokens_map.json',
 '/content/gpt2-Text-classification/vocab.json',
 '/content/gpt2-Text-classification/merges.txt',
 '/content/gpt2-Text-classification/added_tokens.json')

In [4]:
from transformers import GPT2Tokenizer, GPT2ForSequenceClassification
import torch

# Load the trained model and tokenizer
model_path = "/content/gpt2-Text-classification"
tokenizer = GPT2Tokenizer.from_pretrained(model_path)
model = GPT2ForSequenceClassification.from_pretrained(model_path)

# Set the model in evaluation mode
model.eval()


GPT2ForSequenceClassification(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2SdpaAttention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (score): Linear(in_features=768, out_features=2, bias=False)
)

In [6]:
def predict_unseen_Texts(Texts, tokenizer, model):
    # Tokenize the input Texts
    inputs = tokenizer(Texts, padding=True, truncation=True, max_length=128, return_tensors="pt")

    # Make predictions
    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs.logits
        predictions = torch.argmax(logits, dim=-1)

    return predictions

# Example unseen input Texts in Urdu
unseen_Texts = [
    "یہ ایک ٹیسٹ ٹویٹ ہے",
    "گانڈو ہم پھاڑتے ہیں ہماری کون پھاڑے گا تم لوگوں کی گانڈ میں اتنا دم نہیں زہنی مریض غلام نواز کے چوتڑ چاٹ یہاں کیا بک بک کرنے آگیا ہے",
    "موسم بہت اچھا ہے آج"
]

# Predict labels for the unseen Texts
predicted_labels = predict_unseen_Texts(unseen_Texts, tokenizer, model)

# Print the results
for Text, label in zip(unseen_Texts, predicted_labels):
    print(f"Text: {Text} -> Predicted Label: {label.item()}")


Text: یہ ایک ٹیسٹ ٹویٹ ہے -> Predicted Label: 0
Text: گانڈو ہم پھاڑتے ہیں ہماری کون پھاڑے گا تم لوگوں کی گانڈ میں اتنا دم نہیں زہنی مریض غلام نواز کے چوتڑ چاٹ یہاں کیا بک بک کرنے آگیا ہے -> Predicted Label: 1
Text: موسم بہت اچھا ہے آج -> Predicted Label: 0
