In [None]:
!pip uninstall torch -y
!pip cache purge  #cleans cache
!pip install --upgrade torch torchvision torchaudio

Found existing installation: torch 2.6.0
Uninstalling torch-2.6.0:
  Successfully uninstalled torch-2.6.0
Files removed: 10
Collecting torch
  Downloading torch-2.6.0-cp311-cp311-manylinux1_x86_64.whl.metadata (28 kB)
Downloading torch-2.6.0-cp311-cp311-manylinux1_x86_64.whl (766.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m766.7/766.7 MB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: torch
Successfully installed torch-2.6.0


In [None]:
# Install PyTorch libraries (if not already installed)
try:
    import torch
except ImportError:
    !pip install --upgrade torch torchvision torchaudio

## Output-1

In [None]:
import torch
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer

# Fetch the three NLTK data packages this cell relies on
for resource_name in ('punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(resource_name)

# Read the translated comments for output set 1
df = pd.read_csv("translated_output_1.csv")  # Replace with actual path

# Lemmatizer and English stopword set used by custom_tokenizer
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

# Custom tokenizer
def custom_tokenizer(text):
    """Turn a raw comment into a list of lemmatized tokens.

    The text is lower-cased and split with ``word_tokenize``. Tokens that are
    not purely alphanumeric, or that appear in ``stop_words``, are discarded;
    every remaining token is lemmatized with the shared ``lemmatizer``.

    Args:
        text: The comment string to tokenize.

    Returns:
        list: The lemmatized tokens, in their original order.
    """
    kept = []
    for token in word_tokenize(text.lower()):
        if not token.isalnum():
            continue
        if token in stop_words:
            continue
        kept.append(lemmatizer.lemmatize(token))
    return kept

# Preprocess text: missing comments become the "<UNK>" placeholder, then every comment
# is wrapped in "<s> ... </s>" markers. This overwrites the 'Comment_en' column in place.
# NOTE(review): custom_tokenizer keeps only alphanumeric, non-stopword tokens, so the
# "<s>"/"</s>" markers presumably never reach the vocabulary -- confirm they are needed.
df['Comment_en'] = df['Comment_en'].fillna("<UNK>").apply(lambda x: f"<s> {x} </s>")

# Vectorize using TF-IDF: tokenization is done by custom_tokenizer and the
# vectorizer's own token_pattern is explicitly set to None.
tfidf_vectorizer = TfidfVectorizer(tokenizer=custom_tokenizer, token_pattern=None)
tfidf_matrix = tfidf_vectorizer.fit_transform(df['Comment_en'])

# Convert TF-IDF matrix to a float32 PyTorch tensor.
# NOTE(review): toarray() presumably builds a dense rows x vocabulary array -- confirm
# it fits in memory for the full dataset.
tfidf_tensor = torch.tensor(tfidf_matrix.toarray(), dtype=torch.float32)

# Extract the class labels and the 'Unnamed: 0' column (saved below under the 'index' key)
labels = df['CommentClass_en'].tolist()  # Values are kept exactly as read from the CSV; convert later if needed
unnamed_col = df['Unnamed: 0'].tolist()

# Bundle features, labels and row identifiers into one dictionary
data_bundle = {
    'features': tfidf_tensor,
    'labels': labels,
    'index': unnamed_col
}

# Save the bundle to a .pt file
torch.save(data_bundle, 'tfidf_output_1.pt')
print("Saved TF-IDF data as PyTorch .pt file!")

# Save the fitted TF-IDF vectorizer separately.
# NOTE(review): this stores a scikit-learn object through torch.save; loading it back
# presumably needs torch.load(..., weights_only=False) and custom_tokenizer to be
# defined in the loading session -- confirm at the load site.
torch.save(tfidf_vectorizer, 'tfidf_model_1.pt')
print("Saved TF-IDF vectorizer model as tfidf_model_1.pt")

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Saved TF-IDF data as PyTorch .pt file!
Saved TF-IDF vectorizer model as tfidf_model_1.pt


## Output-2

In [None]:
import torch
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer

# Fetch the three NLTK data packages this cell relies on
for resource_name in ('punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(resource_name)

# Read the translated comments for output set 2
df = pd.read_csv("translated_output_2.csv")  # Replace with actual path

# Lemmatizer and English stopword set used by custom_tokenizer
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

# Custom tokenizer
def custom_tokenizer(text):
    """Turn a raw comment into a list of lemmatized tokens.

    The text is lower-cased and split with ``word_tokenize``. Tokens that are
    not purely alphanumeric, or that appear in ``stop_words``, are discarded;
    every remaining token is lemmatized with the shared ``lemmatizer``.

    Args:
        text: The comment string to tokenize.

    Returns:
        list: The lemmatized tokens, in their original order.
    """
    kept = []
    for token in word_tokenize(text.lower()):
        if not token.isalnum():
            continue
        if token in stop_words:
            continue
        kept.append(lemmatizer.lemmatize(token))
    return kept

# Preprocess text: missing comments become the "<UNK>" placeholder, then every comment
# is wrapped in "<s> ... </s>" markers. This overwrites the 'Comment_en' column in place.
# NOTE(review): custom_tokenizer keeps only alphanumeric, non-stopword tokens, so the
# "<s>"/"</s>" markers presumably never reach the vocabulary -- confirm they are needed.
df['Comment_en'] = df['Comment_en'].fillna("<UNK>").apply(lambda x: f"<s> {x} </s>")

# Vectorize using TF-IDF: tokenization is done by custom_tokenizer and the
# vectorizer's own token_pattern is explicitly set to None.
tfidf_vectorizer = TfidfVectorizer(tokenizer=custom_tokenizer, token_pattern=None)
tfidf_matrix = tfidf_vectorizer.fit_transform(df['Comment_en'])

# Convert TF-IDF matrix to a float32 PyTorch tensor.
# NOTE(review): toarray() presumably builds a dense rows x vocabulary array -- confirm
# it fits in memory for the full dataset.
tfidf_tensor = torch.tensor(tfidf_matrix.toarray(), dtype=torch.float32)

# Extract the class labels and the 'Unnamed: 0' column (saved below under the 'index' key)
labels = df['CommentClass_en'].tolist()  # Values are kept exactly as read from the CSV; convert later if needed
unnamed_col = df['Unnamed: 0'].tolist()

# Bundle features, labels and row identifiers into one dictionary
data_bundle = {
    'features': tfidf_tensor,
    'labels': labels,
    'index': unnamed_col
}

# Save the bundle to a .pt file
torch.save(data_bundle, 'tfidf_output_2.pt')
print("Saved TF-IDF data as PyTorch .pt file!")

# Save the fitted TF-IDF vectorizer separately.
# NOTE(review): this stores a scikit-learn object through torch.save; loading it back
# presumably needs torch.load(..., weights_only=False) and custom_tokenizer to be
# defined in the loading session -- confirm at the load site.
torch.save(tfidf_vectorizer, 'tfidf_model_2.pt')
print("Saved TF-IDF vectorizer model as tfidf_model_2.pt")

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Saved TF-IDF data as PyTorch .pt file!
Saved TF-IDF vectorizer model as tfidf_model_2.pt


## SBERT

In [1]:
!pip install datasets
!pip install -U sentence-transformers
!pip install -U huggingface_hub

Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.5.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.2/491.2 kB[0m [31m14.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m11.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.12.0-py3-none-any.

## Output-1

In [4]:
import pandas as pd
import torch
import nltk
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sentence_transformers import SentenceTransformer, InputExample, losses
from torch.utils.data import DataLoader
import random
from datasets import Dataset
# Fetch the three NLTK data packages this cell relies on
for resource_name in ('punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(resource_name)

# Read the translated comments for output set 1
df = pd.read_csv("translated_output_1.csv")  # Replace with your CSV filename

# Helpers shared by preprocess_text: English stopword set and WordNet lemmatizer
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    """Normalize a comment into a space-separated string of lemmas.

    The text is lower-cased, every character other than ``a``-``z`` or
    whitespace is deleted, and the result is split with ``word_tokenize``.
    Tokens found in ``stop_words`` are dropped and the rest are lemmatized.

    Args:
        text: The comment string to clean.

    Returns:
        str: The surviving lemmas joined by single spaces.
    """
    letters_only = re.sub(r'[^a-z\s]', '', text.lower())
    lemmas = []
    for token in word_tokenize(letters_only):
        if token in stop_words:
            continue
        lemmas.append(lemmatizer.lemmatize(token))
    return ' '.join(lemmas)

import ast  # cell-local import: only the label parsing below needs it

# NOTE(review): astype(str) presumably turns missing comments into the text "nan" --
# confirm that is acceptable before these rows are paired and embedded.
df['cleaned_text'] = df['Comment_en'].astype(str).apply(preprocess_text)
# The class column holds Python-expression text read from the CSV (assumed to be plain
# literals such as a list of strings). ast.literal_eval parses literals only, so a
# crafted CSV cell cannot execute code the way eval() would allow.
df['CommentClass_en'] = df['CommentClass_en'].apply(ast.literal_eval)
# Grouping key: the first class of the row, or "Unknown" when the parsed value is empty
df['label'] = df['CommentClass_en'].apply(lambda x: x[0] if len(x) > 0 else "Unknown")

# Generate positive pairs: shuffle the comments of each label and pair neighbours
# (0,1), (2,3), ... A label with a single comment yields no pair, and the last comment
# of an odd-sized group is left unpaired.
# NOTE(review): `random` is not seeded, so the pairing differs between runs.
input_examples = []
for label, group in df.groupby('label'):
    texts = group['cleaned_text'].tolist()
    random.shuffle(texts)
    for i in range(0, len(texts) - 1, 2):
        input_examples.append(InputExample(texts=[texts[i], texts[i+1]]))

print(f"Created {len(input_examples)} training pairs")

# Load base SBERT model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Fine-tune model on the same-label pairs
train_dataloader = DataLoader(input_examples, shuffle=True, batch_size=16)
train_loss = losses.MultipleNegativesRankingLoss(model)
model.fit(train_objectives=[(train_dataloader, train_loss)],
          epochs=20,
          warmup_steps=100,
          show_progress_bar=True)

# Save fine-tuned model twice: via model.save() under "sbert_model_1", and as a
# torch.save() dump of the whole model object in 'sbert_model_1.pt'
model.save("sbert_model_1")
torch.save(model, 'sbert_model_1.pt')
print("Saved fine-tuned model to 'sbert_model_1.pt'")

# Encode all comments using fine-tuned model
all_embeddings = model.encode(df['cleaned_text'].tolist(), convert_to_tensor=True)

# Bundle embeddings, parsed labels and the 'Unnamed: 0' column into a dictionary and save as .pt
data_bundle = {
    'features': all_embeddings,
    'labels': df['CommentClass_en'].tolist(),
    'index': df['Unnamed: 0'].tolist()
}

torch.save(data_bundle, 'sbert_output_1.pt')
print("Saved embeddings to 'sbert_output_1.pt'")


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Created 6497 training pairs


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mkritimadumadukala[0m ([33mkritimadumadukala-iiit-hyderabad[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss
500,2.5416
1000,2.1138
1500,2.0131
2000,1.9452
2500,1.8668
3000,1.8092
3500,1.7689
4000,1.7091
4500,1.6779
5000,1.634


Saved fine-tuned model to 'sbert_model_1.pt'
Saved embeddings to 'sbert_output_1.pt'


## Output-2

In [None]:
import pandas as pd
import torch
import nltk
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sentence_transformers import SentenceTransformer, InputExample, losses
from torch.utils.data import DataLoader
import random
from datasets import Dataset
# Download 'punkt_tab', the tokenizer resource every other cell in this notebook fetches.
# Recent NLTK releases have word_tokenize load 'punkt_tab'; with only the older 'punkt'
# package this cell works solely because an earlier cell already downloaded 'punkt_tab'.
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

# Read the translated comments for output set 2
df = pd.read_csv("translated_output_2.csv")  # Replace with your CSV filename

# Helpers shared by preprocess_text: WordNet lemmatizer and English stopword set
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    """Normalize a comment into a space-separated string of lemmas.

    The text is lower-cased, every character other than ``a``-``z`` or
    whitespace is deleted, and the result is split with ``word_tokenize``.
    Tokens found in ``stop_words`` are dropped and the rest are lemmatized.

    Args:
        text: The comment string to clean.

    Returns:
        str: The surviving lemmas joined by single spaces.
    """
    letters_only = re.sub(r'[^a-z\s]', '', text.lower())
    lemmas = []
    for token in word_tokenize(letters_only):
        if token in stop_words:
            continue
        lemmas.append(lemmatizer.lemmatize(token))
    return ' '.join(lemmas)

import ast  # cell-local import: only the label parsing below needs it

# NOTE(review): astype(str) presumably turns missing comments into the text "nan" --
# confirm that is acceptable before these rows are paired and embedded.
df['cleaned_text'] = df['Comment_en'].astype(str).apply(preprocess_text)
# The class column holds Python-expression text read from the CSV (assumed to be plain
# literals such as a list of strings). ast.literal_eval parses literals only, so a
# crafted CSV cell cannot execute code the way eval() would allow.
df['CommentClass_en'] = df['CommentClass_en'].apply(ast.literal_eval)
# Grouping key: the first class of the row, or "Unknown" when the parsed value is empty
df['label'] = df['CommentClass_en'].apply(lambda x: x[0] if len(x) > 0 else "Unknown")

# Generate positive pairs: shuffle the comments of each label and pair neighbours
# (0,1), (2,3), ... A label with a single comment yields no pair, and the last comment
# of an odd-sized group is left unpaired.
# NOTE(review): `random` is not seeded, so the pairing differs between runs.
input_examples = []
for label, group in df.groupby('label'):
    texts = group['cleaned_text'].tolist()
    random.shuffle(texts)
    for i in range(0, len(texts) - 1, 2):
        input_examples.append(InputExample(texts=[texts[i], texts[i+1]]))

# The recorded run of this cell reported 2748 pairs
print(f"Created {len(input_examples)} training pairs")

# Load base SBERT model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Fine-tune model on the same-label pairs
train_dataloader = DataLoader(input_examples, shuffle=True, batch_size=16)
train_loss = losses.MultipleNegativesRankingLoss(model)
model.fit(train_objectives=[(train_dataloader, train_loss)],
          epochs=20,
          warmup_steps=100,
          show_progress_bar=True)

# Save fine-tuned model twice: via model.save() under "sbert_model_2", and as a
# torch.save() dump of the whole model object in 'sbert_model_2.pt'
model.save("sbert_model_2")
torch.save(model, 'sbert_model_2.pt')
print("Saved fine-tuned model to 'sbert_model_2.pt'")

# Encode all comments using fine-tuned model
all_embeddings = model.encode(df['cleaned_text'].tolist(), convert_to_tensor=True)

# Bundle embeddings, parsed labels and the 'Unnamed: 0' column into a dictionary and save as .pt
data_bundle = {
    'features': all_embeddings,
    'labels': df['CommentClass_en'].tolist(),
    'index': df['Unnamed: 0'].tolist()
}

torch.save(data_bundle, 'sbert_output_2.pt')
print("Saved embeddings to 'sbert_output_2.pt'")


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Created 2748 training pairs


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss
500,2.6128
1000,2.1958
1500,2.0229
2000,1.8559
2500,1.7331
3000,1.6357


Saved fine-tuned model to 'sbert_model_2.pt'
Saved embeddings to 'sbert_output_2.pt'
