In [1]:
import torch

print(torch.cuda.is_available())
print(torch.cuda.device_count())
print(torch.cuda.current_device())
print(torch.cuda.device(0))
print(torch.cuda.get_device_name(0))
# Set device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

True
1
0
<torch.cuda.device object at 0x70997658b160>
Tesla V100-SXM3-32GB
Using device: cuda


In [2]:
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import torch
from torch.utils.data import DataLoader, Dataset

# Load data
questions = pd.read_csv('Questions.csv', encoding="latin-1")
tags = pd.read_csv('Tags.csv')

# Merge questions and tags
data = pd.merge(questions[['Id', 'Title', 'Body']], tags, left_on='Id', right_on='Id')


def combine_title_and_body(row):
    return row['Title'] + " " + row['Body']


data['Text'] = data.apply(combine_title_and_body, axis=1)
data.drop(['Title', 'Body'], axis=1, inplace=True)

data = data[data['Tag'] != 'python']
data = data[data['Tag'] != 'python-2.7']
data = data[data['Tag'] != 'python-3.x']

# Simplify: Use top N tags
N = 15
top_tags = data['Tag'].value_counts().nlargest(N).index
data = data[data['Tag'].isin(top_tags)]

data = data.groupby('Id').agg({
    'Text': 'first',  # Retain unique Text for each question
    'Tag': list  # Combine tags into a list
}).reset_index()


# Group tags by Id and select the most common tag
def process_tags(tags):
    if len(tags) > 1:
        # Select the most common tag for multi-tag questions
        tag_counts = pd.Series(tags).value_counts()
        return tag_counts.idxmax()
    return tags[0]  # Keep the single tag as is for single-tag questions


data['Tag'] = data['Tag'].apply(process_tags)


# Preprocess text
def preprocess_text(text):
    text = re.sub(r'<.*?>', '', text)  # Remove HTML tags
    text = re.sub(r'http\S+|www\S+|mailto:\S+', '', text)
    text = re.sub(r'<code>.*?</code>', '', text, flags=re.DOTALL)
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)  # Remove special characters
    text = re.sub(r'\s+', ' ', text)  # Normalize spaces
    text = text.lower().strip()  # Lowercase and strip
    return text


# Filter out empty titles after preprocessing
data['Text'] = data['Text'].apply(preprocess_text)
data = data[data['Text'].str.strip() != '']

# Encode tags
label_encoder = LabelEncoder()
data['Tag'] = label_encoder.fit_transform(data['Tag'])

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(data['Text'], data['Tag'], test_size=0.2, random_state=42)


In [15]:
# Group tags for each question
grouped_tags = data.groupby('Id')['Tag'].apply(list)

# Count questions with multiple tags
print(grouped_tags.apply(len).value_counts())

# Look at overlapping tags
print(grouped_tags[grouped_tags.apply(len) > 1].head())

data.head()

1    224563
Name: Tag, dtype: int64
Series([], Name: Tag, dtype: object)


Unnamed: 0,Id,Text,Tag
0,683,using in to match an attribute of python objec...,0
1,742,class views in django django view points to a ...,3
2,766,python and mysql i can get python to work with...,9
3,1983,python what is the difference between 123 and ...,7
4,2933,how can i create a directlyexecutable crosspla...,14


In [4]:
from torchtext.data.utils import get_tokenizer
from collections import Counter
from torchtext.vocab import vocab

# Tokenizer and vocabulary
tokenizer = get_tokenizer('basic_english')
counter = Counter()
for sentence in X_train:
    counter.update(tokenizer(sentence))

vocab = vocab(counter, min_freq=1, specials=["<unk>", "<pad>"])
vocab_size = len(vocab)
vocab.set_default_index(vocab["<unk>"])


# Function to convert text to tensor
def text_to_tensor(text):
    return torch.tensor([vocab[token] for token in tokenizer(text)], dtype=torch.long)


In [5]:
class TextDataset(Dataset):
    def __init__(self, texts, labels):
        self.texts = texts
        self.labels = labels

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text_tensor = text_to_tensor(self.texts.iloc[idx])
        label_tensor = torch.tensor(self.labels.iloc[idx], dtype=torch.long)
        return text_tensor, label_tensor


# Add a debug statement in the collate_fn function
def collate_fn(batch):
    texts, labels = zip(*batch)
    lengths = [len(text) for text in texts]
    if min(lengths) <= 0:
        print("Found empty sequence in batch:", lengths)
    texts = torch.nn.utils.rnn.pad_sequence(texts, batch_first=True)
    return texts, torch.tensor(labels), torch.tensor(lengths)


# Datasets and DataLoaders
train_dataset = TextDataset(X_train, y_train)
test_dataset = TextDataset(X_test, y_test)

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True, collate_fn=collate_fn)
test_loader = DataLoader(test_dataset, batch_size=64, collate_fn=collate_fn)


In [6]:
import torch.nn as nn


class RNNClassifier(nn.Module):
    def __init__(self, vocab_size, embed_size, hidden_size, num_classes):
        super(RNNClassifier, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.rnn = nn.LSTM(embed_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, num_classes)

    def forward(self, x, lengths):
        x = self.embedding(x)
        packed = nn.utils.rnn.pack_padded_sequence(x, lengths.cpu(), batch_first=True, enforce_sorted=False)
        _, (hidden, _) = self.rnn(packed)
        out = self.fc(hidden[-1])
        return out


In [7]:
import torch.optim as optim
from sklearn.utils.class_weight import compute_class_weight
import numpy as np

# Compute class weights
class_weights = compute_class_weight('balanced', classes=np.unique(data['Tag']), y=data['Tag'])
class_weights = torch.tensor(class_weights, dtype=torch.float).to(device)

# Model, Loss, Optimizer
embed_size = 128
hidden_size = 128
num_classes = N  # Number of tags
model = RNNClassifier(vocab_size, embed_size, hidden_size, num_classes).to(device)

criterion = nn.CrossEntropyLoss(weight=class_weights)
optimizer = optim.Adam(model.parameters(), lr=0.0002)

# Early stopping parameters
patience = 3  # Number of epochs to wait before stopping
best_val_loss = float('inf')
patience_counter = 0

# Training Loop with Early Stopping
num_epochs = 300

In [8]:
for epoch in range(num_epochs):
    model.train()
    total_loss = 0

    # Training
    for texts, labels, lengths in train_loader:
        # Move tensors to GPU
        texts, labels, lengths = texts.to(device), labels.to(device), lengths.to(device)

        optimizer.zero_grad()
        outputs = model(texts, lengths)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    avg_train_loss = total_loss / len(train_loader)

    # Validation
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for texts, labels, lengths in test_loader:
            # Move tensors to GPU
            texts, labels, lengths = texts.to(device), labels.to(device), lengths.to(device)
            outputs = model(texts, lengths)
            loss = criterion(outputs, labels)
            val_loss += loss.item()
    avg_val_loss = val_loss / len(test_loader)

    print(f"Epoch [{epoch + 1}/{num_epochs}], Train Loss: {avg_train_loss:.4f}, Val Loss: {avg_val_loss:.4f}")

    # Early Stopping
    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        patience_counter = 0  # Reset counter

        torch.save(model.state_dict(), "best_model.pth")
    else:
        patience_counter += 1
        print(f"Early stopping patience counter: {patience_counter}/{patience}")
        if patience_counter >= patience:
            print("Early stopping triggered. Stopping training.")
            break

Epoch [1/300], Train Loss: 2.4795, Val Loss: 2.2272
Epoch [2/300], Train Loss: 2.1817, Val Loss: 2.2019
Epoch [3/300], Train Loss: 2.0807, Val Loss: 2.1285
Epoch [4/300], Train Loss: 1.9402, Val Loss: 2.1280
Epoch [5/300], Train Loss: 1.9422, Val Loss: 1.9653
Epoch [6/300], Train Loss: 1.5258, Val Loss: 1.4806
Epoch [7/300], Train Loss: 0.9915, Val Loss: 1.0440
Epoch [8/300], Train Loss: 0.7189, Val Loss: 0.7332
Epoch [9/300], Train Loss: 0.5873, Val Loss: 0.6189
Epoch [10/300], Train Loss: 0.5084, Val Loss: 0.6000
Epoch [11/300], Train Loss: 0.4578, Val Loss: 0.5617
Epoch [12/300], Train Loss: 0.4115, Val Loss: 0.5760
Early stopping patience counter: 1/3
Epoch [13/300], Train Loss: 0.3733, Val Loss: 0.6169
Early stopping patience counter: 2/3
Epoch [14/300], Train Loss: 0.3361, Val Loss: 0.6414
Early stopping patience counter: 3/3
Early stopping triggered. Stopping training.


In [9]:
from sklearn.metrics import classification_report

model.eval()
all_preds, all_labels = [], []

with torch.no_grad():
    for texts, labels, lengths in test_loader:
        # Move tensors to GPU
        texts, labels, lengths = texts.to(device), labels.to(device), lengths.to(device)

        outputs = model(texts, lengths)
        _, preds = torch.max(outputs, 1)
        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

print(classification_report(all_labels, all_preds, target_names=label_encoder.classes_))


                   precision    recall  f1-score   support

           arrays       0.51      0.69      0.58      1823
              csv       0.75      0.79      0.77      1622
       dictionary       0.80      0.74      0.77      1822
           django       0.98      0.88      0.93     12045
            flask       0.84      0.92      0.87      1922
google-app-engine       0.89      0.91      0.90      1839
             json       0.73      0.82      0.77      1543
             list       0.82      0.70      0.75      3174
       matplotlib       0.84      0.89      0.86      2678
            mysql       0.76      0.85      0.80      1486
            numpy       0.60      0.65      0.62      3956
           pandas       0.88      0.79      0.83      4367
            regex       0.91      0.83      0.87      2795
           string       0.53      0.78      0.63      1775
          tkinter       0.96      0.87      0.91      2066

         accuracy                           0.82     4

In [16]:
import torch
from torch.nn import functional as F
import random

# Load the saved model
model = RNNClassifier(vocab_size, embed_size, hidden_size, num_classes)  # Adjust with your model definition
model.load_state_dict(torch.load("best_model.pth"))
model.eval()  # Set model to evaluation mode

# Tokenize and pad the input
def tokenize_and_prepare_input(text, tokenizer, vocab, max_length=100):
    tokens = tokenizer(text)[:max_length]  # Tokenize and truncate to max_length
    input_tensor = torch.tensor([vocab[token] if token in vocab else vocab['<unk>'] for token in tokens], dtype=torch.long)

    # Calculate the sequence length
    seq_length = len(input_tensor)

    # Pad the sequence to max_length
    if seq_length < max_length:
        padding = torch.zeros(max_length - seq_length, dtype=torch.long)
        input_tensor = torch.cat([input_tensor, padding])

    input_tensor = input_tensor.unsqueeze(0)  # Add batch dimension
    return input_tensor, torch.tensor([seq_length], dtype=torch.long)  # Return input tensor and sequence length

tag_to_text = data.groupby('Tag')['Text'].apply(list).to_dict()

# Take user input and categorize it
while True:
    user_input = input("Enter your question (or type 'exit' to quit): ")
    if user_input.lower() == "exit":
        break

    # Prepare the input without removing any text
    input_tensor, seq_length = tokenize_and_prepare_input(user_input, tokenizer, vocab)

    # Predict the tag
    with torch.no_grad():
        outputs = model(input_tensor, seq_length)  # Pass lengths to the model
        probabilities = F.softmax(outputs, dim=1)
        predicted_class = torch.argmax(probabilities, dim=1).item()

    if predicted_class in tag_to_text:
        random_dialog = random.choice(tag_to_text[predicted_class])
    else:
        random_dialog = "No examples available for this tag."

    # Map the predicted class to the tag name
    predicted_tag = label_encoder.inverse_transform([predicted_class])[0]

    print(f"Predicted Tag: {predicted_tag}")
    print(f"Dialog with predicted tag: {random_dialog}")


Predicted Tag: django
Dialog with predicted tag: pythonsocialauth shows authstateforbidden sometimes sometimes when i try to login or register with facebook or google it returns me an error authstateforbidden screen but just refreshing the page or trying again after a while it run correctly ive tried adding google api in google developers but is the same problem with facebook any idea thanks in advance


KeyboardInterrupt: Interrupted by user