In [30]:
from sklearn.datasets import fetch_20newsgroups

categories = ['rec.autos',  'talk.politics.guns','comp.os.ms-windows.misc']

twenty_test = fetch_20newsgroups(subset='test', categories=categories, shuffle=True) 

# print(f"Number of documents in test set: {len(twenty_test.data)}")
# print(f"Target names: {twenty_test.target_names}")
# print(f"First document:\n{twenty_test.data[0]}")
# print(f"First document target: {twenty_test.target[0]}")



In [31]:
data = fetch_20newsgroups(subset='train')

print(data.target_names)

['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']


In [32]:
from torch import nn

class linearClassifire(nn.Module):
    def __init__(self):
        super().__init__()
        self.lin = nn.Linear(32, 3)

    def forward(self, x):
        return self.lin(x)

In [33]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

# Function to load and transform data

def loadAndTransformData(tfidf=True, featureIns = 32):
    categories = ['comp.graphics', 'sci.med']

    twenty_train = fetch_20newsgroups(subset='train', categories=categories, shuffle=True)
    twenty_val = fetch_20newsgroups(subset='test', categories=categories, shuffle=True)


    # Vectorization 
    if tfidf:
        vectorizer = TfidfVectorizer(max_features=featureIns)
    else:
        vectorizer = CountVectorizer(max_features=featureIns)

    # Transform the data
    X_train = vectorizer.fit_transform(twenty_train.data).toarray()
    
    y_train = twenty_train.target
    
    X_test = vectorizer.transform(twenty_val.data).toarray()
    y_test = twenty_val.target

    return X_train, y_train, X_test, y_test

In [34]:
from sklearn.metrics import confusion_matrix, f1_score, accuracy_score
import torch



X_train, y_train, X_test, y_test = loadAndTransformData(False)

# Convert to PyTorch tensors
trainIns = torch.FloatTensor(X_train)
testIns = torch.FloatTensor(X_test)
trainTgt = torch.LongTensor(y_train)
testTgt = torch.LongTensor(y_test)

print(f"Train Input Shape: {trainIns.shape}")
print(f"Test Input Shape: {testIns.shape}")
print(f"Train Target Shape: {trainTgt.shape}")
print(f"Test Target Shape: {testTgt.shape}")

Train Input Shape: torch.Size([1178, 32])
Test Input Shape: torch.Size([785, 32])
Train Target Shape: torch.Size([1178])
Test Target Shape: torch.Size([785])


In [35]:
# Split test set into evaluation and final test sets
evalTgt = testTgt[:400]
evalIns = testIns[:400]
print(f"Eval Input Shape: {evalIns.shape}")
print(f"Eval Target Shape: {evalTgt.shape}")
testTgt = testTgt[400:]
testIns = testIns[400:]
print(f"Test Input Shape: {testIns.shape}")
print(f"Test Target Shape: {testTgt.shape}")

Eval Input Shape: torch.Size([400, 32])
Eval Target Shape: torch.Size([400])
Test Input Shape: torch.Size([385, 32])
Test Target Shape: torch.Size([385])


In [44]:
model = linearClassifire()
lossFct = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.001) # Stochastic Gradient Descent

In [45]:
#training loop and evaluation code would go here

for epoch in range(500):
    predictions = model(trainIns)
    loss = lossFct(predictions, trainTgt)
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()

    # Evaluation on eval set every 50 epochs
    if epoch % 50 == 0:
        
        with torch.no_grad():
            predEval = model(evalIns)
            evalLoss = lossFct(predEval, evalTgt)

In [46]:
#test evaluation

with torch.no_grad():
    testPreds = model(testIns)
    testPreds = torch.topk(testPreds, 1).indices.view(-1) # get predicted class indices
    print("Confusion Matrix:")
    print(confusion_matrix(testTgt, testPreds)) # get confusion matrix
    print(f"F1 Score: {f1_score(testTgt, testPreds)}") # get F1 score
    print(f"Accuracy: {accuracy_score(testTgt, testPreds)}") # get accuracy
    

Confusion Matrix:
[[111  66]
 [ 95 113]]
F1 Score: 0.5839793281653747
Accuracy: 0.5818181818181818


In [47]:
preds = model(trainIns[0])
print(f"Sample Prediction: {preds}")
smax = nn.Softmax(dim=0) # apply softmax to get class probabilities
probs = smax(preds) # get class probabilities
print(f"Class Probabilities: {probs}")
print(f"Predicted Class: {torch.argmax(probs)}")


Sample Prediction: tensor([ 1.5600,  1.8724, -3.1992], grad_fn=<ViewBackward0>)
Class Probabilities: tensor([0.4210, 0.5754, 0.0036], grad_fn=<SoftmaxBackward0>)
Predicted Class: 1
