## Importing neccessary modules

In [1]:
import pandas as pd
import re
from time import time
import json

from nltk.corpus import stopwords
from sklearn.preprocessing import Binarizer
from sklearn.preprocessing import MultiLabelBinarizer

from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score,recall_score,f1_score,accuracy_score,classification_report,confusion_matrix
import warnings
warnings.filterwarnings('ignore')

from sklearn.datasets import fetch_20newsgroups

## Task1 Loading data

In [2]:
data=fetch_20newsgroups(subset='train')
data.target_names

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

## Task2

In [3]:
train=fetch_20newsgroups(subset='train',categories=['alt.atheism','comp.graphics','talk.religion.misc','sci.space'])
xtrain,xtest,ytrain,ytest=train_test_split(train.data,train.target,test_size=0.2,random_state=42)

In [4]:
vectorizer=TfidfVectorizer(stop_words=stopwords.words('english'))
xtrain_vec=vectorizer.fit_transform(xtrain)
xtest_vec=vectorizer.transform(xtest)

In [5]:
model=LogisticRegression(max_iter=10)
classifier=OneVsRestClassifier(model)
classifier.fit(xtrain_vec,ytrain)
ypred=classifier.predict(xtest_vec)

## Task3

In [6]:
matrix=confusion_matrix(ypred,ytest)
report=classification_report(ytest, ypred)
table=pd.DataFrame({'Ypred':ypred,'Ytest':ytest})

In [7]:
print('Classification report')
print(report)

print('Confusion Matrix')

print()
print(matrix)


table[:30]

Classification report
              precision    recall  f1-score   support

           0       0.96      0.95      0.95        92
           1       0.89      0.97      0.93       121
           2       0.95      0.96      0.95       120
           3       0.95      0.82      0.88        74

    accuracy                           0.93       407
   macro avg       0.94      0.92      0.93       407
weighted avg       0.94      0.93      0.93       407

Confusion Matrix

[[ 87   0   0   4]
 [  1 117   5   8]
 [  1   4 115   1]
 [  3   0   0  61]]


Unnamed: 0,Ypred,Ytest
0,1,1
1,3,3
2,0,0
3,2,2
4,2,2
5,1,1
6,0,0
7,2,2
8,1,1
9,2,2


## Task4

In [8]:
xtrain,xtest,ytrain,ytest=train_test_split(data.data,data.target,test_size=0.2,random_state=42)

In [9]:
vectorizer=TfidfVectorizer(stop_words=stopwords.words('english'))
xtrain_vec=vectorizer.fit_transform(xtrain)
xtest_vec=vectorizer.transform(xtest)

In [10]:
model=LogisticRegression(max_iter=10)
classifier=OneVsRestClassifier(model)
classifier.fit(xtrain_vec,ytrain)
ypred=classifier.predict(xtest_vec)

In [11]:
matrix=confusion_matrix(ypred,ytest)
report=classification_report(ytest, ypred)
table=pd.DataFrame({'Ypred':ypred,'Ytest':ytest})

In [12]:
print('Classification report')
print(report)

print('Confusion Matrix')

print()
print(matrix)



table[:30]

Classification report
              precision    recall  f1-score   support

           0       0.91      0.75      0.82        97
           1       0.68      0.88      0.76       104
           2       0.81      0.81      0.81       115
           3       0.74      0.70      0.72       123
           4       0.92      0.77      0.84       126
           5       0.81      0.89      0.85       106
           6       0.74      0.86      0.80       109
           7       0.89      0.88      0.88       139
           8       0.95      0.93      0.94       122
           9       0.99      0.96      0.98       102
          10       0.94      0.94      0.94       108
          11       0.79      0.98      0.87       125
          12       0.72      0.76      0.74       114
          13       0.97      0.92      0.94       119
          14       0.96      0.87      0.91       127
          15       0.65      0.91      0.76       122
          16       0.93      0.90      0.92       121
     

Unnamed: 0,Ypred,Ytest
0,4,4
1,2,2
2,6,6
3,16,16
4,10,10
5,4,4
6,18,18
7,5,5
8,11,14
9,18,18


## Interpretation:
In comparing the two classification reports, it is clear that the second model outperformed the first one. The accuracy score of the second model is 85%, which is 2% higher than the accuracy score of the first model. Moreover, the second model has higher precision, recall, and F1-score for each class than the first model. This indicates that the second model is better at predicting each class and is more balanced overall.

The confusion matrix of the second model also shows fewer misclassifications in each class than the first model. This suggests that the second model is better at distinguishing between each class and has a better understanding of the features that distinguish them. Additionally, the macro and weighted averages for the second model are both higher than those of the first model, indicating that the second model is better at predicting across all classes and is more balanced in terms of class distribution.