## NewsGroup Classification
1) Importing Libraries
2) Importing and Trasnforming Data
3) Experimenting ML Pipelines
4) E2E ML Pipeline

### Importing Libraries

In [217]:
import pandas as pd 
import numpy as np
from collections import Counter

from sklearn.datasets import fetch_20newsgroups
from sklearn.datasets import fetch_20newsgroups_vectorized
from sklearn.model_selection import train_test_split 
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from sklearn.linear_model import SGDClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report

### Importing and Trasnforming Data

In [82]:
newsgroups = fetch_20newsgroups(subset='train')

In [85]:
Counter([newsgroups.target_names[x] for x in newsgroups.target])

Counter({'rec.autos': 594,
         'comp.sys.mac.hardware': 578,
         'comp.graphics': 584,
         'sci.space': 593,
         'talk.politics.guns': 546,
         'sci.med': 594,
         'comp.sys.ibm.pc.hardware': 590,
         'comp.os.ms-windows.misc': 591,
         'rec.motorcycles': 598,
         'talk.religion.misc': 377,
         'misc.forsale': 585,
         'alt.atheism': 480,
         'sci.electronics': 591,
         'comp.windows.x': 593,
         'rec.sport.hockey': 600,
         'rec.sport.baseball': 597,
         'soc.religion.christian': 599,
         'talk.politics.mideast': 564,
         'talk.politics.misc': 465,
         'sci.crypt': 595})

In [88]:
Counter([newsgroups.target_names[x].split('.')[0] for x in newsgroups.target])

Counter({'rec': 2389,
         'comp': 2936,
         'sci': 2373,
         'talk': 1952,
         'misc': 585,
         'alt': 480,
         'soc': 599})

In [89]:
X_train, X_test, y_train, y_test = train_test_split(newsgroups.data, newsgroups.target, test_size=0.4, random_state=42)

In [92]:
le = LabelEncoder()
y_train_tr = le.fit_transform(y_train)
y_test_tr = le.transform(y_test)

In [119]:
tfidf = TfidfVectorizer(stop_words=list(ENGLISH_STOP_WORDS), max_features=100)
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

In [120]:
countvec = CountVectorizer(stop_words=list(ENGLISH_STOP_WORDS), max_features=100)
X_train_countvec = countvec.fit_transform(X_train)
X_test_countvec = countvec.transform(X_test)

In [123]:
X_train_tfidf.shape, X_test_tfidf.shape

((6788, 100), (4526, 100))

In [122]:
X_train_countvec.shape, X_test_countvec.shape

((6788, 100), (4526, 100))

### Experimenting ML Pipelines

In [134]:
algos = {'nb': BernoulliNB(), 'mlp': MLPClassifier(), 'knn': KNeighborsClassifier(), 'sgd': SGDClassifier()}
for algo, clf in algos.items():
    print(algo)
    clf.fit(X_train_tfidf, y_train_tr)
    print('Train score: ', clf.score(X_train_tfidf, y_train_tr))
    print('Test score: ', clf.score(X_test_tfidf, y_test_tr))
    print('\n\n')

nb
Train score:  0.37315851502651737
Test score:  0.3393725143614671



mlp




Train score:  0.5847083087802003
Test score:  0.40300486080424214



knn
Train score:  0.5870654095462581
Test score:  0.3623508616880248



sgd
Train score:  0.3601944608131998
Test score:  0.32456915598762703





In [135]:
algos = {'nb': BernoulliNB(), 'mlp': MLPClassifier(), 'knn': KNeighborsClassifier(), 'sgd': SGDClassifier()}
for algo, clf in algos.items():
    print(algo)
    clf.fit(X_train_countvec, y_train_tr)
    print('Train score: ', clf.score(X_train_countvec, y_train_tr))
    print('Test score: ', clf.score(X_test_countvec, y_test_tr))
    print('\n\n')

nb
Train score:  0.37315851502651737
Test score:  0.3393725143614671



mlp




Train score:  0.7953741897466117
Test score:  0.3654441007512152



knn
Train score:  0.5302003535651149
Test score:  0.2761820592134335



sgd
Train score:  0.2869770182675309
Test score:  0.2534246575342466





### E2E ML Pipeline

In [219]:
newsgroups = fetch_20newsgroups(subset='train')
X_train, X_test, y_train, y_test = train_test_split(newsgroups.data, newsgroups.target, test_size=0.4, random_state=42)

le = LabelEncoder()
y_train_tr = le.fit_transform(y_train)
y_test_tr = le.transform(y_test)

countvec = CountVectorizer(stop_words=list(ENGLISH_STOP_WORDS), max_features=250)
X_train_countvec = countvec.fit_transform(X_train)
X_test_countvec = countvec.transform(X_test)

nb = BernoulliNB()
nb.fit(X_train_countvec, y_train_tr)

In [220]:
y_test_hat = nb.predict(X_test_countvec)

In [222]:
print(classification_report(y_test_tr, y_test_hat))

              precision    recall  f1-score   support

           0       0.46      0.42      0.44       180
           1       0.48      0.44      0.46       220
           2       0.66      0.58      0.62       230
           3       0.53      0.30      0.38       241
           4       0.22      0.24      0.23       231
           5       0.60      0.53      0.56       233
           6       0.24      0.77      0.37       224
           7       0.55      0.40      0.46       252
           8       0.31      0.36      0.33       259
           9       0.41      0.48      0.44       223
          10       0.66      0.52      0.58       216
          11       0.72      0.65      0.68       259
          12       0.45      0.36      0.40       247
          13       0.41      0.38      0.39       231
          14       0.67      0.55      0.60       238
          15       0.65      0.73      0.69       241
          16       0.61      0.51      0.56       238
          17       0.72    

In [255]:
feature_names = countvec.get_feature_names_out()
df_prob = pd.DataFrame(np.exp(nb.feature_log_prob_), index=newsgroups.target_names, columns=feature_names)

In [270]:
for i, row in df_prob.iterrows():
    print(i)
    print(row.sort_values(ascending=False).head(10))
    print('\n\n')

alt.atheism
subject         0.996689
lines           0.996689
organization    0.937086
edu             0.847682
writes          0.821192
article         0.625828
posting         0.509934
host            0.470199
nntp            0.460265
com             0.453642
Name: alt.atheism, dtype: float64



comp.graphics
lines           0.997268
subject         0.997268
organization    0.959016
edu             0.560109
posting         0.494536
host            0.464481
nntp            0.453552
university      0.442623
graphics        0.346995
writes          0.327869
Name: comp.graphics, dtype: float64



comp.os.ms-windows.misc
subject         0.997245
lines           0.994490
organization    0.975207
windows         0.669421
edu             0.652893
university      0.471074
posting         0.410468
host            0.402204
nntp            0.396694
writes          0.366391
Name: comp.os.ms-windows.misc, dtype: float64



comp.sys.ibm.pc.hardware
subject         0.997151
lines           0.991453


In [302]:
newsgroups = fetch_20newsgroups(subset='train')
X_train, X_test, y_train, y_test = train_test_split(newsgroups.data, newsgroups.target, test_size=0.4, random_state=42)

le = LabelEncoder()
y_train_tr = le.fit_transform(y_train)
y_test_tr = le.transform(y_test)

stop_words = list(ENGLISH_STOP_WORDS) + ['subject', 'lines', 'organization', 'edu', 'com', 'writes', 'article', 'posting', 'nntp', 'like', 'host', 'university']
countvec = CountVectorizer(stop_words=stop_words, max_features=250)
X_train_countvec = countvec.fit_transform(X_train)
X_test_countvec = countvec.transform(X_test)

nb = BernoulliNB()
nb.fit(X_train_countvec, y_train_tr)
y_test_hat = nb.predict(X_test_countvec)

In [305]:
print(classification_report(y_test_tr, y_test_hat))

              precision    recall  f1-score   support

           0       0.43      0.42      0.43       180
           1       0.50      0.44      0.47       220
           2       0.70      0.64      0.67       230
           3       0.55      0.37      0.45       241
           4       0.50      0.42      0.45       231
           5       0.64      0.59      0.61       233
           6       0.23      0.76      0.35       224
           7       0.56      0.43      0.49       252
           8       0.30      0.36      0.33       259
           9       0.43      0.48      0.45       223
          10       0.66      0.55      0.60       216
          11       0.72      0.63      0.67       259
          12       0.43      0.36      0.39       247
          13       0.41      0.39      0.40       231
          14       0.68      0.55      0.61       238
          15       0.67      0.59      0.63       241
          16       0.62      0.52      0.56       238
          17       0.71    

In [306]:
feature_names = countvec.get_feature_names_out()
df_prob = pd.DataFrame(np.exp(nb.feature_log_prob_), index=newsgroups.target_names, columns=feature_names)

In [308]:
term_dict = {}
for i, row in df_prob.iterrows():
    top10_terms = row.sort_values(ascending=False).head(10)
    print(i)
    print(top10_terms)
    print('\n\n')
    for t in top10_terms.index:
        if t in term_dict.keys():
            term_dict[t]+=1
        else:
            term_dict[t] = 1

alt.atheism
don       0.440397
people    0.417219
god       0.377483
think     0.377483
say       0.364238
just      0.364238
know      0.307947
does      0.304636
said      0.274834
world     0.254967
Name: alt.atheism, dtype: float64



comp.graphics
graphics    0.346995
know        0.319672
thanks      0.316940
does        0.270492
help        0.234973
use         0.232240
need        0.229508
just        0.226776
don         0.210383
program     0.202186
Name: comp.graphics, dtype: float64



comp.os.ms-windows.misc
windows         0.669421
use             0.314050
know            0.297521
using           0.294766
thanks          0.292011
does            0.269972
file            0.256198
reply           0.242424
just            0.242424
distribution    0.234160
Name: comp.os.ms-windows.misc, dtype: float64



comp.sys.ibm.pc.hardware
thanks          0.324786
just            0.310541
does            0.304843
know            0.304843
use             0.296296
drive           0.273504


In [309]:
term_dict

{'don': 17,
 'people': 7,
 'god': 3,
 'think': 12,
 'say': 3,
 'just': 18,
 'know': 17,
 'does': 10,
 'said': 1,
 'world': 4,
 'graphics': 1,
 'thanks': 6,
 'help': 2,
 'use': 7,
 'need': 2,
 'program': 1,
 'windows': 1,
 'using': 2,
 'file': 1,
 'reply': 4,
 'distribution': 10,
 'drive': 1,
 'computer': 2,
 'card': 1,
 'mac': 1,
 'apple': 1,
 'new': 6,
 'window': 1,
 'problem': 1,
 'mit': 1,
 'mail': 1,
 'usa': 3,
 '10': 1,
 'email': 1,
 'used': 2,
 '25': 1,
 'car': 1,
 'good': 8,
 'time': 7,
 'ca': 2,
 've': 1,
 'year': 2,
 'game': 2,
 'team': 2,
 'games': 1,
 'play': 1,
 'key': 1,
 'chip': 1,
 'encryption': 1,
 'government': 3,
 'public': 1,
 'work': 1,
 'cs': 1,
 'space': 1,
 'nasa': 1,
 'long': 1,
 '1993': 1,
 'gun': 1,
 'israel': 1,
 'years': 1,
 'state': 1,
 'make': 1,
 'news': 1,
 'christian': 1}