In [42]:
import pandas as pd
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

A toy example in text classification, to get some practice with sklearn text feature extractors.

We'll use the 20 Newsgroups dataset, restricted to just two categories so that we can set up a simple binary classification problem.

In [133]:
# Keep only messages from the cryptography and medicine boards.
cats = ('sci.crypt','sci.med')

# Parts of each post the loader is asked to strip out.
remove = ('headers','quotes')

# Fetch the train and test splits with identical filtering settings.
newsgroups_train, newsgroups_test = (
    fetch_20newsgroups(subset=subset, remove=remove, categories=cats)
    for subset in ('train', 'test')
)

In [134]:
# Keep the category names of the training split; the bare `classes`
# expression on the last line displays them as the cell output.
classes = newsgroups_train.target_names
classes

['sci.crypt', 'sci.med']

Size of the corpus

In [135]:
# Number of training documents in the two selected categories.
len(newsgroups_train.data)

1189

In [138]:
# TF-IDF weighted bag of words. Substituting CountVectorizer with the same
# arguments would give raw term counts instead.
vectorizer = TfidfVectorizer(
    stop_words='english',
    ngram_range=(1, 1),  # (1,1) means using only unigrams
)

The vectorizer produces CSR sparse matrices, so we should use a classifier that accepts them directly instead of converting back to dense matrices. `LogisticRegression` does.

In [139]:
# Fit the vectorizer on the training texts only; the test texts are
# transformed later but never used for fitting.
vectorizer.fit(newsgroups_train.data)

In [140]:
# Turn both splits into feature matrices using the already-fitted vectorizer.
X_train, X_test = (
    vectorizer.transform(split.data)
    for split in (newsgroups_train, newsgroups_test)
)

# Targets are the class ids that ship with each split.
y_train, y_test = newsgroups_train.target, newsgroups_test.target

In [141]:
# Logistic regression classifier with scikit-learn's default hyperparameters.
logreg = LogisticRegression()

In [142]:
# Train on the vectorized training texts and their class ids.
logreg.fit(X_train,y_train)

The vectorizer learns a vocabulary; we can use it to figure out which words most strongly indicate that a text is about medicine or cryptography.

In [145]:
# `vocabulary_` maps each term to its feature column; invert it so that a
# column index can be looked up as a word.
index_to_word = {column: term for term, column in vectorizer.vocabulary_.items()}
vocabulary = pd.Series(index_to_word, name="word").to_frame()
vocabulary.head(3)

Unnamed: 0,word
19528,simple
5403,choice
3123,american


In [147]:
# First row of the fitted coefficient matrix, keyed by feature column so it
# lines up with the index of `vocabulary`.
coefficients = dict(enumerate(logreg.coef_[0]))
importances = pd.Series(coefficients, name='importance').to_frame()
importances.head(3)

Unnamed: 0,importance
0,0.138534
1,0.021721
2,0.005976


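'msg', 'doctor' and 'food' are the words that most heavily indicate a text is from the medicine board, while cryptography is most strongly associated with 'key', 'government' and 'encryption'. (Positive coefficients push the prediction towards `sci.med`, negative ones towards `sci.crypt`.)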

In [148]:
# Pair every word with its coefficient (both frames are indexed by feature
# column), then rank from the most positive to the most negative weight.
word_importances = vocabulary.merge(importances, left_index=True, right_index=True)
word_importances.sort_values(by='importance', ascending=False)

Unnamed: 0,word,importance
14627,msg,1.831120
7771,doctor,1.551587
9555,food,1.485328
7607,disease,1.312825
13969,medical,1.304650
...,...,...
5387,chip,-2.049432
5614,clipper,-2.242852
8446,encryption,-2.399640
10241,government,-2.448625


In [149]:
# Predicted class ids for the held-out test documents.
y_pred = logreg.predict(X_test)

This task is obviously easy, as we can see from the very high metrics on the held-out test set: the macro-averaged F1 score reaches 0.92.

In [150]:
# Per-class precision, recall and F1 on the test split. print() is used so the
# report string renders with its line breaks.
# NOTE(review): rows 0 and 1 presumably follow the order of `classes`
# (sci.crypt, sci.med) — confirm against the dataset's target encoding.
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.98      0.86      0.92       396
           1       0.88      0.98      0.93       396

    accuracy                           0.92       792
   macro avg       0.93      0.92      0.92       792
weighted avg       0.93      0.92      0.92       792

