<a href="https://colab.research.google.com/github/mathewsrc/Natural-Language-Processing-in-Python/blob/master/classifying_fake_news_using_supervised_learning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!pip install polars

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [3]:
import polars as pl
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.datasets import fetch_20newsgroups

In [33]:
# Fetch every post (train + test subsets) from the two newsgroups of interest
bunch = fetch_20newsgroups(subset='all', categories=['sci.space', 'comp.graphics'])

# Names of the categories that were loaded
categories = bunch.target_names
print(categories)

# Pair each raw post ('data') with its integer class label ('target')
data_dict = dict(data=bunch.data, target=bunch.target)

# Build a polars DataFrame from the dictionary and keep only its first 1000 rows
df = pl.from_dict(data_dict).head(1000)

# Preview the first 5 rows of the resulting DataFrame
print(df.head(5))

['comp.graphics', 'sci.space']
shape: (5, 2)
┌───────────────────────────────────┬────────┐
│ data                              ┆ target │
│ ---                               ┆ ---    │
│ str                               ┆ i64    │
╞═══════════════════════════════════╪════════╡
│ From: henry@zoo.toronto.edu (Hen… ┆ 1      │
│ From: leech@cs.unc.edu (Jon Leec… ┆ 1      │
│ From: jscotti@lpl.arizona.edu (J… ┆ 1      │
│ From: dchien@hougen.seas.ucla.ed… ┆ 1      │
│ From: robert@slipknot.rain.com (… ┆ 0      │
└───────────────────────────────────┴────────┘


## Split data into train and test

In [34]:
# Labels as a 1-D NumPy array (one integer class id per row). Pulling the
# column out as a Series first avoids the (n, 1) column-vector shape that
# `df.select('target').to_numpy()` produces; estimators and metrics expect
# a flat target vector.
y = df['target'].to_numpy()

# Hold out 33% of the rows as a test set. The fixed random_state makes the
# shuffle, and therefore the split, reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    df['data'], y, test_size=0.33, random_state=42
)

## Vectorizing data with CountVectorizer

In [35]:
# Bag-of-words vectorizer that ignores English stop words: count_vectorizer
count_vectorizer = CountVectorizer(
    stop_words='english',
)

# Learn the vocabulary from the training documents and encode them: count_train
count_train = count_vectorizer.fit_transform(X_train)

# Encode the test documents with the already-fitted vectorizer: count_test
count_test = count_vectorizer.transform(X_test)

# Show the first 10 feature names of the fitted vectorizer
count_feature_names = count_vectorizer.get_feature_names_out()
print(count_feature_names[:10])

['00' '000' '0000' '00000' '000005102000' '000021' '000050' '00041032'
 '0004136' '00043819']


## Vectorizing data with TfidfVectorizer

In [36]:
# Import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF vectorizer that ignores English stop words and is configured with
# max_df=0.7 (an upper document-frequency cutoff): tfidf_vectorizer
tfidf_vectorizer = TfidfVectorizer(stop_words="english", max_df=0.7)

# Fit on the training documents and encode them: tfidf_train
tfidf_train = tfidf_vectorizer.fit_transform(X_train)

# Encode the test documents with the already-fitted vectorizer: tfidf_test
tfidf_test = tfidf_vectorizer.transform(X_test)

# Print the first 10 features
print(tfidf_vectorizer.get_feature_names_out()[:10])

# Densify only the first 5 rows for display. `.toarray()` is the documented
# method; the `.A` shorthand used previously is deprecated in recent SciPy
# releases.
print(tfidf_train[:5].toarray())

['00' '000' '0000' '00000' '000005102000' '000021' '000050' '00041032'
 '0004136' '00043819']
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


## Training and testing a classification model

### Multinomial Naive Bayes (MNB) 


Multinomial Naive Bayes (MNB) is a popular algorithm for text classification tasks. It is a probabilistic model that uses Bayes' theorem to calculate the probability of a document belonging to a certain class based on the words in it.

In [37]:
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics

In [43]:
# Build a Multinomial Naive Bayes classifier and train it on the
# CountVectorizer features; fit() returns the estimator itself, so
# construction and training are chained: nb_classifier
nb_classifier = MultinomialNB().fit(count_train, y_train.ravel())

# Predict labels for the held-out test documents: pred
pred = nb_classifier.predict(count_test)

# Accuracy of the predictions against the true test labels: acc
acc = metrics.accuracy_score(y_test, pred)
print(acc)

# Confusion matrix of true test labels versus predictions: cm
cm = metrics.confusion_matrix(y_test, pred)
print(cm)

0.9787878787878788
[[161   2]
 [  5 162]]
