### Import the required libraries and data
#### Our data is from the sklearn.datasets library

In [1]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer

### Download the data into the emails variable
1. Our focus is on telling apart emails from the baseball and hockey categories
2. We pass these as a list to the "categories" argument
3. Print the target_names to confirm these are the only categories downloaded

In [2]:
# Restrict the download to the two sports newsgroups we want to tell apart.
sport_categories = ["rec.sport.baseball", "rec.sport.hockey"]
emails = fetch_20newsgroups(categories=sport_categories)

# Confirm which categories were actually loaded.
print(emails.target_names)

['rec.sport.baseball', 'rec.sport.hockey']


### The data is stored in the emails.data attribute; let us view what a row looks like

In [3]:
# Show the raw text of the post stored at index 5.
sample_text = emails.data[5]
print(sample_text)

From: mmb@lamar.ColoState.EDU (Michael Burger)
Subject: More TV Info
Distribution: na
Nntp-Posting-Host: lamar.acns.colostate.edu
Organization: Colorado State University, Fort Collins, CO  80523
Lines: 36

United States Coverage:
Sunday April 18
  N.J./N.Y.I. at Pittsburgh - 1:00 EDT to Eastern Time Zone
  ABC - Gary Thorne and Bill Clement

  St. Louis at Chicago - 12:00 CDT and 11:00 MDT - to Central/Mountain Zones
  ABC - Mike Emerick and Jim Schoenfeld

  Los Angeles at Calgary - 12:00 PDT and 11:00 ADT - to Pacific/Alaskan Zones
  ABC - Al Michaels and John Davidson

Tuesday, April 20
  N.J./N.Y.I. at Pittsburgh - 7:30 EDT Nationwide
  ESPN - Gary Thorne and Bill Clement

Thursday, April 22 and Saturday April 24
  To Be Announced - 7:30 EDT Nationwide
  ESPN - To Be Announced


Canadian Coverage:

Sunday, April 18
  Buffalo at Boston - 7:30 EDT Nationwide
  TSN - ???

Tuesday, April 20
  N.J.D./N.Y. at Pittsburgh - 7:30 EDT Nationwide
  TSN - ???

Wednesday, April 21
  St. Louis a

### View the target for the email in index 5
1. **1**: a hockey email
2. **0**: a baseball email

In [4]:
# Numeric category label of the post at index 5.
emails.target[5]

1

### Split the data into training and test sets

1. **subset="train/test"**:  to load either the training or test dataset  
2. **shuffle=True**:     shuffle the order of the emails 
3. **random_state=108**: seed for the shuffle, so the order is the same on every run 

In [5]:
# Load the training split of the two sports newsgroups.
# shuffle is passed as the boolean True rather than the string "True": any
# non-empty string is truthy (even "False"), and scikit-learn versions that
# validate parameter types may reject a string here.
train_emails = fetch_20newsgroups(categories=["rec.sport.baseball", "rec.sport.hockey"],
                                 subset="train", shuffle=True, random_state=108)

In [6]:
# Load the test split of the two sports newsgroups.
# shuffle is passed as the boolean True rather than the string "True": any
# non-empty string is truthy (even "False"), and scikit-learn versions that
# validate parameter types may reject a string here.
test_emails = fetch_20newsgroups(categories=["rec.sport.baseball", "rec.sport.hockey"],
                                subset="test", shuffle=True, random_state=108)

### Create a variable that will count all the words in the emails

In [7]:
# Vectorizer with default settings; it is fitted and used in the cells below.
counter = CountVectorizer()

### Train counter with the possible words in the email: train_emails and test_emails

In [8]:
# Build the vocabulary from every post in both the test and training subsets.
# NOTE(review): the test posts contribute to the vocabulary here; fitting on
# train_emails.data alone would keep the test set fully unseen - confirm intent.
counter.fit(test_emails.data + train_emails.data)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 1), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

### Store the list of counts of words in the datasets into a new variable

In [9]:
# Convert each subset of posts into word counts using the fitted vocabulary.
train_counts, test_counts = (
    counter.transform(subset.data) for subset in (train_emails, test_emails)
)

### Create an object from the Naive Bayes Classifier

In [10]:
# Untrained multinomial Naive Bayes classifier with default settings.
classifier = MultinomialNB()

### Fit the data to the created object
1. **1st argument**: list of word counts from the training data
2. **2nd argument**: possible category from the training data

In [11]:
# Train on the training word counts and their category labels.
classifier.fit(train_counts, train_emails.target)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

### Test the classifier with the test data and view how accurate it is
Our model is **97.24%** accurate at telling hockey emails from baseball emails

In [12]:
# Score the trained classifier on the held-out test counts and labels.
accuracy = classifier.score(test_counts, test_emails.target)
print(accuracy)

0.9723618090452262


### Will a model with the category hardware and hockey be more accurate? Let us see
1. This model is noticeably more accurate than the previous one: it scores **99.75%**
2. This is expected: an email about hardware and an email about hockey share far fewer terms than two emails that are both about sports.

In [13]:
# Compare a hardware newsgroup against the hockey newsgroup.
hardware_vs_hockey = ["comp.sys.ibm.pc.hardware", "rec.sport.hockey"]

# Load the training and test splits for these two categories.
train_emails = fetch_20newsgroups(categories=hardware_vs_hockey, subset="train",
                                  shuffle=True, random_state=108)
test_emails = fetch_20newsgroups(categories=hardware_vs_hockey, subset="test",
                                 shuffle=True, random_state=108)

# Build the vocabulary from the posts in both splits.
counter = CountVectorizer()
all_posts = test_emails.data + train_emails.data
counter.fit(all_posts)

# Convert each split into word counts using that vocabulary.
train_counts = counter.transform(train_emails.data)
test_counts = counter.transform(test_emails.data)

# Train a Naive Bayes classifier on the training counts and labels.
classifier = MultinomialNB()
classifier.fit(train_counts, train_emails.target)

# Report the accuracy of the model on the test data.
print(classifier.score(test_counts, test_emails.target))


0.9974715549936789
