# Capstone

In [31]:
# importing all my dependancies
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

import nltk
from nltk.probability import FreqDist
from nltk.corpus import stopwords, wordnet
from nltk.tokenize import regexp_tokenize, word_tokenize, RegexpTokenizer
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, confusion_matrix
import string
import re
from sklearn.metrics import ConfusionMatrixDisplay

In [2]:
corpus = pd.read_csv('data/labeled_newscatcher_dataset.csv', sep=";")
corpus.shape

(108774, 6)

In [3]:
corpus.head()

Unnamed: 0,topic,link,domain,published_date,title,lang
0,SCIENCE,https://www.eurekalert.org/pub_releases/2020-0...,eurekalert.org,2020-08-06 13:59:45,A closer look at water-splitting's solar fuel ...,en
1,SCIENCE,https://www.pulse.ng/news/world/an-irresistibl...,pulse.ng,2020-08-12 15:14:19,"An irresistible scent makes locusts swarm, stu...",en
2,SCIENCE,https://www.express.co.uk/news/science/1322607...,express.co.uk,2020-08-13 21:01:00,Artificial intelligence warning: AI will know ...,en
3,SCIENCE,https://www.ndtv.com/world-news/glaciers-could...,ndtv.com,2020-08-03 22:18:26,Glaciers Could Have Sculpted Mars Valleys: Study,en
4,SCIENCE,https://www.thesun.ie/tech/5742187/perseid-met...,thesun.ie,2020-08-12 19:54:36,Perseid meteor shower 2020: What time and how ...,en


In [4]:
# checking for nulls
corpus.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 108774 entries, 0 to 108773
Data columns (total 6 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   topic           108774 non-null  object
 1   link            108774 non-null  object
 2   domain          108774 non-null  object
 3   published_date  108774 non-null  object
 4   title           108774 non-null  object
 5   lang            108774 non-null  object
dtypes: object(6)
memory usage: 5.0+ MB


In [5]:
# As we can see here, the Science topic has only a third of the news stories compared to the other topics
# The class imbalance could cause an issue later 
corpus['topic'].value_counts()

TECHNOLOGY       15000
HEALTH           15000
WORLD            15000
ENTERTAINMENT    15000
SPORTS           15000
BUSINESS         15000
NATION           15000
SCIENCE           3774
Name: topic, dtype: int64

In [6]:
# checking the language of all the data (although not always accurate)
corpus['lang'].value_counts()

en    108774
Name: lang, dtype: int64

In [7]:
# checking out the titles
corpus.loc[100].title

'Insect can escape after being eaten by frog, scientists find'

## Preparing Train and Test Sets

In [8]:
# splitting my data and using the only 2 columns that I believe are relevant
# to the business question
X = corpus.title
y = corpus.topic

In [9]:
# creating my stopword list and a word net for my lemmatizer
sw = stopwords.words('english')

def get_wordnet_pos(words):
    if words.startswith('J'):
        return wordnet.ADJ
    elif words.startswith('V'):
        return wordnet.VERB
    elif words.startswith('N'):
        return wordnet.NOUN
    elif words.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

In [10]:
# cleaning my test data
X = [word.lower() for word in X]
X = [word for word in X if word not in sw]
X = pos_tag(X)  
X = [(word[0], get_wordnet_pos(word[1])) for word in X]
lemmatizer = WordNetLemmatizer() 
X = [lemmatizer.lemmatize(word[0], word[1]) for word in X]

In [13]:
# performing a train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=549841, test_size=0.35)

### CountVectorizer

In [38]:
# starting off with a Count Vec and also removing stopwords from the list here just in case
cv = CountVectorizer(stop_words= sw)

train_vec = cv.fit_transform(X_train)
train_vec = pd.DataFrame.sparse.from_spmatrix(train_vec)
train_vec.columns = sorted(cv.vocabulary_)
train_vec.set_index(y_train.index, inplace=True)

In [39]:
# there seems to be some non-english words still included in the list
train_vec

Unnamed: 0,00,000,000cr,000ft,000m,000mah,000x,001,004,004s,...,éire,éireann,équipe,óg,ørsted,česko,čeština,ōtaki,ōtāhuhu,žilina
62795,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
74103,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
13310,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
27804,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
58029,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
97729,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
29386,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
67501,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
94879,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [26]:
test_vec = cv.transform(X_test)
test_vec  = pd.DataFrame.sparse.from_spmatrix(test_vec)
test_vec.columns = sorted(cv.vocabulary_)
test_vec.set_index(y_test.index, inplace=True)

In [27]:
# using bayes for this equation
mnb = MultinomialNB()

mnb.fit(train_vec, y_train)

MultinomialNB()

In [28]:
y_hat = mnb.predict(test_vec)
accuracy_score(y_test, y_hat)

0.7923091066691182

### TF-IDF Vectorizer

In [33]:
tfidf = TfidfVectorizer()
train_vec2 = tfidf.fit_transform(X_train)
train_vec2  = pd.DataFrame.sparse.from_spmatrix(train_vec2)
train_vec2.columns = sorted(tfidf.vocabulary_)
train_vec2.set_index(y_train.index, inplace=True)

In [34]:
test_vec2 = tfidf.transform(X_test)
test_vec2  = pd.DataFrame.sparse.from_spmatrix(test_vec2)
test_vec2.columns = sorted(tfidf.vocabulary_)
test_vec2.set_index(y_test.index, inplace=True)

In [35]:
mnb2 = MultinomialNB()

mnb2.fit(train_vec2, y_train)
y_hat2 = mnb2.predict(test_vec2)

In [36]:
accuracy_score(y_test, y_hat2)

0.7821438890494077