# News Headline Modeling

In [1]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

import nltk
from nltk.probability import FreqDist
from nltk.corpus import stopwords, wordnet
from nltk.tokenize import regexp_tokenize, word_tokenize, RegexpTokenizer
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix
import string
import re
from sklearn.metrics import ConfusionMatrixDisplay

In [2]:
corpus = pd.read_csv('data/labeled_newscatcher_dataset.csv', sep=";")
corpus.head()

Unnamed: 0,topic,link,domain,published_date,title,lang
0,SCIENCE,https://www.eurekalert.org/pub_releases/2020-0...,eurekalert.org,2020-08-06 13:59:45,A closer look at water-splitting's solar fuel ...,en
1,SCIENCE,https://www.pulse.ng/news/world/an-irresistibl...,pulse.ng,2020-08-12 15:14:19,"An irresistible scent makes locusts swarm, stu...",en
2,SCIENCE,https://www.express.co.uk/news/science/1322607...,express.co.uk,2020-08-13 21:01:00,Artificial intelligence warning: AI will know ...,en
3,SCIENCE,https://www.ndtv.com/world-news/glaciers-could...,ndtv.com,2020-08-03 22:18:26,Glaciers Could Have Sculpted Mars Valleys: Study,en
4,SCIENCE,https://www.thesun.ie/tech/5742187/perseid-met...,thesun.ie,2020-08-12 19:54:36,Perseid meteor shower 2020: What time and how ...,en


## Vectorization and Modeling

In [3]:
X = corpus.title
y = corpus.topic

In [4]:
# performing a train-test split first
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=549841, test_size=0.35)

### CountVectorizer

In [5]:
sw = stopwords.words('english') + list(string.punctuation)

In [6]:
# starting off with a Count train and also removing stopwords from the list here just in case
cv = CountVectorizer(stop_words= sw)

count_train = cv.fit_transform(X_train)
count_train = pd.DataFrame.sparse.from_spmatrix(count_train)
count_train.columns = sorted(cv.vocabulary_)
count_train.set_index(y_train.index, inplace=True)

In [7]:
count_train

Unnamed: 0,00,000,000cr,000ft,000m,000mah,000x,001,004,004s,...,éire,éireann,équipe,óg,ørsted,česko,čeština,ōtaki,ōtāhuhu,žilina
62795,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
74103,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
13310,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
27804,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
58029,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
97729,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
29386,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
67501,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
94879,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [8]:
count_test = cv.transform(X_test)
count_test  = pd.DataFrame.sparse.from_spmatrix(count_test)
count_test.columns = sorted(cv.vocabulary_)
count_test.set_index(y_test.index, inplace=True)

In [9]:
# using bayes for this equation
count_mnb = MultinomialNB()

count_mnb.fit(count_train, y_train)
count_hat = count_mnb.predict(count_test)

print(f'This is the accuracy score: {accuracy_score(y_test, count_hat)}')
# print(f'This is the precision score: {precision_score(y_test, count_hat)}')
# print(f'This is the recall score: {recall_score(y_test, count_hat)}')

This is the accuracy score: 0.7919413726983794


In [24]:
count_mnb2 = MultinomialNB(alpha= 0.08)

count_mnb2.fit(count_train, y_train)
count_hat2 = count_mnb2.predict(count_test)

print(f'This is the accuracy score: {accuracy_score(y_test, count_hat2)}')
# print(f'This is the precision score: {precision_score(y_test, count_hat)}')
# print(f'This is the recall score: {recall_score(y_test, count_hat)}')

This is the accuracy score: 0.8021853904546767


### TF-IDF Vectorizer

In [26]:
tfidf = TfidfVectorizer(stop_words= sw)
tfidf_train = tfidf.fit_transform(X_train)
tfidf_train  = pd.DataFrame.sparse.from_spmatrix(tfidf_train)
tfidf_train.columns = sorted(tfidf.vocabulary_)
tfidf_train.set_index(y_train.index, inplace=True)

In [27]:
tfidf_train

Unnamed: 0,00,000,000cr,000ft,000m,000mah,000x,001,004,004s,...,éire,éireann,équipe,óg,ørsted,česko,čeština,ōtaki,ōtāhuhu,žilina
62795,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
74103,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
13310,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
27804,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
58029,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
97729,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
29386,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
67501,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
94879,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [28]:
tfidf_test = tfidf.transform(X_test)
tfidf_test  = pd.DataFrame.sparse.from_spmatrix(tfidf_test)
tfidf_test.columns = sorted(tfidf.vocabulary_)
tfidf_test.set_index(y_test.index, inplace=True)

In [29]:
tfidf_mnb = MultinomialNB()

tfidf_mnb.fit(tfidf_train, y_train)
tfidf_hat = tfidf_mnb.predict(tfidf_test)

In [30]:
print(f'This is the accuracy score: {accuracy_score(y_test, tfidf_hat)}')
# print(f'This is the precision score: {precision_score(y_test, tfidf_hat)}')
# print(f'This is the recall score: {recall_score(y_test, tfidf_hat)}')

This is the accuracy score: 0.7881064327178167


In [35]:
tfidf_mnb2 = MultinomialNB(alpha= 0.2)

tfidf_mnb2.fit(tfidf_train, y_train)
tfidf_hat2 = tfidf_mnb2.predict(tfidf_test)
print(f'This is the accuracy score: {accuracy_score(y_test, tfidf_hat2)}')
# print(f'This is the precision score: {precision_score(y_test, tfidf_hat)}')
# print(f'This is the recall score: {recall_score(y_test, tfidf_hat)}')

This is the accuracy score: 0.8033673925034803


### Word2Vec

In [38]:
from gensim.models import Word2Vec


w2v_model = Word2Vec(sentences= X_train, size=100, window=5, min_count=2)

In [39]:
w2v_model.train(X_train, epochs=30, total_examples= w2v_model.corpus_count)

(38998891, 152376870)