# News Headline Modeling

In [None]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

import nltk
from nltk.probability import FreqDist
from nltk.corpus import stopwords, wordnet
from nltk.tokenize import regexp_tokenize, word_tokenize, RegexpTokenizer
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, confusion_matrix
import string
import re
from sklearn.metrics import ConfusionMatrixDisplay

In [None]:
# Read the semicolon-separated labeled dataset and preview the first rows
corpus = pd.read_csv(
    'data/labeled_newscatcher_dataset.csv',
    sep=";",
)
corpus.head()

## Vectorization and Modeling

In [None]:
# Model input is the `title` column; the label to predict is the `topic` column
X, y = corpus["title"], corpus["topic"]

In [None]:
# Split before any vectorizer is fitted: 35% of the rows are held out for
# evaluation, with a fixed seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.35,
    random_state=549841,
)

### CountVectorizer

In [None]:
# Start with plain term counts. Case is preserved (lowercase=False) and no
# stop-word list is passed to the vectorizer here.
cv = CountVectorizer(lowercase=False)

# fit_transform is the first argument, so it runs before `cv.vocabulary_` is
# read for the column labels (arguments are evaluated left to right).
# Rows keep the index labels of the training split.
train_vec = pd.DataFrame.sparse.from_spmatrix(
    cv.fit_transform(X_train),
    index=y_train.index,
    columns=sorted(cv.vocabulary_),
)

TypeError: expected string or bytes-like object

In [None]:
# The vocabulary still seems to include some non-English words
train_vec

Unnamed: 0,00,000,000cr,000ft,000m,000mah,000x,001,004,004s,...,éire,éireann,équipe,óg,ørsted,česko,čeština,ōtaki,ōtāhuhu,žilina
62795,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
74103,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
13310,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
27804,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
58029,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
97729,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
29386,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
67501,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
94879,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
# Encode the held-out rows with the vocabulary already fitted on the training
# split (transform only, no refit); rows keep the test split's index labels.
test_vec = pd.DataFrame.sparse.from_spmatrix(
    cv.transform(X_test),
    index=y_test.index,
    columns=sorted(cv.vocabulary_),
)

In [None]:
# Multinomial Naive Bayes classifier fitted on the count features
mnb = MultinomialNB()
mnb.fit(X=train_vec, y=y_train)

MultinomialNB()

In [None]:
# Predict topics for the held-out count features and report accuracy
y_hat = mnb.predict(X=test_vec)
accuracy_score(y_true=y_test, y_pred=y_hat)

0.7919413726983794

### TF-IDF Vectorizer

In [None]:
# `sw` was not defined by any earlier cell, so this cell failed with a
# NameError on a fresh kernel. Build the stop-word list here from the NLTK
# corpus that the import cell already brings in.
# NOTE(review): presumably the English list was intended — confirm. The
# corpus must be available locally (nltk.download('stopwords')).
sw = stopwords.words('english')

# TF-IDF weights fitted on the training split only, with stop words removed
tfidf = TfidfVectorizer(stop_words=sw)

# fit_transform is the first argument, so it runs before `tfidf.vocabulary_`
# is read for the column labels. Rows keep the training split's index labels.
train_vec2 = pd.DataFrame.sparse.from_spmatrix(
    tfidf.fit_transform(X_train),
    index=y_train.index,
    columns=sorted(tfidf.vocabulary_),
)

AttributeError: 'list' object has no attribute 'lower'

In [None]:
# Apply the already-fitted TF-IDF vectorizer to the held-out rows (no refit);
# rows keep the test split's index labels.
test_vec2 = pd.DataFrame.sparse.from_spmatrix(
    tfidf.transform(X_test),
    index=y_test.index,
    columns=sorted(tfidf.vocabulary_),
)

In [None]:
# Multinomial Naive Bayes on the TF-IDF features; fit() returns the estimator
# itself, so construction and fitting can be chained.
mnb2 = MultinomialNB().fit(train_vec2, y_train)
y_hat2 = mnb2.predict(test_vec2)

In [None]:
# Accuracy of the TF-IDF model's predictions on the held-out labels
accuracy_score(y_true=y_test, y_pred=y_hat2)

0.7881326994300123

### Word2Vec

In [None]:
from gensim.models import Word2Vec
from gensim.test.utils import common_texts

# Word2Vec with 100-dimensional vectors, a context window of 5, and words
# occurring fewer than 2 times dropped from the vocabulary.
# NOTE(review): `common_texts` is imported from gensim.test.utils rather than
# taken from the dataset, so the vocabulary built here does not come from
# X_train — confirm this is intended.
w2v_model = Word2Vec(
    sentences=common_texts,
    min_count=2,
    window=5,
    vector_size=100,
)

In [None]:
# Word2Vec expects an iterable of token lists. Handing it the raw Series of
# title strings makes gensim walk each string character by character, and
# none of those "tokens" match the vocabulary the model was built with, so
# train() reports 0 effective words.
# Whitespace splitting keeps this cell free of extra tokenizer data; str()
# guards against any non-string entries.
headline_tokens = [str(title).split() for title in X_train]

# Rebuild the vocabulary from the headlines themselves. This also refreshes
# `corpus_count`, so `total_examples` below matches the data being trained on.
w2v_model.build_vocab(corpus_iterable=headline_tokens)

w2v_model.train(
    corpus_iterable=headline_tokens,
    total_examples=w2v_model.corpus_count,
    epochs=30,
)

(0, 152376750)

In [None]:
# A bare `w2v_model.` (trailing dot, no attribute) is a SyntaxError; show the
# model summary instead.
# TODO: replace with the specific attribute or method call that was intended.
w2v_model