# News Headline Modeling

In [None]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

import nltk
from nltk.corpus import stopwords, wordnet
from imblearn.pipeline import Pipeline as ImPipeline
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.inspection import permutation_importance
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, plot_confusion_matrix 
import string
import re
from sklearn.metrics import ConfusionMatrixDisplay

In [None]:
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Colab-only step: attach Google Drive so the dataset can be read from it.
from google.colab import drive

drive.mount(mountpoint='/content/drive')

Mounted at /content/drive


In [None]:
# Load the labeled headlines from the mounted Drive; the file is
# semicolon-delimited. When running outside Colab, point this at
# 'data/labeled_newscatcher_dataset.csv' instead.
corpus = pd.read_csv(
    '/content/drive/MyDrive/labeled_newscatcher_dataset.csv',
    sep=";",
)
corpus.head()

Unnamed: 0,topic,link,domain,published_date,title,lang
0,SCIENCE,https://www.eurekalert.org/pub_releases/2020-0...,eurekalert.org,2020-08-06 13:59:45,A closer look at water-splitting's solar fuel ...,en
1,SCIENCE,https://www.pulse.ng/news/world/an-irresistibl...,pulse.ng,2020-08-12 15:14:19,"An irresistible scent makes locusts swarm, stu...",en
2,SCIENCE,https://www.express.co.uk/news/science/1322607...,express.co.uk,2020-08-13 21:01:00,Artificial intelligence warning: AI will know ...,en
3,SCIENCE,https://www.ndtv.com/world-news/glaciers-could...,ndtv.com,2020-08-03 22:18:26,Glaciers Could Have Sculpted Mars Valleys: Study,en
4,SCIENCE,https://www.thesun.ie/tech/5742187/perseid-met...,thesun.ie,2020-08-12 19:54:36,Perseid meteor shower 2020: What time and how ...,en


## Vectorization and Modeling

In [None]:
# Features are the raw headline strings; the target is the topic label.
X, y = corpus['title'], corpus['topic']

In [None]:
# Split before any vectorizer is fitted so the held-out 35% never influences
# the vocabulary; the seed keeps the partition repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.35,
    random_state=549841,
)

In [None]:
# Fetch the NLTK stop-word corpus used by the vectorizers below.
nltk.download(info_or_id='stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
# English stop words, passed to every vectorizer in this notebook.
sw = stopwords.words(fileids='english')

### CountVectorizer

In [None]:
# Baseline model: bag-of-words counts fed to multinomial naive Bayes, tuning
# only the smoothing strength with 5-fold cross-validation.
count_mnb = Pipeline(steps=[
    ("vectorize", CountVectorizer(stop_words=sw)),
    ("mnb", MultinomialNB()),
])

# alpha must be strictly positive. With alpha=0 a word unseen for a class gets
# zero probability, so scikit-learn either clips the value (emitting a warning)
# or ends up taking log(0), depending on the version. 0.01 stands in for the
# "almost no smoothing" end of the grid; the other points are unchanged.
parameters = {'mnb__alpha': [0.01, 0.25, 0.5, 0.75, 1.0]}

count_gs = GridSearchCV(
    estimator=count_mnb,
    param_grid=parameters,
    cv=5,
    error_score='raise',  # surface a failing fit instead of scoring it as NaN
)

count_gs.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise',
             estimator=Pipeline(steps=[('vectorize',
                                        CountVectorizer(stop_words=['i', 'me',
                                                                    'my',
                                                                    'myself',
                                                                    'we', 'our',
                                                                    'ours',
                                                                    'ourselves',
                                                                    'you',
                                                                    "you're",
                                                                    "you've",
                                                                    "you'll",
                                                                    "you'd",
                                                    

In [None]:
# Accuracy of the refit best estimator on each partition, then the winning grid point.
train_accuracy = count_gs.score(X_train, y_train)
print(f'This is the training score: {train_accuracy}')
test_accuracy = count_gs.score(X_test, y_test)
print(f'This is the testing score: {test_accuracy}')
print(f'Best parameters: {count_gs.best_params_}')

This is the training score: 0.8780108340523033
This is the testing score: 0.8008720548448951
Best parameters: {'mnb__alpha': 0.25}


In [None]:
# Random forest on raw token counts. This first search only compares the two
# class-weighting schemes; every other forest setting stays at its default.
count_forest = Pipeline(steps=[
    ("vectorize", CountVectorizer(stop_words=sw)),
    # Fixed seed: without it the bootstrap samples and feature draws differ on
    # every run, so the scores and the selected parameters are not repeatable.
    ("rf", RandomForestClassifier(random_state=549841)),
])

parameters = {'rf__class_weight': ['balanced', 'balanced_subsample']}

count_rf = GridSearchCV(
    estimator=count_forest,
    param_grid=parameters,
    cv=5,
    error_score='raise',  # surface a failing fit instead of scoring it as NaN
)

count_rf.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise',
             estimator=Pipeline(steps=[('vectorize',
                                        CountVectorizer(stop_words=['i', 'me',
                                                                    'my',
                                                                    'myself',
                                                                    'we', 'our',
                                                                    'ours',
                                                                    'ourselves',
                                                                    'you',
                                                                    "you're",
                                                                    "you've",
                                                                    "you'll",
                                                                    "you'd",
                                                    

In [None]:
# Accuracy of the refit best estimator on each partition, then the winning grid point.
train_accuracy = count_rf.score(X_train, y_train)
print(f'This is the training score: {train_accuracy}')
test_accuracy = count_rf.score(X_test, y_test)
print(f'This is the testing score: {test_accuracy}')
print(f'Best parameters: {count_rf.best_params_}')

This is the training score: 0.995926622632703
This is the testing score: 0.7526726379659058
Best parameters: {'rf__class_weight': 'balanced'}


In [None]:
# Second round for the count-based forest: class weighting is fixed to
# 'balanced' (the previous winner) and the minimum leaf size is searched.
count_forest = Pipeline(steps=[
    ("vectorize", CountVectorizer(stop_words=sw)),
    # Fixed seed so the forest, and therefore the grid-search outcome, is
    # identical on every Restart & Run All.
    ("rf", RandomForestClassifier(class_weight='balanced', random_state=549841)),
])

# NOTE(review): the grid omits scikit-learn's default min_samples_leaf=1, so
# the untuned baseline from the previous search cannot be selected here even
# if it scores better. Consider adding 1 and re-checking the later cells that
# hard-code the result of this search.
parameters = {'rf__min_samples_leaf': [2, 10, 30, 100]}

count_rf2 = GridSearchCV(
    estimator=count_forest,
    param_grid=parameters,
    cv=5,
    error_score='raise',  # surface a failing fit instead of scoring it as NaN
)

count_rf2.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise',
             estimator=Pipeline(steps=[('vectorize',
                                        CountVectorizer(stop_words=['i', 'me',
                                                                    'my',
                                                                    'myself',
                                                                    'we', 'our',
                                                                    'ours',
                                                                    'ourselves',
                                                                    'you',
                                                                    "you're",
                                                                    "you've",
                                                                    "you'll",
                                                                    "you'd",
                                                    

In [None]:
# Accuracy of the refit best estimator on each partition, then the winning grid point.
train_accuracy = count_rf2.score(X_train, y_train)
print(f'This is the training score: {train_accuracy}')
test_accuracy = count_rf2.score(X_test, y_test)
print(f'This is the testing score: {test_accuracy}')
print(f'Best parameters: {count_rf2.best_params_}')

This is the training score: 0.7704199256042884
This is the testing score: 0.7135615035066061
Best parameters: {'rf__min_samples_leaf': 2}


In [None]:
# Third round for the count-based forest: leaf size is pinned to 2 and the
# minimum number of samples required to split a node is searched.
count_forest = Pipeline(steps=[
    ("vectorize", CountVectorizer(stop_words=sw)),
    (
        "rf",
        RandomForestClassifier(
            class_weight='balanced',
            min_samples_leaf=2,
            # Fixed seed so differences between grid points come from the
            # parameter being tuned, not from a different random forest.
            random_state=549841,
        ),
    ),
])

parameters = {'rf__min_samples_split': [2, 10, 20]}

count_rf3 = GridSearchCV(
    estimator=count_forest,
    param_grid=parameters,
    cv=5,
    error_score='raise',  # surface a failing fit instead of scoring it as NaN
)

count_rf3.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise',
             estimator=Pipeline(steps=[('vectorize',
                                        CountVectorizer(stop_words=['i', 'me',
                                                                    'my',
                                                                    'myself',
                                                                    'we', 'our',
                                                                    'ours',
                                                                    'ourselves',
                                                                    'you',
                                                                    "you're",
                                                                    "you've",
                                                                    "you'll",
                                                                    "you'd",
                                                    

In [None]:
# Accuracy of the refit best estimator on each partition, then the winning grid point.
train_accuracy = count_rf3.score(X_train, y_train)
print(f'This is the training score: {train_accuracy}')
test_accuracy = count_rf3.score(X_test, y_test)
print(f'This is the testing score: {test_accuracy}')
print(f'Best parameters: {count_rf3.best_params_}')

This is the training score: 0.7699956154618616
This is the testing score: 0.713141236111476
Best parameters: {'rf__min_samples_split': 2}


In [None]:
# Fourth round for the count-based forest: with class weighting and leaf size
# fixed, search the number of trees.
count_forest = Pipeline(steps=[
    ("vectorize", CountVectorizer(stop_words=sw)),
    (
        "rf",
        RandomForestClassifier(
            class_weight='balanced',
            min_samples_leaf=2,
            # Fixed seed so the comparison between forest sizes is repeatable.
            random_state=549841,
        ),
    ),
])

parameters = {'rf__n_estimators': [50, 100, 150]}

count_rf4 = GridSearchCV(
    estimator=count_forest,
    param_grid=parameters,
    cv=5,
    error_score='raise',  # surface a failing fit instead of scoring it as NaN
)

count_rf4.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise',
             estimator=Pipeline(steps=[('vectorize',
                                        CountVectorizer(stop_words=['i', 'me',
                                                                    'my',
                                                                    'myself',
                                                                    'we', 'our',
                                                                    'ours',
                                                                    'ourselves',
                                                                    'you',
                                                                    "you're",
                                                                    "you've",
                                                                    "you'll",
                                                                    "you'd",
                                                    

In [None]:
# Accuracy of the refit best estimator on each partition, then the winning grid point.
train_accuracy = count_rf4.score(X_train, y_train)
print(f'This is the training score: {train_accuracy}')
test_accuracy = count_rf4.score(X_test, y_test)
print(f'This is the testing score: {test_accuracy}')
print(f'Best parameters: {count_rf4.best_params_}')

This is the training score: 0.7699248971047905
This is the testing score: 0.713141236111476
Best parameters: {'rf__n_estimators': 150}


### TF-IDF Vectorizer

In [None]:
# Same naive Bayes baseline as the count-based one, but on TF-IDF weighted
# features; only the smoothing strength is tuned.
tfidf_mnb = Pipeline(steps=[
    ("vectorize", TfidfVectorizer(stop_words=sw)),
    ("mnb", MultinomialNB()),
])

# alpha must be strictly positive. With alpha=0 a word unseen for a class gets
# zero probability, so scikit-learn either clips the value (emitting a warning)
# or ends up taking log(0), depending on the version. 0.01 stands in for the
# "almost no smoothing" end of the grid; the other points are unchanged.
parameters = {'mnb__alpha': [0.01, 0.25, 0.5, 0.75, 1.0]}

tfidf_gs = GridSearchCV(
    estimator=tfidf_mnb,
    param_grid=parameters,
    cv=5,
    error_score='raise',  # surface a failing fit instead of scoring it as NaN
)

tfidf_gs.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise',
             estimator=Pipeline(steps=[('vectorize',
                                        TfidfVectorizer(stop_words=['i', 'me',
                                                                    'my',
                                                                    'myself',
                                                                    'we', 'our',
                                                                    'ours',
                                                                    'ourselves',
                                                                    'you',
                                                                    "you're",
                                                                    "you've",
                                                                    "you'll",
                                                                    "you'd",
                                                    

In [None]:
# Accuracy of the refit best estimator on each partition, then the winning grid point.
train_accuracy = tfidf_gs.score(X_train, y_train)
print(f'This is the training score: {train_accuracy}')
test_accuracy = tfidf_gs.score(X_test, y_test)
print(f'This is the testing score: {test_accuracy}')
print(f'Best parameters: {tfidf_gs.best_params_}')

This is the training score: 0.8851675317879015
This is the testing score: 0.8019752567571117
Best parameters: {'mnb__alpha': 0.25}


In [None]:
# Random forest on TF-IDF features with balanced class weights; the first
# search tunes the minimum leaf size.
tfidf_forest = Pipeline(steps=[
    ("vectorize", TfidfVectorizer(stop_words=sw)),
    # Fixed seed: without it the bootstrap samples and feature draws differ on
    # every run, so the scores and the selected parameters are not repeatable.
    ("rf", RandomForestClassifier(class_weight='balanced', random_state=549841)),
])

# NOTE(review): the grid omits scikit-learn's default min_samples_leaf=1, and
# the selected value (2) is the smallest one offered - the optimum may lie
# below the grid. Later cells hard-code the result of this search.
parameters = {'rf__min_samples_leaf': [2, 3, 4, 5]}

tfidf_rf = GridSearchCV(
    estimator=tfidf_forest,
    param_grid=parameters,
    cv=5,
    error_score='raise',  # surface a failing fit instead of scoring it as NaN
)

tfidf_rf.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise',
             estimator=Pipeline(steps=[('vectorize',
                                        TfidfVectorizer(stop_words=['i', 'me',
                                                                    'my',
                                                                    'myself',
                                                                    'we', 'our',
                                                                    'ours',
                                                                    'ourselves',
                                                                    'you',
                                                                    "you're",
                                                                    "you've",
                                                                    "you'll",
                                                                    "you'd",
                                                    

In [None]:
# Accuracy of the refit best estimator on each partition, then the winning grid point.
train_accuracy = tfidf_rf.score(X_train, y_train)
print(f'This is the training score: {train_accuracy}')
test_accuracy = tfidf_rf.score(X_test, y_test)
print(f'This is the testing score: {test_accuracy}')
print(f'Best parameters: {tfidf_rf.best_params_}')

This is the training score: 0.7882409515862128
This is the testing score: 0.7120380341992593
Best parameters: {'rf__min_samples_leaf': 2}


In [None]:
# Second round for the TF-IDF forest: leaf size is pinned to 2 and the minimum
# number of samples required to split a node is searched.
tfidf_forest = Pipeline(steps=[
    ("vectorize", TfidfVectorizer(stop_words=sw)),
    (
        "rf",
        RandomForestClassifier(
            class_weight='balanced',
            min_samples_leaf=2,
            # Fixed seed so differences between grid points come from the
            # parameter being tuned, not from a different random forest.
            random_state=549841,
        ),
    ),
])

parameters = {'rf__min_samples_split': [2, 5, 10]}

tfidf_rf2 = GridSearchCV(
    estimator=tfidf_forest,
    param_grid=parameters,
    cv=5,
    error_score='raise',  # surface a failing fit instead of scoring it as NaN
)

tfidf_rf2.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise',
             estimator=Pipeline(steps=[('vectorize',
                                        TfidfVectorizer(stop_words=['i', 'me',
                                                                    'my',
                                                                    'myself',
                                                                    'we', 'our',
                                                                    'ours',
                                                                    'ourselves',
                                                                    'you',
                                                                    "you're",
                                                                    "you've",
                                                                    "you'll",
                                                                    "you'd",
                                                    

In [None]:
# Accuracy of the refit best estimator on each partition, then the winning grid point.
train_accuracy = tfidf_rf2.score(X_train, y_train)
print(f'This is the training score: {train_accuracy}')
test_accuracy = tfidf_rf2.score(X_test, y_test)
print(f'This is the testing score: {test_accuracy}')
print(f'Best parameters: {tfidf_rf2.best_params_}')

This is the training score: 0.7880570838578278
This is the testing score: 0.7129311024139109
Best parameters: {'rf__min_samples_split': 5}


In [None]:
# Third round for the TF-IDF forest: leaf and split sizes are fixed to the
# earlier winners and the number of trees is searched.
tfidf_forest = Pipeline(steps=[
    ("vectorize", TfidfVectorizer(stop_words=sw)),
    (
        "rf",
        RandomForestClassifier(
            class_weight='balanced',
            min_samples_leaf=2,
            min_samples_split=5,
            # Fixed seed so the comparison between forest sizes is repeatable.
            random_state=549841,
        ),
    ),
])

# NOTE(review): the recorded winner (350) is the smallest value offered, so
# the grid may start too high - consider probing smaller forests as well.
parameters = {'rf__n_estimators': [350, 400, 450]}

tfidf_rf3 = GridSearchCV(
    estimator=tfidf_forest,
    param_grid=parameters,
    cv=5,
    error_score='raise',  # surface a failing fit instead of scoring it as NaN
)

tfidf_rf3.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise',
             estimator=Pipeline(steps=[('vectorize',
                                        TfidfVectorizer(stop_words=['i', 'me',
                                                                    'my',
                                                                    'myself',
                                                                    'we', 'our',
                                                                    'ours',
                                                                    'ourselves',
                                                                    'you',
                                                                    "you're",
                                                                    "you've",
                                                                    "you'll",
                                                                    "you'd",
                                                    

In [None]:
# Accuracy of the refit best estimator on each partition, then the winning grid point.
train_accuracy = tfidf_rf3.score(X_train, y_train)
print(f'This is the training score: {train_accuracy}')
test_accuracy = tfidf_rf3.score(X_test, y_test)
print(f'This is the testing score: {test_accuracy}')
print(f'Best parameters: {tfidf_rf3.best_params_}')

This is the training score: 0.7901927782413759
This is the testing score: 0.7135352367944104
Best parameters: {'rf__n_estimators': 350}


In [None]:
# Final round for the TF-IDF forest: all previously tuned settings are fixed
# and a cap on tree depth is searched.
tfidf_forest = Pipeline(steps=[
    ("vectorize", TfidfVectorizer(stop_words=sw)),
    (
        "rf",
        RandomForestClassifier(
            class_weight='balanced',
            min_samples_leaf=2,
            min_samples_split=5,
            n_estimators=350,
            # Fixed seed so the comparison between depth limits is repeatable.
            random_state=549841,
        ),
    ),
])

# NOTE(review): the grid has no max_depth=None (unlimited depth, the setting
# used in every earlier round), and the winner is the largest value offered.
# The recorded scores drop relative to the previous round, which suggests the
# uncapped forest is better - consider adding None to the grid.
parameters = {'rf__max_depth': [400, 450, 500]}

tfidf_rf4 = GridSearchCV(
    estimator=tfidf_forest,
    param_grid=parameters,
    cv=5,
    error_score='raise',  # surface a failing fit instead of scoring it as NaN
)

tfidf_rf4.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise',
             estimator=Pipeline(steps=[('vectorize',
                                        TfidfVectorizer(stop_words=['i', 'me',
                                                                    'my',
                                                                    'myself',
                                                                    'we', 'our',
                                                                    'ours',
                                                                    'ourselves',
                                                                    'you',
                                                                    "you're",
                                                                    "you've",
                                                                    "you'll",
                                                                    "you'd",
                                                    

In [None]:
# Accuracy of the refit best estimator on each partition, then the winning grid point.
train_accuracy = tfidf_rf4.score(X_train, y_train)
print(f'This is the training score: {train_accuracy}')
test_accuracy = tfidf_rf4.score(X_test, y_test)
print(f'This is the testing score: {test_accuracy}')
print(f'Best parameters: {tfidf_rf4.best_params_}')

This is the training score: 0.7208039262831846
This is the testing score: 0.6876887919939061
Best parameters: {'rf__max_depth': 500}
