In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score ,confusion_matrix
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Mrunal\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
news = pd.read_csv('uci-news-aggregator.csv')

In [3]:
news.head()

Unnamed: 0,ID,TITLE,URL,PUBLISHER,CATEGORY,STORY,HOSTNAME,TIMESTAMP
0,1,"Fed official says weak data caused by weather,...",http://www.latimes.com/business/money/la-fi-mo...,Los Angeles Times,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.latimes.com,1394470370698
1,2,Fed's Charles Plosser sees high bar for change...,http://www.livemint.com/Politics/H2EvwJSK2VE6O...,Livemint,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.livemint.com,1394470371207
2,3,US open: Stocks fall after Fed official hints ...,http://www.ifamagazine.com/news/us-open-stocks...,IFA Magazine,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.ifamagazine.com,1394470371550
3,4,"Fed risks falling 'behind the curve', Charles ...",http://www.ifamagazine.com/news/fed-risks-fall...,IFA Magazine,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.ifamagazine.com,1394470371793
4,5,Fed's Plosser: Nasty Weather Has Curbed Job Gr...,http://www.moneynews.com/Economy/federal-reser...,Moneynews,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.moneynews.com,1394470372027


In [4]:
news = news[['TITLE','CATEGORY']]

In [5]:
dist = news['CATEGORY'].value_counts()

In [6]:
dist

e    152469
b    115967
t    108344
m     45639
Name: CATEGORY, dtype: int64

In [7]:
stop = set(stopwords.words('english'))

In [8]:
stop

{'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'only',
 'or',
 'other',
 'our',
 'ours',
 'ourselves',
 'out',
 'over',
 'own',
 'r

In [9]:
#To retain only alphabets - sub is like replace
# replace - If you use replace() or translate(), they will be replaced if they completely match the old string.
#re.sub - If you want to replace a string that matches a regular expression instead of perfect match, use the sub() of the re module.

news['TITLE'] = news['TITLE'].apply(lambda x: re.sub("[^a-zA-Z]", " ",x))
news['TITLE']

0         Fed official says weak data caused by weather ...
1         Fed s Charles Plosser sees high bar for change...
2         US open  Stocks fall after Fed official hints ...
3         Fed risks falling  behind the curve   Charles ...
4         Fed s Plosser  Nasty Weather Has Curbed Job Gr...
                                ...                        
422414    Surgeons to remove   year old s rib to rebuild...
422415    Boy to have surgery on esophagus after battery...
422416    Child who swallowed battery to have reconstruc...
422417    Phoenix boy undergoes surgery to repair throat...
422418    Phoenix boy undergoes surgery to repair throat...
Name: TITLE, Length: 422419, dtype: object

In [10]:
# convert to lowercase and tokenize
news['TITLE'] = news['TITLE'].apply(lambda x: x.lower().split())
news['TITLE']

0         [fed, official, says, weak, data, caused, by, ...
1         [fed, s, charles, plosser, sees, high, bar, fo...
2         [us, open, stocks, fall, after, fed, official,...
3         [fed, risks, falling, behind, the, curve, char...
4         [fed, s, plosser, nasty, weather, has, curbed,...
                                ...                        
422414    [surgeons, to, remove, year, old, s, rib, to, ...
422415    [boy, to, have, surgery, on, esophagus, after,...
422416    [child, who, swallowed, battery, to, have, rec...
422417    [phoenix, boy, undergoes, surgery, to, repair,...
422418    [phoenix, boy, undergoes, surgery, to, repair,...
Name: TITLE, Length: 422419, dtype: object

In [11]:
# remove stopwords - For loop for removing stop words in above mentioned set of stop words
news['TITLE'] = news['TITLE'].apply(lambda x:[i for i in x if i not in stop])
news['TITLE']

0         [fed, official, says, weak, data, caused, weat...
1         [fed, charles, plosser, sees, high, bar, chang...
2         [us, open, stocks, fall, fed, official, hints,...
3         [fed, risks, falling, behind, curve, charles, ...
4         [fed, plosser, nasty, weather, curbed, job, gr...
                                ...                        
422414    [surgeons, remove, year, old, rib, rebuild, da...
422415    [boy, surgery, esophagus, battery, burns, hole...
422416    [child, swallowed, battery, reconstructive, su...
422417    [phoenix, boy, undergoes, surgery, repair, thr...
422418    [phoenix, boy, undergoes, surgery, repair, thr...
Name: TITLE, Length: 422419, dtype: object

In [12]:
news['TITLE'] = news['TITLE'].apply(lambda x: ' '.join(x))
news['TITLE']

0         fed official says weak data caused weather slo...
1         fed charles plosser sees high bar change pace ...
2         us open stocks fall fed official hints acceler...
3         fed risks falling behind curve charles plosser...
4               fed plosser nasty weather curbed job growth
                                ...                        
422414    surgeons remove year old rib rebuild damaged t...
422415      boy surgery esophagus battery burns hole throat
422416    child swallowed battery reconstructive surgery...
422417    phoenix boy undergoes surgery repair throat da...
422418    phoenix boy undergoes surgery repair throat da...
Name: TITLE, Length: 422419, dtype: object

In [13]:
X_train, X_test, Y_train, Y_test = train_test_split(news['TITLE'],news['CATEGORY'], test_size=0.2, random_state=3)

In [14]:
#Initializing vectorizers
# ngram_range is to combine words - unigram(1,1), bigram(1,2), trigram(1,3)

count_vectorizer = CountVectorizer()
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1,3))

In [15]:
X_train_count = count_vectorizer.fit_transform(X_train)
X_test_count = count_vectorizer.transform(X_test)

In [16]:
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

In [17]:
#Multinomial Naive Bayes model fitting

nb_1 = MultinomialNB()
nb_2 = MultinomialNB()

nb_1.fit(X_train_count,Y_train)
nb_2.fit(X_train_tfidf,Y_train)

acc_count_nb = accuracy_score(Y_test,nb_1.predict(X_test_count))
print(acc_count_nb)

acc_tfidf_nb = accuracy_score(Y_test,nb_2.predict(X_test_tfidf))
print(acc_tfidf_nb)

0.9268618910089484
0.9323895648880262


In [18]:
#Logistic Regression Model

logreg_1 = OneVsRestClassifier(LogisticRegression(random_state=10))
logreg_2 = OneVsRestClassifier(LogisticRegression(random_state=10))

logreg_1.fit(X_train_count,Y_train)
logreg_2.fit(X_train_tfidf,Y_train)

acc_count_logreg = accuracy_score(Y_test, logreg_1.predict(X_test_count))
print(acc_count_logreg)

acc_tfidf_logreg = accuracy_score(Y_test, logreg_2.predict(X_test_tfidf))
print(acc_tfidf_logreg)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

0.9464395625207139
0.9432436911131101
