<a href="https://colab.research.google.com/github/nidhim03/DAT-10-19/blob/main/news_binary_neural_nets.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [55]:
# the usual
import pandas as pd
import numpy as np
import plotly.express as px

# basic ml steps
from sklearn.model_selection import train_test_split

# nlp stuff

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# neural nets
from tensorflow.keras import layers
from tensorflow.keras.layers import Embedding, Flatten, Dense
from tensorflow.keras.models import Sequential




In [5]:
# Load the news-aggregator CSV from the Colab working directory into raw_df.
raw_df = pd.read_csv(
    filepath_or_buffer="/content/uci-news-aggregator.csv",
)

In [6]:
# Peek at the first five rows to see which columns the file provides.
raw_df.head(n=5)

Unnamed: 0,ID,TITLE,URL,PUBLISHER,CATEGORY,STORY,HOSTNAME,TIMESTAMP
0,1,"Fed official says weak data caused by weather,...",http://www.latimes.com/business/money/la-fi-mo...,Los Angeles Times,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.latimes.com,1394470000000.0
1,2,Fed's Charles Plosser sees high bar for change...,http://www.livemint.com/Politics/H2EvwJSK2VE6O...,Livemint,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.livemint.com,1394470000000.0
2,3,US open: Stocks fall after Fed official hints ...,http://www.ifamagazine.com/news/us-open-stocks...,IFA Magazine,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.ifamagazine.com,1394470000000.0
3,4,"Fed risks falling 'behind the curve', Charles ...",http://www.ifamagazine.com/news/fed-risks-fall...,IFA Magazine,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.ifamagazine.com,1394470000000.0
4,5,Fed's Plosser: Nasty Weather Has Curbed Job Gr...,http://www.moneynews.com/Economy/federal-reser...,Moneynews,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.moneynews.com,1394470000000.0


In [7]:
# (number of rows, number of columns) of the frame as loaded.
raw_df.shape

(172584, 8)

In [9]:
# How many headlines fall under each CATEGORY code (class balance check).
raw_df.loc[:, 'CATEGORY'].value_counts()

e    63741
b    46720
t    44496
m    17627
Name: CATEGORY, dtype: int64

# Preparing a binary dataset: CATEGORY 1 (b, Business) or 0 (not business)

In [195]:
# Create the binary classification dataset.
#
# bin_df is derived from raw_df WITHOUT modifying raw_df: `.assign` returns a
# new frame, so re-running this cell (or any earlier cell that displays
# raw_df) always gives the same result.
#
#   category  : int8 target, 1 when CATEGORY == 'b' (business), 0 otherwise.
#               A plain boolean comparison replaces np.select, which was being
#               handed bare strings instead of a list of choices.
#   PUBLISHER : forced to str for the tokenizer. Missing publishers become the
#               empty string rather than the literal word "nan", which would
#               otherwise be tokenized as if it were a real word.
#
# Dropped columns:
#   CATEGORY  - replaced by the binary `category` target
#   HOSTNAME  - basically the publisher
#   TIMESTAMP, STORY, ID - not used as model features
bin_df = (
    raw_df
    .assign(
        category=raw_df['CATEGORY'].eq('b').astype('int8'),
        PUBLISHER=raw_df['PUBLISHER'].fillna('').astype('str'),
    )
    .drop(columns=['CATEGORY', 'HOSTNAME', 'TIMESTAMP', 'STORY', 'ID'])
)

In [44]:
# basic EDA and cleanup

In [196]:
# Confirm the column dtypes after building the binary dataset.
bin_df.dtypes

TITLE        object
URL          object
PUBLISHER    object
category       int8
dtype: object

In [46]:
# Sanity-check the first five rows of the binary dataset.
bin_df.head(n=5)

Unnamed: 0,ID,TITLE,URL,PUBLISHER,STORY,category
0,1,"Fed official says weak data caused by weather,...",http://www.latimes.com/business/money/la-fi-mo...,Los Angeles Times,ddUyU0VZz0BRneMioxUPQVP6sIxvM,1
1,2,Fed's Charles Plosser sees high bar for change...,http://www.livemint.com/Politics/H2EvwJSK2VE6O...,Livemint,ddUyU0VZz0BRneMioxUPQVP6sIxvM,1
2,3,US open: Stocks fall after Fed official hints ...,http://www.ifamagazine.com/news/us-open-stocks...,IFA Magazine,ddUyU0VZz0BRneMioxUPQVP6sIxvM,1
3,4,"Fed risks falling 'behind the curve', Charles ...",http://www.ifamagazine.com/news/fed-risks-fall...,IFA Magazine,ddUyU0VZz0BRneMioxUPQVP6sIxvM,1
4,5,Fed's Plosser: Nasty Weather Has Curbed Job Gr...,http://www.moneynews.com/Economy/federal-reser...,Moneynews,ddUyU0VZz0BRneMioxUPQVP6sIxvM,1


In [47]:
# Count the distinct publishers -- there are quite a lot in this dataset.
bin_df['PUBLISHER'].nunique(dropna=True)

8517

In [48]:
# Headlines per publisher, most frequent first.
# value_counts() already returns counts in descending order, so no extra
# sort_values() call is needed.
bin_df['PUBLISHER'].value_counts()

Reuters                                                  1491
Huffington Post                                          1149
Contactmusic.com                                         1029
Examiner.com                                              998
Los Angeles Times                                         884
                                                         ... 
Waynesville Daily Guide \(blog\)                            1
Tri-State Neighbor                                          1
International Policy Digest                                 1
The Tunis Times                                             1
JAMA_ The Journal of the American Medical Association       1
Name: PUBLISHER, Length: 8517, dtype: int64

In [49]:
# What is a STORY? Start by counting the distinct STORY ids.
# STORY is dropped when bin_df is built, so read it from raw_df; indexing
# bin_df['STORY'] raises KeyError on a fresh Restart & Run All.
raw_df['STORY'].nunique()

3070

In [50]:
# A story seems to refer to a specific news event - in this case, a weather
# event seems to have affected jobs.
# STORY is not kept in bin_df, so the lookup goes through raw_df.
raw_df.loc[raw_df['STORY'] == 'ddUyU0VZz0BRneMioxUPQVP6sIxvM', 'TITLE']

0    Fed official says weak data caused by weather,...
1    Fed's Charles Plosser sees high bar for change...
2    US open: Stocks fall after Fed official hints ...
3    Fed risks falling 'behind the curve', Charles ...
4    Fed's Plosser: Nasty Weather Has Curbed Job Gr...
5    Plosser: Fed May Have to Accelerate Tapering Pace
6            Fed's Plosser: Taper pace may be too slow
7    Fed's Plosser expects US unemployment to fall ...
8    US jobs growth last month hit by weather:Fed P...
Name: TITLE, dtype: object

In [51]:
# This story has something to do with the euro. There are a few sub-themes:
# banking, strong euro, bad loans.
# STORY is not kept in bin_df, so the lookup goes through raw_df.
raw_df.loc[raw_df['STORY'] == 'dPhGU51DcrolUIMxbRm0InaHGA2XM', 'TITLE']

9     ECB unlikely to end sterilisation of SMP purch...
10    ECB unlikely to end sterilization of SMP purch...
11                EU's half-baked bank union could work
12         Europe reaches crunch point on banking union
13    ECB FOCUS-Stronger euro drowns out ECB's messa...
14           EU aims for deal on tackling failing banks
15    Forex - Pound drops to one-month lows against ...
16    Noyer Says Strong Euro Creates Unwarranted Eco...
17    EU Week Ahead March 10-14: Bank Resolution, Tr...
18    ECB member Noyer is 'very open to all kinds of...
19    Euro Anxieties Wane as Bunds Top Treasuries, S...
20    Noyer Says Strong Euro Creates Unwarranted Eco...
21    Noyer Says Stronger Euro Creates Unwarranted P...
22    Bad loan triggers key feature in ECB bank test...
23    China's trade deficit and structural worries i...
24              10 Things You Need To Know This Morning
25    ECB's Noyer not Happy With Euro Strength -- Up...
26     Eurozone banks' sovereign exposure hits n

# Pre-processing

In [236]:
# Hold out 20% of the rows for testing, stratified on the binary target so
# both splits keep the same business / non-business ratio.
# A fixed random_state makes the split (and everything fitted on it, e.g. the
# tokenizer vocabulary) reproducible across kernel restarts.
SEED = 42

features = bin_df.drop(columns='category')
target = bin_df['category']

X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.2,
    stratify=target,
    random_state=SEED,
)

# Later cells overwrite columns of X_train / X_test; take explicit copies so
# those assignments act on independent frames rather than on slices of bin_df
# (this also avoids pandas' SettingWithCopyWarning).
X_train = X_train.copy()
X_test = X_test.copy()

In [237]:

# Idea: could cut away the publishers with very few counts, but need to be careful since the publisher name could contain keywords, e.g. 'medical journal'.

In [238]:
# NLP
# for title, url and publisher

In [239]:
# STEP 1: convert a text feature into keywords.
#
# Keras' Tokenizer is the standard way to prep text data. `num_words` is the
# vocabulary-size cap handed to the tokenizer; it is kept as a separate
# variable so it can be reused wherever the vocabulary size is needed.
num_words = 100_000
tokenizer = Tokenizer(
    num_words=num_words,
)



In [240]:
# Build the vocabulary from the training TITLEs only (never from the test
# split). Assumption: TITLE has enough keywords for the tokenizer, and URL /
# PUBLISHER are unlikely to contribute very different ones -- though
# PUBLISHER might have some distinct keywords.
title_corpus = X_train['TITLE']
tokenizer.fit_on_texts(title_corpus)

In [241]:
# Replace each text column with its sequence of token ids, using the
# vocabulary fitted on the training titles.
# NOTE: the columns are overwritten in place, so run this cell once per fresh
# split -- a second run would feed token-id lists back into the tokenizer
# instead of raw text.
for text_column in ('TITLE', 'URL', 'PUBLISHER'):
    X_train[text_column] = tokenizer.texts_to_sequences(X_train[text_column])
    X_test[text_column] = tokenizer.texts_to_sequences(X_test[text_column])




In [242]:
# Pad (or truncate) every token sequence to a fixed length per column,
# assuming at most 80 tokens for a TITLE, 20 for a URL and 10 for a PUBLISHER.
# The padded arrays are converted back to lists so each cell of the frame
# holds one fixed-length sequence.
padded_length = {'TITLE': 80, 'URL': 20, 'PUBLISHER': 10}

for text_column, length in padded_length.items():
    X_train[text_column] = pad_sequences(X_train[text_column], maxlen=length).tolist()
    X_test[text_column] = pad_sequences(X_test[text_column], maxlen=length).tolist()

In [243]:
# Inspect the tokenized and padded training features.
X_train

Unnamed: 0,TITLE,URL,PUBLISHER
37198,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 729, 428, 19918, 221, 130, 131, 3370, 6...","[0, 0, 0, 0, 0, 0, 19918, 18903, 1559, 75]"
102574,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 729, 428, 221, 3917, 1131, 527, 1...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 1241]"
40205,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 729, 428, 221, 185...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 221]"
92500,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 729, 428, 221, 20, 1885, 7191, 17...","[0, 0, 0, 0, 0, 0, 0, 0, 21604, 609]"
59509,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 729, 428, 75, 8491...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
...,...,...,...
172299,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 729, 428, 221, 769, 1131, 7, 4...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 13909]"
77585,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 729, 428, 10066, 221, 3604, 664, 147...","[0, 0, 0, 0, 0, 0, 0, 0, 10066, 221]"
100433,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 729, 3, 10, 4650, 149, 1, 2...","[0, 0, 0, 0, 0, 0, 0, 139, 1348, 4720]"
64179,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 729, 221, 10296, 20, 2737, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 911]"


# Neural Nets Model