In [1]:
from pathlib import Path
import pandas as pd
import numpy as np
import nltk
import matplotlib.pyplot as plt
from tqdm import tqdm_notebook as tqdm

#preprocessing
from unidecode import unidecode
import re
from nltk.corpus import stopwords
from bs4 import BeautifulSoup
from nltk.stem import SnowballStemmer
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import OneHotEncoder

#train/test split
from sklearn.model_selection import train_test_split

#vectorization
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

#performance metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

#keras packages
from keras import models
from keras import layers
import tensorflow as tf
from keras import callbacks
from keras.preprocessing import text
from keras.preprocessing import sequence

Using TensorFlow backend.


In [2]:
import warnings

# Silence every warning category for the rest of the session.
warnings.simplefilter('ignore')

In [3]:
# Fetch the NLTK resources used by the preprocessing function: the English
# stop-word list and the WordNet data used by WordNetLemmatizer.
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Mafalda\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Mafalda\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [4]:
# Load the line-delimited JSON dump and discard the 'link' and 'date' columns.
df = (
    pd.read_json('News_Category_Dataset_v2.json', lines=True)
    .drop(columns=['link', 'date'])
)

df.head()

Unnamed: 0,category,headline,authors,short_description
0,CRIME,There Were 2 Mass Shootings In Texas Last Week...,Melissa Jeltsen,She left her husband. He killed their children...
1,ENTERTAINMENT,Will Smith Joins Diplo And Nicky Jam For The 2...,Andy McDonald,Of course it has a song.
2,ENTERTAINMENT,Hugh Grant Marries For The First Time At Age 57,Ron Dicker,The actor and his longtime girlfriend Anna Ebe...
3,ENTERTAINMENT,Jim Carrey Blasts 'Castrato' Adam Schiff And D...,Ron Dicker,The actor gives Dems an ass-kicking for not fi...
4,ENTERTAINMENT,Julianna Margulies Uses Donald Trump Poop Bags...,Ron Dicker,"The ""Dietland"" actress said using the bags is ..."


In [5]:
# Merge headline and description into a single text field. A space is
# inserted between the two parts so that the last word of the headline and
# the first word of the description are not fused into one token.
df['text'] = df['headline'] + ' ' + df['short_description']
df.head()

Unnamed: 0,category,headline,authors,short_description,text
0,CRIME,There Were 2 Mass Shootings In Texas Last Week...,Melissa Jeltsen,She left her husband. He killed their children...,There Were 2 Mass Shootings In Texas Last Week...
1,ENTERTAINMENT,Will Smith Joins Diplo And Nicky Jam For The 2...,Andy McDonald,Of course it has a song.,Will Smith Joins Diplo And Nicky Jam For The 2...
2,ENTERTAINMENT,Hugh Grant Marries For The First Time At Age 57,Ron Dicker,The actor and his longtime girlfriend Anna Ebe...,Hugh Grant Marries For The First Time At Age 5...
3,ENTERTAINMENT,Jim Carrey Blasts 'Castrato' Adam Schiff And D...,Ron Dicker,The actor gives Dems an ass-kicking for not fi...,Jim Carrey Blasts 'Castrato' Adam Schiff And D...
4,ENTERTAINMENT,Julianna Margulies Uses Donald Trump Poop Bags...,Ron Dicker,"The ""Dietland"" actress said using the bags is ...",Julianna Margulies Uses Donald Trump Poop Bags...


In [6]:
def preprocessing(dataframe,punctuation=False,tags=False,stemming=False,lemmatizing=False,stopWords=False,
                  lowercasing=False,accents=False):
    """
    Apply the selected text-cleaning steps to every text in a collection.

    For each text the enabled steps run in this order: tag removal, accent
    removal, punctuation/number removal, whitespace tokenisation, stemming,
    lemmatizing, stop-word removal, re-joining with single spaces and
    lowercasing. Tags and accents are handled before punctuation because
    the punctuation step replaces every character that is not an ASCII
    letter with a space, which would destroy the angle brackets and the
    accented letters those two steps need to see.

    :param dataframe: iterable of strings (e.g. a Pandas Series or a list)
        with the texts to clean; a Series index is ignored
    :param punctuation: bool determining whether to replace everything
        except ASCII letters (punctuation, digits, ...) with spaces
        (default: False)
    :param tags: bool determining whether to remove markup tags or not
        (default: False)
    :param stemming: bool determining whether to perform stemming or not;
        stop words are dropped instead of being stemmed (default: False)
    :param lemmatizing: bool determining whether to perform lemmatizing
        or not; stop words are dropped instead of being lemmatized
        (default: False)
    :param stopWords: bool determining whether to remove English stop
        words or not (default: False)
    :param lowercasing: bool determining whether to lowercase the text
        or not (default: False)
    :param accents: bool determining whether to transliterate accented
        characters to ASCII or not (default: False)

    :return: Returns a list of strings which correspond to each text after
        preprocessing, in input order
    """

    # the stop-word list is only needed by the token-level steps
    if stemming or lemmatizing or stopWords:
        stop_words = set(stopwords.words("english"))
    else:
        stop_words = set()

    # loop-invariant helpers are built once instead of once per text
    stemmer = SnowballStemmer('english') if stemming else None
    lemmatizer = WordNetLemmatizer() if lemmatizing else None

    def is_stop_word(word):
        # the stop-word list is lower-case and lowercasing only happens at
        # the very end, so compare case-insensitively ("The" == "the")
        return word.lower() in stop_words

    processed_corpus = []

    # iterate over the values themselves rather than dataframe[i]: label
    # lookup fails on a Series whose index is not 0..n-1
    for document in tqdm(dataframe):

        # remove tags
        if tags:
            # explicit parser: same result on every machine, no parser warning
            document = BeautifulSoup(document, 'html.parser').get_text()

        # remove accents
        if accents:
            document = unidecode(document)

        # remove punctuation and numbers
        if punctuation:
            document = re.sub('[^a-zA-Z]', ' ', document)

        # convert to list from str
        words = document.split()

        # stemming (stop words are filtered out, not stemmed)
        if stemming:
            words = [stemmer.stem(word) for word in words if not is_stop_word(word)]

        # lemmatization (stop words are filtered out, not lemmatized)
        if lemmatizing:
            words = [lemmatizer.lemmatize(word) for word in words if not is_stop_word(word)]

        # removing stop words
        if stopWords:
            words = [word for word in words if not is_stop_word(word)]

        # convert to str from list
        document = " ".join(words)

        # lowercase the text
        if lowercasing:
            document = document.lower()

        # save the preprocessed text on a list
        processed_corpus.append(document)
    return processed_corpus

In [7]:
# Clean the combined text. Only punctuation/number removal is switched on;
# the other switches (tags, stemming, lemmatizing, stopWords, lowercasing,
# accents) stay at their default of False.
cleaned_text = preprocessing(df['text'], punctuation=True)

# Store the result as a new column; the list is in row order, so it lines
# up with the frame's existing index.
df['clean_text'] = cleaned_text

HBox(children=(FloatProgress(value=0.0, max=200853.0), HTML(value='')))




In [8]:
# Preview the first rows, now including the clean_text column.
df.head()

Unnamed: 0,category,headline,authors,short_description,text,clean_text
0,CRIME,There Were 2 Mass Shootings In Texas Last Week...,Melissa Jeltsen,She left her husband. He killed their children...,There Were 2 Mass Shootings In Texas Last Week...,There Were Mass Shootings In Texas Last Week B...
1,ENTERTAINMENT,Will Smith Joins Diplo And Nicky Jam For The 2...,Andy McDonald,Of course it has a song.,Will Smith Joins Diplo And Nicky Jam For The 2...,Will Smith Joins Diplo And Nicky Jam For The W...
2,ENTERTAINMENT,Hugh Grant Marries For The First Time At Age 57,Ron Dicker,The actor and his longtime girlfriend Anna Ebe...,Hugh Grant Marries For The First Time At Age 5...,Hugh Grant Marries For The First Time At Age T...
3,ENTERTAINMENT,Jim Carrey Blasts 'Castrato' Adam Schiff And D...,Ron Dicker,The actor gives Dems an ass-kicking for not fi...,Jim Carrey Blasts 'Castrato' Adam Schiff And D...,Jim Carrey Blasts Castrato Adam Schiff And Dem...
4,ENTERTAINMENT,Julianna Margulies Uses Donald Trump Poop Bags...,Ron Dicker,"The ""Dietland"" actress said using the bags is ...",Julianna Margulies Uses Donald Trump Poop Bags...,Julianna Margulies Uses Donald Trump Poop Bags...


In [9]:
# Model input is the cleaned text; the label to predict is the news category.
data, target = df['clean_text'], df['category']

In [10]:
# Vocabulary cap handed to the Keras tokenizer (20000 was the earlier setting).
# NOTE(review): the tokenizer is fitted on the full corpus, before the
# train/validation/test split below - confirm this is intended.
vocabulary_size = 15000
tokenizer = text.Tokenizer(num_words = vocabulary_size)
tokenizer.fit_on_texts(data)

In [11]:
# Encode every text as a list of word indices and record the longest one.
sequences = tokenizer.texts_to_sequences(data)
len_sequences = list(map(len, sequences))
max_len = max(len_sequences)

In [12]:
# Length of the longest encoded text (245 in the recorded run); the padding
# step below uses a shorter fixed length of 150.
max_len

245

In [13]:
# Force every sequence to exactly 150 indices. The Keras defaults are spelled
# out: shorter sequences are zero-padded at the front and longer ones lose
# their leading tokens.
data = sequence.pad_sequences(
    sequences,
    maxlen=150,
    padding='pre',
    truncating='pre',
)

In [14]:
# Keep 60% of the samples for training and hold out the remaining 40%,
# stratified by category so each split has the same class proportions.
X_train, X, y_train, y = train_test_split(
    data,
    target,
    test_size=0.4,
    shuffle=True,
    stratify=target,
    random_state=0,
)

In [15]:
# Split the held-out part in half (stratified) into validation and test sets.
X_val, X_test, y_val, y_test = train_test_split(
    X,
    y,
    test_size=0.5,
    shuffle=True,
    stratify=y,
    random_state=0,
)

In [16]:
#X_train = X_train.reshape((X_train.shape[0],X_train.shape[1],1))
#X_val = X_val.reshape((X_val.shape[0],X_val.shape[1],1))

In [17]:
# (number of training samples, padded sequence length)
X_train.shape

(120511, 150)

In [18]:
# Turn the label collections into column vectors of shape (n_samples, 1)
# before one-hot encoding. -1 lets NumPy infer the row count, so the cell
# keeps working when the dataset or the split ratio changes, and np.asarray
# accepts both a Series and an ndarray, so re-running this cell is a no-op.
y_train = np.asarray(y_train).reshape(-1, 1)
y_val = np.asarray(y_val).reshape(-1, 1)

In [19]:
# One-hot encoder for the category labels; it is fitted on the training
# labels in the next cell.
enc = OneHotEncoder()

In [20]:
# Learn the category set from the training labels only, then encode both
# splits with that same mapping.
enc.fit(y_train)
y_train, y_val = enc.transform(y_train), enc.transform(y_val)

In [21]:
def gru(nodes=64,optimizer='rmsprop',loss='categorical_crossentropy',num_classes=41,input_length=150):
    """
    Build and compile an Embedding -> GRU -> softmax text classifier.

    NumPy and TensorFlow are re-seeded with fixed values on every call,
    before any layer is created.

    :param nodes: size of the embedding vectors and number of GRU units
        (default: 64)
    :param optimizer: optimizer passed to ``model.compile``
        (default: 'rmsprop')
    :param loss: loss passed to ``model.compile``
        (default: 'categorical_crossentropy')
    :param num_classes: number of output classes, i.e. units of the final
        softmax layer (default: 41)
    :param input_length: length of the padded input sequences
        (default: 150)

    :return: Returns the compiled Keras Sequential model
    """
    np.random.seed(1)
    tf.random.set_seed(2)

    model = models.Sequential()

    # the embedding's vocabulary size comes from the notebook-level
    # vocabulary_size used to build the tokenizer
    model.add(layers.Embedding(vocabulary_size, nodes, input_length=input_length))

    # The GRU takes its input shape from the embedding output, so no
    # input_shape is passed. The previous value was derived from the global
    # X_train and only tied this function to notebook state. dropout,
    # recurrent_dropout and return_sequences stay at their Keras defaults.
    model.add(layers.GRU(nodes))

    model.add(layers.Dense(num_classes, activation='softmax'))
    model.compile(loss=loss, optimizer=optimizer, metrics=['accuracy'])
    return model

In [22]:
# Build the network with the default hyper-parameters.
model = gru()

In [23]:
# Stop training after one epoch without an improvement in validation accuracy.
early_stopping = callbacks.EarlyStopping(monitor='val_accuracy', patience=1)

# Write the model to gru.h5 whenever validation accuracy reaches a new best.
best_checkpoint = callbacks.ModelCheckpoint(
    filepath='gru.h5',
    monitor='val_accuracy',
    save_best_only=True,
)

callbacks_list = [early_stopping, best_checkpoint]

In [24]:
# Train for at most 100 epochs, evaluating on the validation split after
# each epoch; the callbacks in callbacks_list can end the run earlier.
history = model.fit(
    X_train,
    y_train,
    validation_data=(X_val, y_val),
    epochs=100,
    batch_size=256,
    callbacks=callbacks_list,
)

Train on 120511 samples, validate on 40171 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
