# Deep Learning Project

Catarina Palha (M20190156)

Mafalda Zúquete (M20190257)

Maren Leuthner (M20190134)

## Load necessary packages

In [None]:
import re

import numpy as np
import pandas as pd
import nltk
import matplotlib.pyplot as plt
from tqdm import tqdm_notebook as tqdm

#preprocessing
from unidecode import unidecode
from nltk.corpus import stopwords
from bs4 import BeautifulSoup
from nltk.stem import SnowballStemmer
from nltk.stem import WordNetLemmatizer

#train/test split
from sklearn.model_selection import train_test_split

#vectorization
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder

#keras packages
from keras import models
from keras import layers
import tensorflow as tf
from keras import callbacks
from keras.preprocessing import text
from keras.preprocessing import sequence

In [None]:
import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings from the libraries used below.
warnings.filterwarnings('ignore')

In [None]:
# NLTK resources needed later: the stop-word lists and the WordNet corpus
# used by the WordNetLemmatizer
nltk.download('stopwords')
nltk.download('wordnet')

## Read the data

In [None]:
# Load the line-delimited JSON file and discard the two columns
# that are not referenced anywhere else in this notebook
df = pd.read_json('News_Category_Dataset_v2.json', lines=True).drop(columns=['link', 'date'])

df.head()

In [None]:
# Join headline and description with a space; without a separator the last word
# of the headline would fuse with the first word of the description
df['text'] = df['headline'] + ' ' + df['short_description']
df.head()

## Data exploration

In [None]:
# Count the missing values of every column
print(df.isna().sum())

In [None]:
# Number of single-space-separated tokens in each text
word_count = df['text'].astype(str).str.split(" ").str.len()
df['word_count'] = word_count

df.head()

It can be seen that the size of the documents is homogeneous.

In [None]:
# Summary statistics of the per-document word counts
df['word_count'].describe()

In [None]:
# Number of documents per category (most frequent first); a sum over the grouped
# columns would add up word counts instead of counting documents
df['category'].value_counts()

It can be seen that some categories are undersampled.

In [None]:
# Flatten the whole corpus into one list of whitespace-separated words
# (the column is named 'text'; 'test' does not exist and raised a KeyError)
all_words = ' '.join(df['text']).split()

In [None]:
# get the frequency of every word collected in all_words;
# value_counts returns them sorted from most to least frequent
freq = pd.Series(all_words).value_counts()

In [None]:
# the 20 most frequent words
freq.head(20)

In [None]:
stop_words = set(stopwords.words("english"))

# how many of the 20 most frequent words are also stop words
count = sum(1 for word in freq.index[:20] if word in stop_words)
count

The most common words are also stop words.

In [None]:
def get_top_n_grams(corpus, top_k, n):
    """
    Extract the top k most frequent n-grams of a corpus.

    Only the 2000 most frequent n-grams are kept by the vectorizer, so
    top_k values above 2000 cannot return more rows than that.

    :param corpus: list of texts
    :param top_k: int with the number of n-grams to return
    :param n: size of the n-grams to be considered
             (n=1 extracts unigrams, n=2 extracts bigrams, ...)

    :return: Pandas DataFrame sorted by decreasing frequency, with the
        n-grams in column "Ngram" and their total counts in column "Freq"
    """
    # vocabulary restricted to n-grams of exactly size n
    vectorizer = CountVectorizer(ngram_range=(n, n), max_features=2000)
    counts_matrix = vectorizer.fit(corpus).transform(corpus)

    # column-wise totals: occurrences of each n-gram over the whole corpus
    totals = counts_matrix.sum(axis=0)

    pairs = [(ngram, totals[0, column]) for ngram, column in vectorizer.vocabulary_.items()]
    pairs.sort(key=lambda pair: pair[1], reverse=True)

    # keep the k most frequent n-grams in a Pandas DataFrame
    top_df = pd.DataFrame(pairs[:top_k])
    top_df.columns = ["Ngram", "Freq"]
    return top_df

In [None]:
def plot_frequencies(top_df):
    """
    Draw a bar plot of n-gram frequencies.

    At most the first 30 rows of the DataFrame are plotted.

    :param top_df: a sorted Pandas DataFrame, as returned by
        "get_top_n_grams", with the n-grams in column "Ngram" and
        the respective counts in column "Freq"
    """
    x_labels = top_df["Ngram"][:30]
    values = top_df["Freq"][:30]

    # plain integer bar positions; no numpy needed for this
    positions = list(range(len(x_labels)))

    plt.bar(positions, values, align='center', alpha=0.5)
    # vertical labels so that long n-grams do not overlap
    plt.xticks(positions, x_labels, rotation=90)
    plt.ylabel('Frequencies')
    plt.title('Words')
    plt.show()

In [None]:
# plot the distribution of the top 20 n-grams with n up to 5
# (the column is named 'text'; 'test' does not exist and raised a KeyError)
for i in range(1,6):
    top = get_top_n_grams(df["text"], top_k=20, n=i)
    plot_frequencies(top)

## Preprocessing

In [None]:
def preprocessing(dataframe,punctuation=False,tags=False,stemming=False,lemmatizing=False,stopWords=False,
                  lowercasing=False,accents=False):
    """
    Apply the chosen preprocessing techniques to every text of a corpus.

    The steps run in this order: tag removal, punctuation removal, stemming,
    lemmatizing, stop-word removal, lowercasing, accent removal. Stop words
    are matched case-insensitively and are only removed when ``stopWords``
    is True; stemming and lemmatizing leave them untouched.

    :param dataframe: a Pandas Series (or any sized iterable) of strings,
        one per document
    :param punctuation: bool determining whether to replace every
        non-letter character (punctuation, digits, ...) by a space or not
        (default: False)
    :param tags: bool determining whether to remove markup tags or not
        (default: False)
    :param stemming: bool determining whether to perform stemming or not
        (default: False)
    :param lemmatizing: bool determining whether to perform lemmatizing
        or not (default: False)
    :param stopWords: bool determining whether to remove English stop
        words or not (default: False)
    :param lowercasing: bool determining whether to lowercase the text
        or not (default: False)
    :param accents: bool determining whether to transliterate accented
        characters with unidecode or not (default: False)

    :return: Returns a list of strings which correspond to each text after
        preprocessing, in the same order as the input
    """
    stop_words = set(stopwords.words("english"))

    # build the NLTK helpers once instead of once per document
    stemmer = SnowballStemmer('english') if stemming else None
    lemmatizer = WordNetLemmatizer() if lemmatizing else None

    processed_corpus = []

    # iterate over the values themselves so that the function does not depend
    # on the Series having a default 0..n-1 index
    for document in tqdm(dataframe, total=len(dataframe)):

        # remove tags first: stripping punctuation beforehand would delete the
        # angle brackets and leave the tag names behind as ordinary words
        if tags:
            document = BeautifulSoup(document, "html.parser").get_text()

        # remove punctuation (and every other non-letter character)
        if punctuation:
            document = re.sub('[^a-zA-Z]', ' ', document)

        # convert to list from str
        words = document.split()

        # stemming; stop words are kept unstemmed so that they can still be
        # detected by the stop-word removal step below
        if stemming:
            words = [word if word.lower() in stop_words else stemmer.stem(word) for word in words]

        # lemmatization, again leaving stop words untouched
        if lemmatizing:
            words = [word if word.lower() in stop_words else lemmatizer.lemmatize(word) for word in words]

        # removing stop words; the comparison is case-insensitive because
        # lowercasing only happens afterwards
        if stopWords:
            words = [word for word in words if word.lower() not in stop_words]

        # convert to str from list
        document = " ".join(words)

        # lowercase the text
        if lowercasing:
            document = document.lower()

        # remove accents
        if accents:
            document = unidecode(document)

        # save the preprocessed text on a list
        processed_corpus.append(document)
    return processed_corpus

In [None]:
# Run the preprocessing with every optional step left at its default (off);
# punctuation, tags, stemming and lemmatizing can be switched on via keyword arguments.
cleaned_text = preprocessing(df['text'])

# store the result next to the raw text, aligned on the DataFrame index
df['clean_text'] = pd.Series(cleaned_text, index=df.index)

In [None]:
# Inspect the first rows, now including the clean_text column
df.head()

In [None]:
# model input (preprocessed text) and label (news category)
data, target = df['clean_text'], df['category']

In [None]:
# 60% of the documents go to training; the remaining 40% (X, y) are split again below.
# Stratifying keeps the category proportions equal in both parts.
X_train, X, y_train, y = train_test_split(
    data,
    target,
    test_size=0.4,
    shuffle=True,
    stratify=target,
    random_state=0,
)

In [None]:
# split the held-out part in half: one half for validation, one for testing
X_val, X_test, y_val, y_test = train_test_split(
    X,
    y,
    test_size=0.5,
    shuffle=True,
    stratify=y,
    random_state=0,
)

In [None]:
# Turn each label vector into a single-column 2-D array, the layout expected by
# OneHotEncoder. Using -1 lets numpy infer the number of rows, so the cell keeps
# working when the dataset size or the split proportions change.
y_train = y_train.values.reshape(-1, 1)
y_val = y_val.values.reshape(-1, 1)
y_test = y_test.values.reshape(-1, 1)

In [None]:
# Create the encoder here, where it is first needed, so the notebook runs top to bottom.
# It is fitted on the training labels only and then reused for validation and test.
enc = OneHotEncoder()
y_train = enc.fit_transform(y_train)
y_val = enc.transform(y_val)
y_test = enc.transform(y_test)

## Tests

In [None]:
# Stop a test run when the validation accuracy has not improved
# for more than one epoch (patience = 1)
callbacks_list = [callbacks.EarlyStopping(monitor='val_accuracy', patience=1)]

### DNN

In [None]:
# TF-IDF over unigrams and bigrams, without lowercasing,
# with the vocabulary capped at max_features entries
vectorizer = TfidfVectorizer(lowercase=False, max_features=157533, ngram_range=(1, 2))

enc = OneHotEncoder()

In [None]:
# learn the vocabulary and IDF weights on the training texts only ...
X_train_dnn = vectorizer.fit_transform(X_train)
# ... and apply the fitted vectorizer to the validation and test texts
X_val_dnn, X_test_dnn = (vectorizer.transform(texts) for texts in (X_val, X_test))

In [None]:
def dnn(nodes=64,activation='relu',optimizer='rmsprop',loss='categorical_crossentropy'):
    """
    Build and compile a feed-forward network with one hidden layer.

    The input size is read from the module-level TF-IDF matrix X_train_dnn
    and the output layer has 41 softmax units.

    :param nodes: number of units of the hidden layer (default: 64)
    :param activation: activation function of the hidden layer
        (default: 'relu')
    :param optimizer: optimizer passed to compile (default: 'rmsprop')
    :param loss: loss passed to compile
        (default: 'categorical_crossentropy')

    :return: Returns the compiled Keras model
    """
    # fixed seeds so that repeated builds start from the same weights
    np.random.seed(1)
    tf.random.set_seed(2)
    model = models.Sequential()
    # one input per TF-IDF feature; X_train itself is the 1-D Series of raw
    # texts and has no second dimension, so the vectorized matrix is used
    model.add(layers.Dense(nodes,activation=activation,input_shape=(X_train_dnn.shape[1],)))
    model.add(layers.Dropout(0.2))
    model.add(layers.Dense(41,activation='softmax'))
    model.compile(optimizer=optimizer,loss=loss,metrics=['accuracy'])
    return model

In [None]:
# feed-forward model with 250 hidden units, all other settings at their defaults
model_dnn = dnn(nodes=250)

In [None]:
# test run of the DNN on the TF-IDF features, stopped early by callbacks_list
history_dnn = model_dnn.fit(
    X_train_dnn,
    y_train,
    epochs=100,
    batch_size=512,
    callbacks=callbacks_list,
    validation_data=(X_val_dnn, y_val),
)

### RNN

In [None]:
# size of the vocabulary handed to the tokenizer and to the embedding layers
vocabulary_size = 15000

# the word index is built from the training texts only
tokenizer = text.Tokenizer(num_words=vocabulary_size)
tokenizer.fit_on_texts(X_train)

In [None]:
# replace every text by its list of word indices
sequences_train, sequences_val, sequences_test = (
    tokenizer.texts_to_sequences(texts) for texts in (X_train, X_val, X_test)
)

In [None]:
# length of the longest training sequence; used as the common padded length
len_sequences = list(map(len, sequences_train))
max_len = max(len_sequences)
max_len

In [None]:
# bring every sequence to the same length max_len
X_train_rnn, X_val_rnn, X_test_rnn = (
    sequence.pad_sequences(seqs, maxlen=max_len)
    for seqs in (sequences_train, sequences_val, sequences_test)
)

#### GRU

In [None]:
def gru(nodes=64,optimizer='rmsprop',loss='categorical_crossentropy'):
    """
    Build and compile an Embedding -> GRU -> softmax classifier.

    The vocabulary size and the sequence length are read from the
    module-level variables vocabulary_size and max_len; the output layer
    has 41 softmax units.

    :param nodes: embedding dimension and number of GRU units (default: 64)
    :param optimizer: optimizer passed to compile (default: 'rmsprop')
    :param loss: loss passed to compile
        (default: 'categorical_crossentropy')

    :return: Returns the compiled Keras model
    """
    # fixed seeds so that repeated builds start from the same weights
    np.random.seed(1)
    tf.random.set_seed(2)
    model = models.Sequential()
    # the sequences are padded to max_len, so the embedding must expect that
    # length (same as in lstm) rather than a hard-coded 150
    model.add(layers.Embedding(vocabulary_size, nodes, input_length=max_len))
    # the GRU takes its input shape from the embedding output; no explicit
    # input_shape is needed on a non-first layer
    model.add(layers.GRU(nodes))
    model.add(layers.Dense(41, activation='softmax'))
    model.compile(loss=loss, optimizer=optimizer, metrics=['accuracy'])
    return model

In [None]:
# GRU model with the default settings of gru()
model_gru = gru()

In [None]:
# test run of the GRU model on the padded sequences, stopped early by callbacks_list
history_gru = model_gru.fit(
    X_train_rnn,
    y_train,
    epochs=100,
    batch_size=256,
    callbacks=callbacks_list,
    validation_data=(X_val_rnn, y_val),
)

#### LSTM

In [None]:
def lstm(nodes=64,optimizer='rmsprop',loss='categorical_crossentropy'):
    """
    Build and compile an Embedding -> LSTM -> softmax classifier.

    The vocabulary size and the sequence length are read from the
    module-level variables vocabulary_size and max_len; the output layer
    has 41 softmax units.

    :param nodes: embedding dimension and number of LSTM units (default: 64)
    :param optimizer: optimizer passed to compile (default: 'rmsprop')
    :param loss: loss passed to compile
        (default: 'categorical_crossentropy')

    :return: Returns the compiled Keras model
    """
    # fixed seeds, set before any layer is created
    np.random.seed(1)
    tf.random.set_seed(2)
    model = models.Sequential([
        layers.Embedding(vocabulary_size, nodes, input_length=max_len),
        layers.LSTM(nodes),
        layers.Dense(41, activation='softmax'),
    ])
    model.compile(loss=loss, optimizer=optimizer, metrics=['accuracy'])
    return model

In [None]:
# LSTM model with the default settings of lstm()
model_lstm = lstm()

In [None]:
# test run of the LSTM model on the padded sequences, stopped early by callbacks_list
history_lstm = model_lstm.fit(
    X_train_rnn,
    y_train,
    epochs=100,
    batch_size=512,
    callbacks=callbacks_list,
    validation_data=(X_val_rnn, y_val),
)

#### CNN + GRU

In [None]:
def cnn_gru(nodes=64,window=5,activation='relu',pooling=3,optimizer='rmsprop',loss='categorical_crossentropy'):
    """
    Build and compile an Embedding -> Conv1D -> MaxPooling1D -> GRU -> softmax
    classifier.

    The vocabulary size and the sequence length are read from the
    module-level variables vocabulary_size and max_len; the output layer
    has 41 softmax units.

    :param nodes: embedding dimension, number of convolution filters and
        number of GRU units (default: 64)
    :param window: width of the convolution window (default: 5)
    :param activation: activation function of the convolution
        (default: 'relu')
    :param pooling: pool size of the max pooling layer (default: 3)
    :param optimizer: optimizer passed to compile (default: 'rmsprop')
    :param loss: loss passed to compile
        (default: 'categorical_crossentropy')

    :return: Returns the compiled Keras model
    """
    # fixed seeds so that repeated builds start from the same weights
    np.random.seed(1)
    tf.random.set_seed(2)
    model = models.Sequential()
    # the model is fed padded sequences of word indices (2-D), while Conv1D
    # needs a (steps, channels) input per sample; the embedding provides the
    # channel dimension, exactly as in the gru and lstm models
    model.add(layers.Embedding(vocabulary_size, nodes, input_length=max_len))
    model.add(layers.Conv1D(nodes,window,activation=activation))
    model.add(layers.MaxPooling1D(pooling))
    model.add(layers.GRU(nodes))
    model.add(layers.Dense(41, activation='softmax'))
    model.compile(loss=loss, optimizer=optimizer, metrics=['accuracy'])
    return model

In [None]:
# CNN + GRU model with the default settings of cnn_gru()
model_cnn_gru = cnn_gru()

In [None]:
# test run of the CNN + GRU model on the padded sequences, stopped early by callbacks_list
history_cnn_gru = model_cnn_gru.fit(
    X_train_rnn,
    y_train,
    epochs=100,
    batch_size=512,
    callbacks=callbacks_list,
    validation_data=(X_val_rnn, y_val),
)

#### CNN + LSTM

In [None]:
def cnn_lstm(nodes=64,window=5,activation='relu',pooling=3,optimizer='rmsprop',loss='categorical_crossentropy'):
    """
    Build and compile an Embedding -> Conv1D -> MaxPooling1D -> LSTM -> softmax
    classifier.

    The vocabulary size and the sequence length are read from the
    module-level variables vocabulary_size and max_len; the output layer
    has 41 softmax units.

    :param nodes: embedding dimension, number of convolution filters and
        number of LSTM units (default: 64)
    :param window: width of the convolution window (default: 5)
    :param activation: activation function of the convolution
        (default: 'relu')
    :param pooling: pool size of the max pooling layer (default: 3)
    :param optimizer: optimizer passed to compile (default: 'rmsprop')
    :param loss: loss passed to compile
        (default: 'categorical_crossentropy')

    :return: Returns the compiled Keras model
    """
    # fixed seeds so that repeated builds start from the same weights
    np.random.seed(1)
    tf.random.set_seed(2)
    model = models.Sequential()
    # the model is fed padded sequences of word indices (2-D), while Conv1D
    # needs a (steps, channels) input per sample; the embedding provides the
    # channel dimension, exactly as in the gru and lstm models
    model.add(layers.Embedding(vocabulary_size, nodes, input_length=max_len))
    model.add(layers.Conv1D(nodes,window,activation=activation))
    model.add(layers.MaxPooling1D(pooling))
    model.add(layers.LSTM(nodes))
    model.add(layers.Dense(41, activation='softmax'))
    model.compile(loss=loss, optimizer=optimizer, metrics=['accuracy'])
    return model

In [None]:
# CNN + LSTM model with the default settings of cnn_lstm()
model_cnn_lstm = cnn_lstm()

In [None]:
# test run of the CNN + LSTM model on the padded sequences, stopped early by callbacks_list
history_cnn_lstm = model_cnn_lstm.fit(
    X_train_rnn,
    y_train,
    epochs=100,
    batch_size=512,
    callbacks=callbacks_list,
    validation_data=(X_val_rnn, y_val),
)

## Train the chosen models

In [None]:
# Callbacks for the final training runs: a more patient early stopping
# (10 epochs instead of 1) plus a checkpoint that only overwrites its file
# when the validation accuracy improves.
callbacks_list_dnn = [
    callbacks.EarlyStopping(monitor='val_accuracy', patience=10),
    callbacks.ModelCheckpoint(filepath='dnn.h5', monitor='val_accuracy', save_best_only=True),
]

# same setup for the recurrent model, with its own checkpoint file
callbacks_list_rnn = [
    callbacks.EarlyStopping(monitor='val_accuracy', patience=10),
    callbacks.ModelCheckpoint(filepath='rnn.h5', monitor='val_accuracy', save_best_only=True),
]

In [None]:
# Final DNN training with checkpointing.
# NOTE(review): model_dnn is the object already fitted in the test section, so this
# run continues from those weights rather than from a fresh initialisation.
history_dnn = model_dnn.fit(
    X_train_dnn,
    y_train,
    epochs=100,
    batch_size=512,
    callbacks=callbacks_list_dnn,
    validation_data=(X_val_dnn, y_val),
)

In [None]:
# Final GRU training with checkpointing.
# NOTE(review): model_gru is the object already fitted in the test section, so this
# run continues from those weights rather than from a fresh initialisation.
history_gru = model_gru.fit(
    X_train_rnn,
    y_train,
    epochs=100,
    batch_size=256,
    callbacks=callbacks_list_rnn,
    validation_data=(X_val_rnn, y_val),
)

## Classify new data

In [None]:
# restore the weights stored by the ModelCheckpoint callbacks
# (written only on epochs where val_accuracy improved)
model_dnn.load_weights('dnn.h5')
model_gru.load_weights('rnn.h5')

In [None]:
# loss and accuracy of the DNN on the held-out test set
model_dnn.evaluate(X_test_dnn,y_test)

In [None]:
# loss and accuracy of the GRU model on the held-out test set
model_gru.evaluate(X_test_rnn,y_test)