In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere below the Kaggle input directory.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
! head /kaggle/input/10dataset-text-document-classification/business/business_80.txt


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm.auto import tqdm
import tensorflow as tf
from transformers import BertTokenizer
from sklearn.model_selection import train_test_split

In [None]:
## Clean every text file: keep only alphanumeric tokens, lower-case them and drop English stopwords (stemming is currently disabled).

from nltk.corpus import stopwords
import nltk

# English stopword list from NLTK; preprocess() drops every token found in it.
stop_words = stopwords.words('english')
# Porter stemmer instance; not called anywhere at the moment (stemming is disabled).
prt = nltk.stem.PorterStemmer()

def preprocess(document_path, stopword_vocab=None):
    """Read a text file and return its cleaned tokens joined by single spaces.

    The text is split on any whitespace, tokens that are not purely
    alphanumeric are discarded, the survivors are lower-cased, and stopwords
    are removed.

    Args:
        document_path: Path of the text file to read.
        stopword_vocab: Optional iterable of lower-case words to remove. When
            omitted, the module-level ``stop_words`` list is used.

    Returns:
        A string containing the remaining tokens separated by one space.
    """
    if stopword_vocab is None:
        stopword_vocab = stop_words
    # A set gives constant-time membership tests instead of scanning a list
    # once per token.
    blocked = frozenset(stopword_vocab)

    # Decode explicitly so the result does not depend on the platform default
    # encoding. Undecodable bytes become U+FFFD, which fails isalnum() below,
    # so a damaged token is dropped instead of aborting the whole corpus load.
    with open(document_path, 'r', encoding='utf-8', errors='replace') as file:
        document = file.read()

    # split() with no argument also breaks on newlines and tabs. Splitting on
    # a single space fused words across line breaks ("change\n\nThe") into one
    # non-alphanumeric token, which silently removed both words.
    tokens = document.split()

    lowered = (token.lower() for token in tokens if token.isalnum())
    return " ".join(token for token in lowered if token not in blocked)
import os
# One [title, cleaned text, class] record per file under the input directory.
# The class is the lower-cased file-name prefix before the first underscore,
# and the title is the file name itself.
Data = [
    [file_name, preprocess(os.path.join(folder, file_name)), file_name.split('_')[0].lower()]
    for folder, _, file_names in os.walk('/kaggle/input')
    for file_name in file_names
]

df = pd.DataFrame(Data, columns=['Title', 'Document', 'Class'])

In [None]:
# Preview the first rows of the Title / Document / Class table.
df.head()


In [None]:
# Persist the cleaned documents; at this point 'Class' still holds the string label.
df.to_csv("documents_raw.csv", index=False)

In [None]:
# Load the pretrained tokenizer for the 'bert-base-cased' checkpoint.
# NOTE(review): preprocess() lower-cases every document before it reaches this
# cased tokenizer — confirm that combination is intended.
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

In [None]:
# Sanity check: tokenize the first document with the same settings used for
# the full corpus (truncate/pad to 256 tokens, TensorFlow tensors returned).
token = tokenizer.encode_plus(
    df['Document'].iloc[0], 
    max_length=256, 
    truncation=True, 
    padding='max_length', 
    add_special_tokens=True,
    return_tensors='tf'
)

In [None]:
# Pre-allocate one 256-wide row per document for token ids and attention masks.
# Both hold integer values and feed Keras inputs declared as int32, so allocate
# them as int32 rather than NumPy's float64 default (half the memory and no
# dtype round-trip).
X_input_ids = np.zeros((len(df), 256), dtype=np.int32)
X_attn_masks = np.zeros((len(df), 256), dtype=np.int32)

In [None]:
def generate_training_data(df, ids, masks, tokenizer):
    """Tokenize every document and write the result into ``ids`` and ``masks``.

    Args:
        df: Table with a 'Document' column of strings; row ``i`` of the output
            arrays corresponds to the ``i``-th document.
        ids: 2-D array filled in place with token ids. Its second dimension is
            used as the padding/truncation length.
        masks: 2-D array filled in place with attention masks; expected to
            have the same shape as ``ids``.
        tokenizer: Object exposing ``encode_plus`` whose result has
            ``input_ids`` and ``attention_mask`` attributes.

    Returns:
        The same ``ids`` and ``masks`` objects, now populated.
    """
    # Take the sequence length from the destination array so the function
    # stays correct when the arrays are not 256 wide.
    max_length = ids.shape[1]
    documents = df['Document']
    # Wrap the sized column (not the enumerate object) so the progress bar
    # knows the total and can show a percentage / ETA.
    for row, text in enumerate(tqdm(documents, total=len(documents))):
        tokenized_text = tokenizer.encode_plus(
            text,
            max_length=max_length,
            truncation=True,
            padding='max_length',
            add_special_tokens=True,
            return_tensors='tf'
        )
        ids[row, :] = tokenized_text.input_ids
        masks[row, :] = tokenized_text.attention_mask
    return ids, masks

In [None]:
# Tokenize the whole corpus; the arrays are filled in place and also returned.
X_input_ids, X_attn_masks = generate_training_data(df, X_input_ids, X_attn_masks, tokenizer)

In [None]:
# One-hot label matrix: one row per document, 10 columns.
# NOTE(review): the column count is hard-coded; it presumably has to match the
# number of distinct classes in df['Class'] — confirm against the dataset.
labels = np.zeros((len(df), 10))
labels.shape

In [None]:
# Replace the string class names with integer codes.
# Only element [0] (the codes) of pd.factorize's result is kept; the matching
# array of unique class names (element [1]) is discarded here.
df['Class'] = pd.factorize(df['Class'])[0]
df.head()

In [None]:
# Persist the table again, now with integer class codes in 'Class'.
df.to_csv("documents.csv", index=False)

In [None]:
# One-hot encode: for each row i, set column df['Class'][i] to 1.
labels[np.arange(len(df)), df['Class'].values.astype(int)] = 1

In [None]:
# Build a tf.data pipeline in which each element is one
# (input_ids row, attention_mask row, one-hot label row) triple.
dataset = tf.data.Dataset.from_tensor_slices((X_input_ids, X_attn_masks, labels))
dataset.take(1) # a dataset restricted to the first element
     

In [None]:
def ModelDatasetMapFunction(input_ids, attn_masks, labels):
    """Repack one dataset element as a ``(features, labels)`` pair.

    Args:
        input_ids: Token ids for the element.
        attn_masks: Attention mask for the element.
        labels: Target for the element, passed through unchanged.

    Returns:
        A 2-tuple: a dict with keys 'input_ids' and 'attention_mask', followed
        by ``labels``.
    """
    features = dict(input_ids=input_ids, attention_mask=attn_masks)
    return features, labels

In [None]:
# Convert every (ids, mask, label) triple into ({'input_ids', 'attention_mask'}, label).
dataset = dataset.map(ModelDatasetMapFunction)

In [None]:
# Inspect the dataset restricted to its first element after the remapping.
dataset.take(1)

In [None]:
# Shuffle once, then group into batches of 16 (incomplete final batch dropped).
# reshuffle_each_iteration=False is essential: the train/validation split below
# is made with take()/skip() on this dataset, and each of those re-iterates it.
# With the default (reshuffle on every iteration) the examples land on
# different sides of the split each time, so validation documents leak into
# training. The fixed seed makes the split reproducible across runs.
dataset = dataset.shuffle(10000, seed=42, reshuffle_each_iteration=False).batch(16, drop_remainder=True)

In [None]:
# Fraction of the batches reserved for training.
p = 0.8
# len(df)//16 is the number of full 16-document batches; train_size is 80% of
# that count, rounded down, i.e. a number of batches rather than documents.
train_size = int((len(df)//16)*p)

In [None]:
# Display the number of training batches.
train_size

In [None]:
# The first train_size batches are used for training, the rest for validation.
# NOTE(review): both are views over the same shuffled dataset, so the two sets
# are only disjoint if the shuffle yields the same order on every iteration.
train_dataset = dataset.take(train_size)
val_dataset = dataset.skip(train_size)


In [None]:
from transformers import TFBertModel

In [None]:
# Load the pretrained BERT encoder.
# NOTE(review): the name `model` is rebound to the full Keras classifier in the
# model-building cell, which reads `model.bert`; re-running that cell without
# re-running this one will presumably fail.
model = TFBertModel.from_pretrained('bert-base-cased')

In [None]:
from tensorflow.keras import regularizers
# Two integer inputs of length 256 whose names match the feature-dict keys
# produced by ModelDatasetMapFunction.
input_ids = tf.keras.layers.Input(shape=(256,), name='input_ids', dtype='int32')
attn_masks = tf.keras.layers.Input(shape=(256,), name='attention_mask', dtype='int32')
# Run the BERT encoder and keep output [1].
bert_embds = model.bert(input_ids, attention_mask=attn_masks)[1] # 0 -> activation layer (3D), 1 -> pooled output layer (2D)
# Classification head: L2-regularized 512-unit ReLU layer, 20% dropout, then a
# 10-way softmax.
intermediate_layer = tf.keras.layers.Dense(512, activation='relu', name='intermediate_layer', kernel_regularizer=regularizers.l2(0.01))(bert_embds)
drop_out = tf.keras.layers.Dropout(0.2, name="dropout")(intermediate_layer)
output_layer = tf.keras.layers.Dense(10, activation='softmax', name='output_layer', kernel_regularizer=regularizers.l2(0.01))(drop_out) # softmax -> calcs probs of classes

# Rebinds `model`: from here on it is the end-to-end classifier, not the bare encoder.
model = tf.keras.Model(inputs=[input_ids, attn_masks], outputs=output_layer)
model.summary()

In [None]:
# Exponentially decaying learning rate starting at 1e-5.
# NOTE(review): decay_rate=1e-6 means the rate is scaled by a factor of 1e-6
# per 10000 steps, which is unusually aggressive — confirm this was not meant
# to be a value close to 1 (or a weight-decay setting).
learning_rate_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
    initial_learning_rate=1e-5, decay_rate=1e-6, decay_steps=10000)

optim = tf.keras.optimizers.Adam(learning_rate=learning_rate_schedule)

# Categorical cross-entropy pairs with the one-hot label matrix built earlier.
loss_func = tf.keras.losses.CategoricalCrossentropy()

# Metrics are reported in this order by fit()/evaluate(), after the loss.
acc = tf.keras.metrics.CategoricalAccuracy('accuracy')
precision = tf.keras.metrics.Precision(name='precision')
recall = tf.keras.metrics.Recall(name='recall')

model.compile(optimizer=optim, loss=loss_func, metrics=[acc, precision, recall])

In [None]:
# Train for 5 epochs, validating after each one; `hist.history` holds the
# per-epoch values that are plotted further down.
hist =model.fit(
    train_dataset,
    validation_data=val_dataset,
    epochs=5
)

In [None]:
# Save the trained classifier to a single HDF5 file in the working directory.
model.save("textclassification.h5")

In [None]:
# Report the loss and the compiled metrics on the validation batches.
model.evaluate(val_dataset)

In [None]:
import matplotlib.pyplot as plt
# Per-epoch accuracy values recorded by model.fit().
train_acc = hist.history['accuracy']
val_acc = hist.history['val_accuracy']

# Plot the epoch vs accuracy graph, numbering epochs from 1.
for curve, curve_label in ((train_acc, 'Training Accuracy'), (val_acc, 'Validation Accuracy')):
    epoch_numbers = range(1, len(curve) + 1)
    plt.plot(epoch_numbers, curve, label=curve_label)
plt.title('Epoch vs Accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend()
plt.show()

# evaluate() returns the loss followed by the compiled metrics, in order.
loss, accuracy, precision, recall = model.evaluate(val_dataset)
# Four line breaks to separate the report from the progress output.
print("\n\n\n")
print("loss is : ", loss)

# Metrics are shown as whole-number percentages.
print("accuracy is: ", f"{round(accuracy*100)}%")
print("recall is: ", f"{round(recall*100)}%")
print("Precision is : ", f"{round(precision*100)}%")

In [None]:
# Save the trained classifier a second time, under the path 'model'.
model.save('model')

In [None]:
# Tokenizer for inference; same 'bert-base-cased' checkpoint as used for the training data.
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

def prepare_data(input_text, tokenizer, max_length=256):
    """Tokenize one string into the feature dict the classifier consumes.

    Args:
        input_text: Text to classify.
        tokenizer: Object exposing ``encode_plus`` whose result has
            ``input_ids`` and ``attention_mask`` attributes.
        max_length: Padding/truncation length. The default of 256 matches the
            classifier's input layers.

    Returns:
        Dict with keys 'input_ids' and 'attention_mask', both cast to int32.

    NOTE(review): the training documents went through preprocess()
    (lower-casing, stopword and punctuation-token removal) before
    tokenization, whereas this function tokenizes the text exactly as given.
    Confirm whether callers should clean the text the same way first.
    """
    token = tokenizer.encode_plus(
        input_text,
        max_length=max_length,
        truncation=True,
        padding='max_length',
        add_special_tokens=True,
        return_tensors='tf'
    )
    # Token ids and mask values are integers and the model inputs are declared
    # int32; the previous float64 cast only forced a conversion back.
    return {
        'input_ids': tf.cast(token.input_ids, tf.int32),
        'attention_mask': tf.cast(token.attention_mask, tf.int32)
    }

def make_prediction(model, processed_data, classes=('business', 'Entertainment', 'food', 'Graphichs', 'historical','medical','politcis', 'space','sport','technology')):
    """Return the index of the highest-scoring class for a single example.

    Args:
        model: Object whose ``predict(processed_data)`` returns one row of
            class scores per example.
        processed_data: Model input for exactly one example; only the first
            row of the prediction is used.
        classes: Accepted for backward compatibility and not used. The default
            is an immutable tuple so it cannot be altered between calls.

    Returns:
        The winning class index as a plain Python ``int``.
    """
    probs = model.predict(processed_data)[0]
    # int() converts the NumPy scalar so the result prints, serializes and
    # type-checks like an ordinary integer.
    return int(np.argmax(probs))

In [None]:
# Build the class-code -> class-name lookup from the data instead of hard-coding it.
# The integer codes in df['Class'] were assigned in order of first appearance
# while walking the input directory, and that traversal order is not
# guaranteed, so a fixed table can silently mislabel predictions. Every Title
# is the original file name, whose lower-cased prefix before the first '_' is
# the class name the code was derived from.
_class_names = df['Title'].str.split('_').str[0].str.lower()
dict1 = {int(code): name for code, name in zip(df['Class'], _class_names)}

In [None]:
text = """Bank voted 8-1 for no rate change

The decision to keep interest rates on hold at 4.75% earlier this month was passed 8-1 by the Bank of England's rate-setting body, minutes have shown.

One member of the Bank's Monetary Policy Committee (MPC) - Paul Tucker - voted to raise rates to 5%. The news surprised some analysts who had expected the latest minutes to show another unanimous decision. Worries over growth rates and consumer spending were behind the decision to freeze rates, the minutes showed. The Bank's latest inflation report, released last week, had noted that the main reason inflation might fall was weaker consumer spending.

However, MPC member Paul Tucker voted for a quarter point rise in interest rates to 5%. He argued that economic growth was picking up, and that the equity, credit and housing markets had been stronger than expected.

The Bank's minutes said that risks to the inflation forecast were "sufficiently to the downside" to keep rates on hold at its latest meeting. However, the minutes added: "Some members noted that an increase might be warranted in due course if the economy evolved in line with the central projection". Ross Walker, UK economist at Royal Bank of Scotland, said he was surprised that a dissenting vote had been made so soon. He said the minutes appeared to be "trying to get the market to focus on the possibility of a rise in rates". "If the economy pans out as they expect then they are probably going to have to hike rates." However, he added, any rate increase is not likely to happen until later this year, with MPC members likely to look for a more sustainable pick up in consumer spending before acting.
"""

In [None]:
# Classify the sample article: tokenize it, run the model, and print the
# predicted class index (an integer, not a class name).
input_text = text
processed_data = prepare_data(input_text, tokenizer)
result = make_prediction(model, processed_data=processed_data)
print(f"Predicted model: {result}")
     

In [None]:
# Translate the predicted class index into its label via dict1.
output = dict1[result]

In [None]:
# Display the raw predicted class index.
result

In [None]:
# Print the predicted class label.
print(output)