<a href="https://colab.research.google.com/github/nihar-max/TensorFlow-Deeplearning/blob/master/text_classification_tensorflow_multiclass.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from __future__ import absolute_import, division, print_function, unicode_literals

import tensorflow as tf

import os
import datetime
import tensorflow_hub as hub
import numpy as np

import pandas as pd
from sklearn.model_selection import train_test_split

In [None]:
# Colab-only: mount Google Drive at /content/drive so files stored there can be read.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Location of the topic-classification CSV on the mounted Google Drive.
DATA_CSV = '/content/drive/MyDrive/NLP Project Board Infinity Capstone/Topic_Classification/topic_classification_data.csv'

# Load the full dataset into a DataFrame.
df = pd.read_csv(DATA_CSV)

In [None]:
# List the column names of the loaded frame.
df.columns

Index(['Unnamed: 0', 'content', 'label'], dtype='object')

In [None]:
# Number of rows per topic label, most frequent first.
df['label'].value_counts()

Politics     37797
Health       36952
Emotion      28362
Financial    22981
Sport         6077
Science       4634
Name: label, dtype: int64

In [None]:
# Preview the first five rows.
df.head()

Unnamed: 0.1,Unnamed: 0,content,label
0,0,I'm struggling to understand how I feel about ...,Emotion
1,1,NEW: Modi's promise to ban plastic straws this...,Financial
2,2,SURVEY: Middle-Income Americans Spending Less ...,Financial
3,3,WATCH: Wall Street ended lower after bank stoc...,Financial
4,4,Someone selling a ton of $SE puts to open Sep ...,Financial


### Train Test Split 80:20

In [None]:
# Discard every row that has a missing value in any column.
df = df.dropna()

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
# NOTE(review): the split is not stratified by label even though the classes are imbalanced.
X_train, X_test = train_test_split(
    df,
    test_size=0.2,
    random_state=111,
)


In [None]:
from sklearn.utils.class_weight import compute_class_weight

# 'balanced' mode compensates for the imbalanced label distribution:
# rarer labels receive larger weights. Weights are returned in the order
# of `classes`, i.e. the sorted unique label values.
label_values = df['label']
class_weights = compute_class_weight(
    class_weight='balanced',
    classes=np.unique(label_values),
    y=label_values,
)

## Class weight = balanced is used when we have imbalanced data

In [None]:
# Recompute the weights directly in the integer-encoding order used by the
# label lookup table (Politics=0, Health=1, Emotion=2, Financial=3, Sport=4,
# Science=5) instead of sorting the weight values. Sorting discards the
# class -> weight association and only lines up with that encoding while the
# labels happen to be listed from most to least frequent.
# Keep this list identical to the keys of the `target_encoding` lookup table.
LABEL_ORDER = ['Politics', 'Health', 'Emotion', 'Financial', 'Sport', 'Science']
class_weights = compute_class_weight(
    class_weight='balanced',
    classes=np.array(LABEL_ORDER),
    y=df['label'],
)

In [None]:
# Rows per label after missing values were dropped.
df['label'].value_counts()

Politics     37797
Health       36947
Emotion      28362
Financial    22981
Sport         6077
Science       4634
Name: label, dtype: int64

In [None]:
# Show the weights as a plain Python list.
class_weights.tolist()

[0.6032136589323668,
 0.6170911485822034,
 0.8038807794466775,
 0.9921094237268469,
 3.751796390762986,
 4.920083441231477]

- Note: balanced class weights give smaller weights to labels with more data points and larger weights to labels with fewer data points.

### Assign these weights to dict with index

In [None]:
# Key each weight by its position so the mapping can be passed to Keras as `class_weight`.
weights = dict(enumerate(class_weights))

weights

{0: 0.6032136589323668,
 1: 0.6170911485822034,
 2: 0.8038807794466775,
 3: 0.9921094237268469,
 4: 3.751796390762986,
 5: 4.920083441231477}

In [None]:
# Re-check the column names before building the tf.data pipelines.
df.columns

Index(['Unnamed: 0', 'content', 'label'], dtype='object')

In [None]:
def _as_text_label_dataset(frame):
  """Build a tf.data.Dataset of (content, label) pairs from a DataFrame."""
  return tf.data.Dataset.from_tensor_slices(
      (frame['content'].values, frame['label'].values)
  )

dataset_train = _as_text_label_dataset(X_train)
dataset_test = _as_text_label_dataset(X_test)

In [None]:
# Peek at the first five (content, label) pairs of the training set.
# The loop variable is named `label`, not `target`: notebook loop variables
# live in the global namespace, and `target` is also the name of the
# label-encoding function defined further down, so re-running this cell
# after that definition would overwrite the function with a tensor.
for text, label in dataset_train.take(5):
  print('content: {}, Target: {}'.format(text, label))

content: b'What are the signs and symptoms of Ghosal hematodiaphyseal dysplasia syndrome? The Human Phenotype Ontology provides the following list of signs and symptoms for Ghosal hematodiaphyseal dysplasia syndrome. If the information is available, the table below includes how often the symptom is seen in people with this condition. You can use the MedlinePlus Medical Dictionary to look up the definitions for these medical terms. Signs and Symptoms Approximate number of patients (when available) Abnormal cortical bone morphology 90% Abnormal form of the vertebral bodies 90% Abnormality of immune system physiology 90% Abnormality of pelvic girdle bone morphology 90% Abnormality of the femur 90% Abnormality of the metaphyses 90% Abnormality of the tibia 90% Bowing of the long bones 90% Craniofacial hyperostosis 90% Neurological speech impairment 7.5% Splenomegaly 7.5% Hyperostosis cranialis interna 5% Leukopenia 5% Autosomal recessive inheritance - Bone marrow hypocellularity - Diaphyse

In [None]:
# Peek at the first five (content, label) pairs of the test set.
# The loop variable is named `label`, not `target`: notebook loop variables
# live in the global namespace, and `target` is also the name of the
# label-encoding function defined further down, so re-running this cell
# after that definition would overwrite the function with a tensor.
for text, label in dataset_test.take(5):
  print('content: {}, Target: {}'.format(text, label))

content: b'French Open Winner!', Target: b'Sport'
content: b'In the days leading up to the Super Bowl, there will be plenty of talk about the Baltimore Ravens and the San Francisco 49ersBaltimore vs. San Francisco: A Health Showdown', Target: b'Health'
content: b'Open the curtains to let sunlight in Spend the first 30-60 minutes after waking outside or in a room with bright lights EncourageMorning Mood: Why Some People Wake Up Grumpy', Target: b'Health'
content: b"This month's top 10 list pulls evenly from the worlds of rock and pop.  On the former front, you've got bands like Bastille, Imagine Dragons, and NONONO.  On the latter, you've got dancefloor divas like Lady GaGa, Rihanna, and Katy Perry.The Top 10 Workout Songs for November 2013", Target: b'Health'
content: b"There's not too much public appetite to see Republicans take another shot at repealing Obamacare.Barely Anyone Is Mourning The Demise Of The GOP's Health Care Bill", Target: b'Politics'


### Convert Target label into Numerical representation using lookup.StaticHashTable

In [None]:
# Label frequencies, for reference when defining the label -> id encoding.
df['label'].value_counts()

Politics     37797
Health       36947
Emotion      28362
Financial    22981
Sport         6077
Science       4634
Name: label, dtype: int64

In [None]:
# Static string -> integer id mapping for the six topic labels.
# Any label missing from the table is mapped to -1.
label_keys = tf.constant(['Politics', 'Health', 'Emotion', 'Financial', 'Sport','Science'])
label_ids = tf.constant([0, 1, 2, 3, 4, 5])

table = tf.lookup.StaticHashTable(
    initializer=tf.lookup.KeyValueTensorInitializer(keys=label_keys, values=label_ids),
    default_value=tf.constant(-1),
    name="target_encoding"
)

@tf.function
def target(x):
  """Look up the integer id of each label string in `x` via the `table` lookup."""
  encoded = table.lookup(x)
  return encoded

In [None]:
def show_batch(dataset, size=5):
  """Print the raw text and the encoded label id of the first `size` elements of `dataset`."""
  for text, raw_label in dataset.take(size):
    print(text.numpy())
    print(target(raw_label).numpy())

In [None]:
# show_batch prints as it goes and returns nothing, so call it directly;
# wrapping it in print() would emit a trailing "None".
show_batch(dataset_test, 6)

b'French Open Winner!'
4
b'In the days leading up to the Super Bowl, there will be plenty of talk about the Baltimore Ravens and the San Francisco 49ersBaltimore vs. San Francisco: A Health Showdown'
1
b'Open the curtains to let sunlight in Spend the first 30-60 minutes after waking outside or in a room with bright lights EncourageMorning Mood: Why Some People Wake Up Grumpy'
1
b"This month's top 10 list pulls evenly from the worlds of rock and pop.  On the former front, you've got bands like Bastille, Imagine Dragons, and NONONO.  On the latter, you've got dancefloor divas like Lady GaGa, Rihanna, and Katy Perry.The Top 10 Workout Songs for November 2013"
1
b"There's not too much public appetite to see Republicans take another shot at repealing Obamacare.Barely Anyone Is Mourning The Demise Of The GOP's Health Care Bill"
0
b'"When you\'re given a project like this, you look at the universe of hazards," said Mansour Samadpour, CEO of IEH Laboratories, which was hired by Chipotle to tig

In [None]:
def fetch(text, labels):
  """Return `text` unchanged together with `labels` one-hot encoded over 6 classes."""
  encoded_ids = target(labels)
  one_hot_labels = tf.one_hot(encoded_ids, 6)
  return text, one_hot_labels

# One hot encoding for labels

In [None]:
# Apply the one-hot label encoding to both splits.
train_data_f, test_data_f = (
    split.map(fetch) for split in (dataset_train, dataset_test)
)

In [None]:
# Inspect the first (text, one-hot label) element of the mapped training set.
next(iter(train_data_f))

(<tf.Tensor: shape=(), dtype=string, numpy=b'What are the signs and symptoms of Ghosal hematodiaphyseal dysplasia syndrome? The Human Phenotype Ontology provides the following list of signs and symptoms for Ghosal hematodiaphyseal dysplasia syndrome. If the information is available, the table below includes how often the symptom is seen in people with this condition. You can use the MedlinePlus Medical Dictionary to look up the definitions for these medical terms. Signs and Symptoms Approximate number of patients (when available) Abnormal cortical bone morphology 90% Abnormal form of the vertebral bodies 90% Abnormality of immune system physiology 90% Abnormality of pelvic girdle bone morphology 90% Abnormality of the femur 90% Abnormality of the metaphyses 90% Abnormality of the tibia 90% Bowing of the long bones 90% Craniofacial hyperostosis 90% Neurological speech impairment 7.5% Splenomegaly 7.5% Hyperostosis cranialis interna 5% Leukopenia 5% Autosomal recessive inheritance - Bone

In [None]:
# Pull one batch of five examples for a quick look at texts and one-hot labels.
first_batch = train_data_f.batch(5)
train_data, train_labels = next(iter(first_batch))
train_data, train_labels

(<tf.Tensor: shape=(5,), dtype=string, numpy=
 array([b'What are the signs and symptoms of Ghosal hematodiaphyseal dysplasia syndrome? The Human Phenotype Ontology provides the following list of signs and symptoms for Ghosal hematodiaphyseal dysplasia syndrome. If the information is available, the table below includes how often the symptom is seen in people with this condition. You can use the MedlinePlus Medical Dictionary to look up the definitions for these medical terms. Signs and Symptoms Approximate number of patients (when available) Abnormal cortical bone morphology 90% Abnormal form of the vertebral bodies 90% Abnormality of immune system physiology 90% Abnormality of pelvic girdle bone morphology 90% Abnormality of the femur 90% Abnormality of the metaphyses 90% Abnormality of the tibia 90% Bowing of the long bones 90% Craniofacial hyperostosis 90% Neurological speech impairment 7.5% Splenomegaly 7.5% Hyperostosis cranialis interna 5% Leukopenia 5% Autosomal recessive inherit

### Model Creation

In [None]:
# Pre-trained token-based text embedding from TF Hub (Google NNLM, 128 dimensions).
# The layer takes raw strings and tokenises them itself, so no separate
# tokenizer step is needed. trainable=True lets the embedding be fine-tuned.
embedding = "https://tfhub.dev/google/tf2-preview/nnlm-en-dim128/1"
hub_layer = hub.KerasLayer(
    embedding,
    input_shape=[],
    output_shape=[128],
    dtype=tf.string,
    trainable=True,
)

# Sanity check: embed a single example.
hub_layer(train_data[:1])

<tf.Tensor: shape=(1, 128), dtype=float32, numpy=
array([[ 1.2053051 , -0.08322185, -0.05423499,  0.17402476, -0.21350299,
        -0.02764736,  0.05173753,  0.17013885, -0.07923746,  0.36298877,
         0.51904446,  0.00687981, -0.21523206,  0.03835091, -0.42452356,
        -0.34027886, -0.16229816, -0.11779678, -0.7106336 ,  1.3692514 ,
         0.4675015 ,  0.4009955 ,  0.06179972, -0.3235033 ,  0.23394756,
         0.24553956,  0.11262168,  0.15114212, -0.31338844,  0.17913602,
         0.34809113, -0.06872581,  0.14297883, -0.36870715,  0.10842151,
         0.11336714,  0.0268288 , -0.37192115,  0.2038767 ,  0.19604897,
        -0.4874661 , -0.33968228, -0.1736156 , -0.1789945 ,  0.23409852,
         0.05984085, -0.2895584 , -0.05227894, -0.58570474, -0.03630695,
         0.5857808 , -0.00930684,  0.52927077,  0.01397437,  0.41049212,
        -0.01233702,  0.25621188, -0.00622909,  0.37954834,  0.3421283 ,
         0.21881713,  0.21780081,  0.09485367, -0.41357788, -0.09060962,
 

In [None]:
# Architecture: the hub embedding layer, then four ReLU hidden layers
# (128, 128, 64, 32 units) each followed by 30% dropout, then a 6-way
# softmax output for multiclass classification.
stack = [hub_layer]
for width in (128, 128, 64, 32):
  stack.append(tf.keras.layers.Dense(width, activation='relu'))
  stack.append(tf.keras.layers.Dropout(0.3))
stack.append(tf.keras.layers.Dense(6, activation='softmax'))

model = tf.keras.Sequential(stack)

model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 keras_layer (KerasLayer)    (None, 128)               124642688 
                                                                 
 dense (Dense)               (None, 128)               16512     
                                                                 
 dropout (Dropout)           (None, 128)               0         
                                                                 
 dense_1 (Dense)             (None, 128)               16512     
                                                                 
 dropout_1 (Dropout)         (None, 128)               0         
                                                                 
 dense_2 (Dense)             (None, 64)                8256      
                                                                 
 dropout_2 (Dropout)         (None, 64)                0

In [None]:

# The model's final Dense layer already applies softmax, so the loss receives
# probabilities rather than logits and from_logits must be False. Declaring
# from_logits=True on softmax outputs is a mismatch that Keras warns about
# and that can apply softmax twice.
model.compile(
    optimizer='adam',
    loss=tf.keras.losses.CategoricalCrossentropy(from_logits=False),
    metrics=['accuracy'],
)

In [None]:
# Build the batched pipelines from the un-batched source datasets rather than
# re-assigning train_data_f/test_data_f from themselves, so re-running this
# cell does not stack a second shuffle/batch on top of the first.
# The shuffle buffer covers the whole training set, which tf.data needs for a
# uniform shuffle (a fixed 100000 buffer is smaller than the training split).
train_data_f = (
    dataset_train
    .map(fetch)
    .shuffle(dataset_train.cardinality())
    .batch(512)
)
test_data_f = dataset_test.map(fetch).batch(512)

In [None]:
# Train for 4 epochs, validating on the held-out split after every epoch.
# The per-class weights are applied to the training loss.
fit_options = dict(
    epochs=4,
    validation_data=test_data_f,
    class_weight=weights,
    verbose=1,
)
history = model.fit(train_data_f, **fit_options)

## Class_weight = weights for the assigned weight scores for all labels to treat them equal
# When we use class_weights then it use weighted cross entropy func in tensorflow

Epoch 1/4


  output, from_logits = _get_logits(


Epoch 2/4
Epoch 3/4
Epoch 4/4


- Train Acc: 96%
- Test Acc:  89%

In [None]:
# Number of test examples, read from the dataset's known cardinality instead of
# materialising every element into a Python list just to count them.
int(dataset_test.cardinality())

27360

In [None]:
# Evaluate in fixed-size batches rather than one batch sized by a hard-coded
# row count (27360), which goes stale whenever the split changes and pushes
# the whole test set through the model at once. The reported loss and
# accuracy are still aggregated over the full test set.
results = model.evaluate(dataset_test.map(fetch).batch(512), verbose=2)

print(results)

1/1 - 1s - loss: 0.4395 - accuracy: 0.8906 - 1s/epoch - 1s/step
[0.43951743841171265, 0.8905701637268066]


In [None]:
# Gather the entire test set as a single batch. The batch size comes from the
# dataset itself: the earlier hard-coded 45963 did not match the test split
# and would silently truncate the predictions if the split ever exceeded it.
full_test_batch = dataset_test.map(fetch).batch(dataset_test.cardinality())
test_data, test_labels = next(iter(full_test_batch))

# Predicted class probabilities for every test example.
y_pred = model.predict(test_data)



In [None]:
from sklearn.metrics import classification_report

# Collapse one-hot targets and predicted probabilities to class ids before scoring.
true_ids = test_labels.numpy().argmax(axis=1)
predicted_ids = y_pred.argmax(axis=1)
print(classification_report(true_ids, predicted_ids))

              precision    recall  f1-score   support

           0       0.91      0.90      0.91      7525
           1       0.93      0.88      0.90      7275
           2       0.95      0.96      0.96      5825
           3       0.85      0.86      0.86      4541
           4       0.80      0.85      0.83      1228
           5       0.50      0.69      0.58       966

    accuracy                           0.89     27360
   macro avg       0.83      0.86      0.84     27360
weighted avg       0.90      0.89      0.89     27360



In [None]:
from sklearn.metrics import confusion_matrix

# Rows are true class ids, columns are predicted class ids.
confusion_matrix(
    y_true=test_labels.numpy().argmax(axis=1),
    y_pred=y_pred.argmax(axis=1),
)

array([[6780,  112,   43,  313,  141,  136],
       [ 164, 6383,  149,  218,   41,  320],
       [  25,   75, 5595,   40,   16,   74],
       [ 292,  152,   55, 3888,   44,  110],
       [  88,   28,   10,   32, 1049,   21],
       [  63,  108,   40,   61,   23,  671]])