In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and echo the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    full_paths = (os.path.join(root, name) for name in files)
    for path in full_paths:
        print(path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

<h1 align="center">Crash Course on NLP using Tensorflow</h1>

In [None]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer   ## Generate dictionary of word encodings
from tensorflow.keras.preprocessing.sequence import pad_sequences

## Tokenization

In [None]:
# Tiny corpus used to demonstrate how the tokenizer builds its vocabulary.
sentences = [
    "I love my dog",
    "I love my cat",
]

# num_words caps how many of the most frequent words are used when encoding text.
tokenizer = Tokenizer(num_words=100)
# fit_on_texts scans the corpus and builds the vocabulary (it does not encode anything yet).
tokenizer.fit_on_texts(sentences)
# word_index is the resulting dictionary mapping each word to its integer id.
word_index = tokenizer.word_index
print(word_index)

If another sentence ending with an exclamation mark is added, the punctuation is stripped, so "dog!" is not counted as a different word from "dog". The tokenizer also normalizes whitespace and letter case, so "I" and "i" map to the same entry.

In [None]:
# Same corpus as before, plus variants that differ only in case ("i") and punctuation ("dog!").
sentences = [
    "I love my dog",
    "i love my cat",
    "You love my dog!",
]

# num_words caps how many of the most frequent unique words are used when encoding text.
tokenizer = Tokenizer(num_words=100)
# Build the vocabulary from the corpus.
tokenizer.fit_on_texts(sentences)
# Dictionary mapping each word to its integer id.
word_index = tokenizer.word_index
print(word_index)

**The words being encoded into numbers will help in their use with neural networks**

## Text to Sequence

`texts_to_sequences` produces sequences using the word encoding the tokenizer was fitted on; with any other encoding the resulting numbers would be meaningless. Words that were not previously added to the word index are lost in sequencing.

In [None]:
# Corpus with sentences of different lengths, used to show sentence -> integer sequence conversion.
sentences = [
    "I love my dog",
    "i love my cat",
    "You love my dog!",
    "Do you think my dog is amazing?",
]

# num_words caps how many of the most frequent unique words are used when encoding text.
tokenizer = Tokenizer(num_words=100)
# Build the vocabulary from the corpus.
tokenizer.fit_on_texts(sentences)
# Dictionary mapping each word to its integer id.
word_index = tokenizer.word_index
print(word_index, "\n")

# Replace every word of every sentence with its id from the fitted vocabulary.
sequences = tokenizer.texts_to_sequences(sentences)
print(sequences)

We can also replace an unseen word with a special value by setting the tokenizer's `oov_token` property, e.g. `oov_token = "<OOV>"`, which acts as a placeholder for out-of-vocabulary words.

In [None]:
# oov_token reserves a placeholder entry so that words not seen during fitting
# are mapped to it instead of being silently dropped from the sequence.
tokenizer = Tokenizer(num_words=100, oov_token="<OOV>")
# Build the vocabulary from the `sentences` corpus defined in the previous cell.
tokenizer.fit_on_texts(sentences)
# Dictionary mapping each word (including the OOV placeholder) to its integer id.
word_index = tokenizer.word_index
print(word_index, "\n")

# Sentences containing words ("really", "his", "food") that are absent from the fitted corpus.
test_data = [
    "I really love my dog",
    "My dog love his food",
]
test_seq = tokenizer.texts_to_sequences(test_data)
print(test_seq)

Here, unknown words are represented by the index 1, the value the tokenizer assigns to the `<OOV>` token.

## Padding
Before we can train with texts we need some level of uniformity of size. This is why padding is required

In [None]:
# Force every sequence to exactly 5 ids: shorter ones get padding appended at the end,
# longer ones have their trailing ids cut off.
padded = pad_sequences(
    sequences,
    maxlen=5,
    padding="post",
    truncating="post",
)
print(padded)

Each row of the matrix has the same length.
1. If you want the padding to be applied at the end of the sentence, set `padding = "post"` (by default, padding is added at the start).
2. If you want to limit the length, set `maxlen` accordingly. By default, sequences longer than `maxlen` are truncated from the start (`"pre"`).
3. To truncate from the end instead, set `truncating = "post"`.

# Sarcasm Detector (a small experiment)
> Sarcasm Detection using Hybrid Neural Network
> Rishabh Misra, Prahal Arora
> arXiv, August 2019

In [None]:
import json

# Load the sarcasm headlines dataset. Each record read below exposes the keys
# 'headline', 'is_sarcastic' and 'article_link'.
# The encoding is stated explicitly (JSON text is UTF-8) so the result does not
# depend on the platform's default locale.
with open("../input/sarcasm-data-news-headlines/sarcasm.json", 'r', encoding="utf-8") as f:
    datastore = json.load(f)

# Split the records into parallel lists: position k in each list refers to the same record.
headlines = [record['headline'] for record in datastore]
labels = [record['is_sarcastic'] for record in datastore]
urls = [record['article_link'] for record in datastore]

# No num_words cap here, so the whole vocabulary is used; unseen words map to "<OOV>".
tokenizer = Tokenizer(oov_token="<OOV>")
tokenizer.fit_on_texts(headlines)

# Ids are assigned by frequency, so very common words (in, to, and, ...) get the lowest ids.
word_index = tokenizer.word_index
print(len(word_index))
# Show only the first few entries: printing the full vocabulary floods the cell output.
print(dict(list(word_index.items())[:10]))

# Encode every headline and pad at the end so all rows share the length of the longest one.
sequences = tokenizer.texts_to_sequences(headlines)
padded = pad_sequences(sequences, padding='post')

In [None]:
# `padded` was built from `headlines`, so the matching text for row 2 is headlines[2]
# (the toy `sentences` list from the earlier tokenizer demo is unrelated to this array).
print("The sentence is: ",headlines[2],"\n")
print("The sentence sequence is: ",padded[2],"\n")
print("Shape of padded array (no. of sentences X words in each): ",padded.shape)