In [1]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer

In [2]:
# Small Bengali corpus used to fit the tokenizer in the cells below.
# The third sentence deliberately ends with punctuation ('!').
sentence = [
    "আমার একটি গরু আছে",
    "আমার একটি মহিষ আছে",
    "তোমার একটি গরু আছে!",
    "তুমি কি মনে কর আমার গরু তরতাজা",
]

In [3]:
# Learn the vocabulary from the corpus, then encode the very same sentences
# as lists of integer word ids.
tokenizer = Tokenizer(num_words=100)
tokenizer.fit_on_texts(sentence)

word_index = tokenizer.word_index  # mapping: word -> integer id
sequences = tokenizer.texts_to_sequences(sentence)

# One value per output line: first the vocabulary, then the encoded corpus.
print(word_index, sequences, sep='\n')

{'আমার': 1, 'একটি': 2, 'গরু': 3, 'আছে': 4, 'মহিষ': 5, 'তোমার': 6, 'তুমি': 7, 'কি': 8, 'মনে': 9, 'কর': 10, 'তরতাজা': 11}
[[1, 2, 3, 4], [1, 2, 5, 4], [6, 2, 3, 4], [7, 8, 9, 10, 1, 3, 11]]


In [4]:
# Encode sentences containing words the tokenizer did not see while fitting.
# No OOV token is configured at this point, so (as the printed result shows)
# those unseen words are simply missing from the encoded sequences.
test_data = [
    "আমার আসলেই একটু গরু আছে",
    "আমার গরু ঘাস খায়",
]
test_seq = tokenizer.texts_to_sequences(test_data)
print(test_seq)

[[1, 3, 4], [1, 3]]


In [5]:
# Rebuild the tokenizer with an explicit out-of-vocabulary placeholder.
# The printed vocabulary shows '<OOV>' taking id 1, which shifts every real
# word up by one compared with the tokenizer that had no OOV token.
tokenizer = Tokenizer(oov_token='<OOV>', num_words=100)
tokenizer.fit_on_texts(sentence)

word_index = tokenizer.word_index
sequences = tokenizer.texts_to_sequences(sentence)

print("\n".join(str(part) for part in (word_index, sequences)))

{'<OOV>': 1, 'আমার': 2, 'একটি': 3, 'গরু': 4, 'আছে': 5, 'মহিষ': 6, 'তোমার': 7, 'তুমি': 8, 'কি': 9, 'মনে': 10, 'কর': 11, 'তরতাজা': 12}
[[2, 3, 4, 5], [2, 3, 6, 5], [7, 3, 4, 5], [8, 9, 10, 11, 2, 4, 12]]


In [6]:
# Same unseen-word experiment as before, now with the OOV-aware tokenizer:
# words outside the fitted vocabulary are encoded as the '<OOV>' id (1)
# instead of being dropped, so the sequence lengths match the sentences.
test_data = [
    "আমার আসলেই একটু গরু আছে",
    "আমার গরু ঘাস খায়",
]
test_seq = tokenizer.texts_to_sequences(test_data)
print(test_seq)

[[2, 1, 1, 4, 5], [2, 4, 1, 1]]


# Padding

In [7]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

In [8]:
# Corpus for the padding experiments: three four-word sentences and one
# seven-word sentence, so the encoded sequences have unequal lengths.
sentence = [
    "আমার একটি গরু আছে",
    "আমার একটি মহিষ আছে",
    "তোমার একটি গরু আছে!",
    "তুমি কি মনে কর আমার গরু তরতাজা",
]

In [9]:
# Encode the corpus and pad it with pad_sequences' default settings.
# As the printed matrix shows, shorter sequences get zeros added at the
# front until every row is as long as the longest sequence.
tokenizer = Tokenizer(oov_token='<OOV>', num_words=100)
tokenizer.fit_on_texts(sentence)

word_index = tokenizer.word_index
sequences = tokenizer.texts_to_sequences(sentence)
padded = pad_sequences(sequences)

# Vocabulary, raw sequences and padded matrix, one after another.
print(word_index, sequences, padded, sep='\n')

{'<OOV>': 1, 'আমার': 2, 'একটি': 3, 'গরু': 4, 'আছে': 5, 'মহিষ': 6, 'তোমার': 7, 'তুমি': 8, 'কি': 9, 'মনে': 10, 'কর': 11, 'তরতাজা': 12}
[[2, 3, 4, 5], [2, 3, 6, 5], [7, 3, 4, 5], [8, 9, 10, 11, 2, 4, 12]]
[[ 0  0  0  2  3  4  5]
 [ 0  0  0  2  3  6  5]
 [ 0  0  0  7  3  4  5]
 [ 8  9 10 11  2  4 12]]


# Padding with `padding='post'`

In [10]:
# Same pipeline, but with padding='post': the zeros now go after the real
# word ids instead of before them (see the printed matrix).
tokenizer = Tokenizer(oov_token='<OOV>', num_words=100)
tokenizer.fit_on_texts(sentence)

word_index = tokenizer.word_index
sequences = tokenizer.texts_to_sequences(sentence)
padded = pad_sequences(
    sequences,
    padding='post',
)

print("\n".join(str(part) for part in (word_index, sequences, padded)))

{'<OOV>': 1, 'আমার': 2, 'একটি': 3, 'গরু': 4, 'আছে': 5, 'মহিষ': 6, 'তোমার': 7, 'তুমি': 8, 'কি': 9, 'মনে': 10, 'কর': 11, 'তরতাজা': 12}
[[2, 3, 4, 5], [2, 3, 6, 5], [7, 3, 4, 5], [8, 9, 10, 11, 2, 4, 12]]
[[ 2  3  4  5  0  0  0]
 [ 2  3  6  5  0  0  0]
 [ 7  3  4  5  0  0  0]
 [ 8  9 10 11  2  4 12]]


# Padding maxlen

In [11]:
# Fix the row length with maxlen=5 while padding at the end.
# Note in the output that the seven-word sentence is shortened by dropping
# its *leading* ids: only the padding side was changed here, not the
# truncation side.
tokenizer = Tokenizer(oov_token='<OOV>', num_words=100)
tokenizer.fit_on_texts(sentence)

word_index = tokenizer.word_index
sequences = tokenizer.texts_to_sequences(sentence)
padded = pad_sequences(sequences, maxlen=5, padding='post')

print(word_index, sequences, padded, sep='\n')

{'<OOV>': 1, 'আমার': 2, 'একটি': 3, 'গরু': 4, 'আছে': 5, 'মহিষ': 6, 'তোমার': 7, 'তুমি': 8, 'কি': 9, 'মনে': 10, 'কর': 11, 'তরতাজা': 12}
[[2, 3, 4, 5], [2, 3, 6, 5], [7, 3, 4, 5], [8, 9, 10, 11, 2, 4, 12]]
[[ 2  3  4  5  0]
 [ 2  3  6  5  0]
 [ 7  3  4  5  0]
 [10 11  2  4 12]]


In [12]:
# maxlen=5 with everything else left at its default: short sequences are
# zero-padded at the front and the long one loses its leading ids, as the
# printed matrix shows.
tokenizer = Tokenizer(oov_token='<OOV>', num_words=100)
tokenizer.fit_on_texts(sentence)

word_index = tokenizer.word_index
sequences = tokenizer.texts_to_sequences(sentence)
padded = pad_sequences(
    sequences,
    maxlen=5,
)

print("\n".join(str(part) for part in (word_index, sequences, padded)))

{'<OOV>': 1, 'আমার': 2, 'একটি': 3, 'গরু': 4, 'আছে': 5, 'মহিষ': 6, 'তোমার': 7, 'তুমি': 8, 'কি': 9, 'মনে': 10, 'কর': 11, 'তরতাজা': 12}
[[2, 3, 4, 5], [2, 3, 6, 5], [7, 3, 4, 5], [8, 9, 10, 11, 2, 4, 12]]
[[ 0  2  3  4  5]
 [ 0  2  3  6  5]
 [ 0  7  3  4  5]
 [10 11  2  4 12]]


# Padding and truncating

In [13]:
# Pad AND truncate at the end: with truncating='post' the seven-word
# sentence keeps its first five ids and loses the trailing ones.
tokenizer = Tokenizer(oov_token='<OOV>', num_words=100)
tokenizer.fit_on_texts(sentence)

word_index = tokenizer.word_index
sequences = tokenizer.texts_to_sequences(sentence)
padded = pad_sequences(
    sequences,
    maxlen=5,
    padding='post',
    truncating='post',
)

print(word_index, sequences, padded, sep='\n')

{'<OOV>': 1, 'আমার': 2, 'একটি': 3, 'গরু': 4, 'আছে': 5, 'মহিষ': 6, 'তোমার': 7, 'তুমি': 8, 'কি': 9, 'মনে': 10, 'কর': 11, 'তরতাজা': 12}
[[2, 3, 4, 5], [2, 3, 6, 5], [7, 3, 4, 5], [8, 9, 10, 11, 2, 4, 12]]
[[ 2  3  4  5  0]
 [ 2  3  6  5  0]
 [ 7  3  4  5  0]
 [ 8  9 10 11  2]]


In [14]:
# Spell out 'pre' for both padding and truncating. The printed matrix is
# identical to the one produced with maxlen=5 alone, i.e. zeros at the
# front and the long sentence cut from the front.
tokenizer = Tokenizer(oov_token='<OOV>', num_words=100)
tokenizer.fit_on_texts(sentence)

word_index = tokenizer.word_index
sequences = tokenizer.texts_to_sequences(sentence)
padded = pad_sequences(
    sequences,
    maxlen=5,
    padding='pre',
    truncating='pre',
)

print("\n".join(str(part) for part in (word_index, sequences, padded)))

{'<OOV>': 1, 'আমার': 2, 'একটি': 3, 'গরু': 4, 'আছে': 5, 'মহিষ': 6, 'তোমার': 7, 'তুমি': 8, 'কি': 9, 'মনে': 10, 'কর': 11, 'তরতাজা': 12}
[[2, 3, 4, 5], [2, 3, 6, 5], [7, 3, 4, 5], [8, 9, 10, 11, 2, 4, 12]]
[[ 0  2  3  4  5]
 [ 0  2  3  6  5]
 [ 0  7  3  4  5]
 [10 11  2  4 12]]


# Reading the sarcasm JSON file (pandas `read_json` / `json.load`)

In [20]:
# Disabled alternative loader: reads the sarcasm JSON file into a pandas
# DataFrame and previews its first rows.
# NOTE(review): the table displayed under this cell looks like saved output
# from a run when these lines were still active -- confirm, then either
# re-enable the cell or clear the stale output.
# import pandas as pd 
# datastore = pd.read_json('/Users/msjahid/Desktop/NLP/sarcasm.json')
# datastore.head()

Unnamed: 0,article_link,headline,is_sarcastic
0,https://www.huffingtonpost.com/entry/versace-b...,former versace store clerk sues over secret 'b...,0
1,https://www.huffingtonpost.com/entry/roseanne-...,the 'roseanne' revival catches up to our thorn...,0
2,https://local.theonion.com/mom-starting-to-fea...,mom starting to fear son's web series closest ...,1
3,https://politics.theonion.com/boehner-just-wan...,"boehner just wants wife to listen, not come up...",1
4,https://www.huffingtonpost.com/entry/jk-rowlin...,j.k. rowling wishes snape happy birthday in th...,0


In [23]:
import json

# Location of the sarcasm dataset. Kept in one named constant so the path
# only has to be changed here when the notebook runs on another machine.
SARCASM_JSON_PATH = '/Users/msjahid/Desktop/NLP/sarcasm.json'

# Decode explicitly as UTF-8: without `encoding`, open() falls back to the
# platform's locale encoding, which can mis-decode or reject non-ASCII
# characters in the JSON text on systems whose default is not UTF-8.
with open(SARCASM_JSON_PATH, 'r', encoding='utf-8') as f:
    # Parsed JSON document; later cells iterate over it record by record.
    datastore = json.load(f)

In [24]:
# Split the loaded records into three parallel lists (one entry per record):
# headline text, sarcasm flag and article URL.
# The name `lable` is kept exactly as spelled in case later cells use it.
sentences, lable, urls = [], [], []

# Record field -> list that collects it, in the order the fields are read.
_targets = (
    ('headline', sentences),
    ('is_sarcastic', lable),
    ('article_link', urls),
)

for item in datastore:
    for _field, _bucket in _targets:
        _bucket.append(item[_field])