In [123]:
# Training corpus: a short passage about data analysis. Each newline-separated
# line is later tokenized on its own when the training sequences are built.
text ="""Data analysis is the process of inspecting, cleansing, transforming, and modeling data with the goal of discovering useful information, informing conclusions, and supporting decision-making.
Data analysis has multiple facets and approaches, encompassing diverse techniques under a variety of names, and is used in different business, science, and social science domains.
In today's business world, data analysis plays a role in making decisions more scientific and helping businesses operate more effectively.
Data mining is a particular data analysis technique that focuses on statistical modeling and knowledge discovery for predictive rather than purely descriptive purposes, while business intelligence covers data analysis that relies heavily on aggregation, focusing mainly on business information.
In statistical applications, data analysis can be divided into descriptive statistics, exploratory data analysis (EDA), and confirmatory data analysis (CDA).
EDA focuses on discovering new features in the data while CDA focuses on confirming or falsifying existing hypotheses.
Predictive analytics focuses on the application of statistical models for predictive forecasting or classification, while text analytics applies statistical, linguistic, and structural techniques to extract and classify information from textual sources, a variety of unstructured data.
All of the above are varieties of data analysis."""

In [124]:
import tensorflow as tf  
from tensorflow.keras.preprocessing.text import Tokenizer  

In [125]:
 # Instantiate the Keras tokenizer with its default settings.
tokenizer = Tokenizer()

In [126]:
# Learn the vocabulary (word -> integer index) from the corpus, passed as a
# single document.
tokenizer.fit_on_texts([text]) 

In [127]:
# Number of distinct words in the fitted vocabulary.
len(tokenizer.word_index)

111

In [128]:
# Build the training prefixes: for every line of the corpus, collect each
# leading slice of its token ids that holds at least two tokens, so the last
# id of a slice can serve as the "next word" for the ids before it.
input_sequence = []
for line in text.split('\n'):
    token_ids = tokenizer.texts_to_sequences([line])[0]
    input_sequence.extend(
        token_ids[:end] for end in range(2, len(token_ids) + 1)
    )

In [129]:
# Inspect the generated prefix sequences of token ids.
input_sequence

[[1, 3],
 [1, 3, 12],
 [1, 3, 12, 6],
 [1, 3, 12, 6, 30],
 [1, 3, 12, 6, 30, 4],
 [1, 3, 12, 6, 30, 4, 31],
 [1, 3, 12, 6, 30, 4, 31, 32],
 [1, 3, 12, 6, 30, 4, 31, 32, 33],
 [1, 3, 12, 6, 30, 4, 31, 32, 33, 2],
 [1, 3, 12, 6, 30, 4, 31, 32, 33, 2, 16],
 [1, 3, 12, 6, 30, 4, 31, 32, 33, 2, 16, 1],
 [1, 3, 12, 6, 30, 4, 31, 32, 33, 2, 16, 1, 34],
 [1, 3, 12, 6, 30, 4, 31, 32, 33, 2, 16, 1, 34, 6],
 [1, 3, 12, 6, 30, 4, 31, 32, 33, 2, 16, 1, 34, 6, 35],
 [1, 3, 12, 6, 30, 4, 31, 32, 33, 2, 16, 1, 34, 6, 35, 4],
 [1, 3, 12, 6, 30, 4, 31, 32, 33, 2, 16, 1, 34, 6, 35, 4, 17],
 [1, 3, 12, 6, 30, 4, 31, 32, 33, 2, 16, 1, 34, 6, 35, 4, 17, 36],
 [1, 3, 12, 6, 30, 4, 31, 32, 33, 2, 16, 1, 34, 6, 35, 4, 17, 36, 13],
 [1, 3, 12, 6, 30, 4, 31, 32, 33, 2, 16, 1, 34, 6, 35, 4, 17, 36, 13, 37],
 [1, 3, 12, 6, 30, 4, 31, 32, 33, 2, 16, 1, 34, 6, 35, 4, 17, 36, 13, 37, 38],
 [1,
  3,
  12,
  6,
  30,
  4,
  31,
  32,
  33,
  2,
  16,
  1,
  34,
  6,
  35,
  4,
  17,
  36,
  13,
  37,
  38,
  2],
 [1,
 

In [158]:
# Length of the longest prefix sequence (the same value is stored in max_len).
max(len(seq) for seq in input_sequence)

39

In [130]:
# Longest prefix length; every sequence is padded to this width.
max_len = max(map(len, input_sequence))

In [131]:
from tensorflow.keras.preprocessing.sequence import pad_sequences 

In [132]:
# Left-pad every prefix with zeros so all rows share the width max_len and the
# real tokens stay aligned at the end of each row.
padded_input_sequenc = pad_sequences(
    input_sequence,
    maxlen=max_len,
    padding='pre',
)

In [133]:
# Inspect the zero-padded sequence matrix.
padded_input_sequenc

array([[  0,   0,   0, ...,   0,   1,   3],
       [  0,   0,   0, ...,   1,   3,  12],
       [  0,   0,   0, ...,   3,  12,   6],
       ...,
       [  0,   0,   0, ..., 110, 111,   4],
       [  0,   0,   0, ..., 111,   4,   1],
       [  0,   0,   0, ...,   4,   1,   3]], shape=(185, 39), dtype=int32)

In [134]:
# Split each padded row into model input (all but the last token id) and
# target (the last token id, i.e. the word to predict).
x, y = padded_input_sequenc[:, :-1], padded_input_sequenc[:, -1]

In [135]:
# Inspect the input matrix (padded prefixes without their final token).
x

array([[  0,   0,   0, ...,   0,   0,   1],
       [  0,   0,   0, ...,   0,   1,   3],
       [  0,   0,   0, ...,   1,   3,  12],
       ...,
       [  0,   0,   0, ..., 109, 110, 111],
       [  0,   0,   0, ..., 110, 111,   4],
       [  0,   0,   0, ..., 111,   4,   1]], shape=(185, 38), dtype=int32)

In [136]:
# Inspect the integer targets (the final token id of every padded row).
y

array([  3,  12,   6,  30,   4,  31,  32,  33,   2,  16,   1,  34,   6,
        35,   4,  17,  36,  13,  37,  38,   2,  39,  40,  18,   3,  41,
        42,  43,   2,  44,  45,  46,  19,  47,   8,  20,   4,  48,   2,
        12,  49,   7,  50,   9,  21,   2,  51,  21,  52,  53,   9,  54,
         1,   3,  55,   8,  56,   7,  18,  57,  22,  58,   2,  59,  60,
        61,  22,  62,  63,  12,   8,  64,   1,   3,  65,  23,  10,   5,
        11,  16,   2,  66,  67,  24,  14,  68,  69,  70,  25,  71,  15,
         9,  72,  73,   1,   3,  23,  74,  75,   5,  76,  77,  78,   5,
         9,  13,  11,  79,   1,   3,  80,  81,  82,  83,  25,  84,  85,
         1,   3,  26,   2,  86,   1,   3,  27,  10,   5,  17,  87,  88,
         7,   6,   1,  15,  27,  10,   5,  89,  28,  90,  91,  92,  29,
        10,   5,   6,  93,   4,  11,  94,  24,  14,  95,  28,  96,  15,
        97,  29,  98,  11,  99,   2, 100,  19, 101, 102,   2, 103,  13,
       104, 105, 106,   8,  20,   4, 107,   1,   4,   6, 109, 11

In [137]:
# Show the word -> index vocabulary learned by the tokenizer.
tokenizer.word_index

{'data': 1,
 'and': 2,
 'analysis': 3,
 'of': 4,
 'on': 5,
 'the': 6,
 'in': 7,
 'a': 8,
 'business': 9,
 'focuses': 10,
 'statistical': 11,
 'is': 12,
 'information': 13,
 'predictive': 14,
 'while': 15,
 'modeling': 16,
 'discovering': 17,
 'making': 18,
 'techniques': 19,
 'variety': 20,
 'science': 21,
 'more': 22,
 'that': 23,
 'for': 24,
 'descriptive': 25,
 'eda': 26,
 'cda': 27,
 'or': 28,
 'analytics': 29,
 'process': 30,
 'inspecting': 31,
 'cleansing': 32,
 'transforming': 33,
 'with': 34,
 'goal': 35,
 'useful': 36,
 'informing': 37,
 'conclusions': 38,
 'supporting': 39,
 'decision': 40,
 'has': 41,
 'multiple': 42,
 'facets': 43,
 'approaches': 44,
 'encompassing': 45,
 'diverse': 46,
 'under': 47,
 'names': 48,
 'used': 49,
 'different': 50,
 'social': 51,
 'domains': 52,
 "today's": 53,
 'world': 54,
 'plays': 55,
 'role': 56,
 'decisions': 57,
 'scientific': 58,
 'helping': 59,
 'businesses': 60,
 'operate': 61,
 'effectively': 62,
 'mining': 63,
 'particular': 64,
 't

In [138]:
from tensorflow.keras.utils import to_categorical 

In [139]:
# One-hot encode the next-word targets.
# The targets are taken from the padded matrix again (instead of re-using y)
# so that re-running this cell does not one-hot encode an already encoded
# array. The class count is derived from the vocabulary: word indices start
# at 1 and index 0 is the padding value, hence the +1.
y = to_categorical(
    padded_input_sequenc[:, -1],
    num_classes=len(tokenizer.word_index) + 1,
)

In [140]:
# Shape of the one-hot targets: (number of samples, number of classes).
y.shape

(185, 112)

In [141]:
# Shape of the inputs: (number of samples, tokens per sample).
x.shape

(185, 38)

## Model building

In [152]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense

In [153]:
# Derive the layer sizes from the prepared data instead of relying on
# variables that are not defined in any cell of this notebook, so the model
# can be built after "Restart Kernel -> Run All".
vocab_size = len(tokenizer.word_index) + 1  # +1 because index 0 is the padding value
num_classes = y.shape[1]                    # width of the one-hot targets
input_length = x.shape[1]                   # token ids fed to the model per sample

# Next-word model: token embedding -> LSTM -> softmax over the classes.
model = Sequential()
model.add(Embedding(vocab_size, 150))
model.add(LSTM(150))
model.add(Dense(num_classes, activation='softmax'))

# Build model with input shape (batch_size can be None)
model.build(input_shape=(None, input_length))
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.summary()


In [154]:
# Re-check the input shape right before training.
x.shape

(185, 38)

In [155]:
# Re-check the target shape right before training.
y.shape

(185, 112)

In [156]:
# Train for 100 epochs on the (prefix, next word) pairs; the returned History
# object is shown as the cell output.
model.fit(x,y,epochs=100)

Epoch 1/100
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 24ms/step - accuracy: 0.0216 - loss: 4.7153   
Epoch 2/100
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step - accuracy: 0.0595 - loss: 4.6592
Epoch 3/100
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step - accuracy: 0.0378 - loss: 4.5254
Epoch 4/100
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step - accuracy: 0.0595 - loss: 4.4200
Epoch 5/100
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step - accuracy: 0.0757 - loss: 4.3507
Epoch 6/100
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step - accuracy: 0.0703 - loss: 4.2985
Epoch 7/100
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step - accuracy: 0.0811 - loss: 4.2455
Epoch 8/100
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step - accuracy: 0.0973 - loss: 4.1663
Epoch 9/100
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m

<keras.src.callbacks.history.History at 0x7a7d61d58a50>

## Testing the model 

In [175]:
# Seed text whose next word should be predicted.
text2 = "analysis"

# Tokenization
token_text = tokenizer.texts_to_sequences([text2])[0]

# Padding: use the same width the model was trained on (x.shape[1]) rather
# than a hard-coded length, so inference inputs match the training inputs and
# no extra leading padding steps are fed to the LSTM.
padded_text = pad_sequences([token_text], maxlen=x.shape[1], padding='pre')

# Probability distribution over the classes for the next word.
model.predict(padded_text)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 41ms/step


array([[1.79884719e-05, 4.01552825e-04, 2.39735848e-04, 5.57572639e-04,
        3.93632352e-02, 2.67465394e-02, 7.10351095e-02, 6.86546904e-04,
        1.48791145e-03, 1.14229377e-02, 8.34502205e-02, 1.45120934e-01,
        1.16130523e-01, 9.42237384e-05, 8.62014640e-05, 1.87296318e-04,
        6.91210662e-05, 8.82179476e-03, 1.69715422e-04, 1.45535250e-05,
        2.68838194e-04, 9.48604647e-05, 9.78864409e-05, 1.00664317e-03,
        4.84913064e-04, 3.60708538e-04, 1.10477326e-04, 6.65752566e-04,
        4.06771396e-05, 1.38352349e-01, 1.69217389e-03, 7.12027017e-04,
        2.69648939e-04, 1.50991560e-04, 1.89956809e-05, 3.58283636e-04,
        1.33195908e-05, 1.86965081e-05, 2.82060191e-05, 1.97639911e-05,
        1.00804900e-05, 1.89996690e-01, 1.61807239e-02, 3.77207034e-04,
        5.39859284e-05, 1.46598813e-05, 1.83411466e-05, 1.02708045e-05,
        1.17546384e-04, 1.04085018e-04, 5.98273982e-05, 3.66271488e-05,
        4.83269287e-06, 9.60493013e-02, 3.29277827e-03, 5.037445

In [176]:
import numpy as np

# Index of the most probable class in the model's prediction.
pos = np.argmax(model.predict(padded_text))

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 45ms/step


In [177]:
# Show the word -> index vocabulary again (same mapping as displayed earlier).
tokenizer.word_index

{'data': 1,
 'and': 2,
 'analysis': 3,
 'of': 4,
 'on': 5,
 'the': 6,
 'in': 7,
 'a': 8,
 'business': 9,
 'focuses': 10,
 'statistical': 11,
 'is': 12,
 'information': 13,
 'predictive': 14,
 'while': 15,
 'modeling': 16,
 'discovering': 17,
 'making': 18,
 'techniques': 19,
 'variety': 20,
 'science': 21,
 'more': 22,
 'that': 23,
 'for': 24,
 'descriptive': 25,
 'eda': 26,
 'cda': 27,
 'or': 28,
 'analytics': 29,
 'process': 30,
 'inspecting': 31,
 'cleansing': 32,
 'transforming': 33,
 'with': 34,
 'goal': 35,
 'useful': 36,
 'informing': 37,
 'conclusions': 38,
 'supporting': 39,
 'decision': 40,
 'has': 41,
 'multiple': 42,
 'facets': 43,
 'approaches': 44,
 'encompassing': 45,
 'diverse': 46,
 'under': 47,
 'names': 48,
 'used': 49,
 'different': 50,
 'social': 51,
 'domains': 52,
 "today's": 53,
 'world': 54,
 'plays': 55,
 'role': 56,
 'decisions': 57,
 'scientific': 58,
 'helping': 59,
 'businesses': 60,
 'operate': 61,
 'effectively': 62,
 'mining': 63,
 'particular': 64,
 't

In [178]:
# Map the predicted class index back to its word. The tokenizer keeps a
# reverse vocabulary (index -> word), so a direct lookup replaces the linear
# scan over word_index. As before, nothing is printed when no word has that
# index (index 0 is the padding value and is not in the vocabulary).
predicted_word = tokenizer.index_word.get(int(pos))
if predicted_word is not None:
    print(predicted_word)

has
