In [2]:
import numpy as np
import re

In [3]:

data = "Information security (InfoSec) is a set of tools and practices that protect information from unauthorized access, use, or disclosure. Information retrieval is a system that helps people find data within an organization. Here's some more information about both: Information securityInfoSec protects information in a variety of forms, including digital files, paper documents, and human speech. It involves preventing unauthorized access, use, or disclosure of information, and reducing the negative impact of security incidents. InfoSec also includes documenting the processes, threats, and systems that affect information security. "
print(data)

Information security (InfoSec) is a set of tools and practices that protect information from unauthorized access, use, or disclosure. Information retrieval is a system that helps people find data within an organization. Here's some more information about both: Information securityInfoSec protects information in a variety of forms, including digital files, paper documents, and human speech. It involves preventing unauthorized access, use, or disclosure of information, and reducing the negative impact of security incidents. InfoSec also includes documenting the processes, threats, and systems that affect information security. 


In [4]:
sentences = data.split('.')
sentences

['Information security (InfoSec) is a set of tools and practices that protect information from unauthorized access, use, or disclosure',
 ' Information retrieval is a system that helps people find data within an organization',
 " Here's some more information about both: Information securityInfoSec protects information in a variety of forms, including digital files, paper documents, and human speech",
 ' It involves preventing unauthorized access, use, or disclosure of information, and reducing the negative impact of security incidents',
 ' InfoSec also includes documenting the processes, threats, and systems that affect information security',
 ' ']

In [5]:
#This code snippet performs text cleaning on a list of sentences
clean_sent=[] # inititalized an empty list: clean_sent=[] will hold the cleaned sentences.
for sentence in sentences:
    if sentence=="":
        continue
    sentence = re.sub('[^A-Za-z0-9]+', ' ', (sentence)) # uses a regular expression to replace all non-alphanumeric characters (anything other than letters and numbers) with a space.
    sentence = re.sub(r'(?:^| )\w (?:$| )', ' ', (sentence)).strip() # removes any single characters that are surrounded by spaces 
    sentence = sentence.lower()  #converts to lower case
    clean_sent.append(sentence) #adds the cleaned sentence to the clean_sent list.

print(clean_sent)

['information security infosec is a set of tools and practices that protect information from unauthorized access use or disclosure', 'information retrieval is a system that helps people find data within an organization', 'here s some more information about both information securityinfosec protects information in a variety of forms including digital files paper documents and human speech', 'it involves preventing unauthorized access use or disclosure of information and reducing the negative impact of security incidents', 'infosec also includes documenting the processes threats and systems that affect information security', '']


In [6]:
from tensorflow.keras.preprocessing.text import Tokenizer
tokenizer = Tokenizer()
tokenizer.fit_on_texts(clean_sent)
sequences = tokenizer.texts_to_sequences(clean_sent)
print(sequences)

#This code uses the Tokenizer class from TensorFlow's Keras module to convert a list of cleaned sentences into sequences of integers. 
#Each unique word in clean_sent is assigned an integer based on its frequency (more common words generally receive lower integers). 


[[1, 4, 7, 8, 5, 15, 2, 16, 3, 17, 6, 18, 1, 19, 9, 10, 11, 12, 13], [1, 20, 8, 5, 21, 6, 22, 23, 24, 25, 26, 27, 28], [29, 30, 31, 32, 1, 33, 34, 1, 35, 36, 1, 37, 5, 38, 2, 39, 40, 41, 42, 43, 44, 3, 45, 46], [47, 48, 49, 9, 10, 11, 12, 13, 2, 1, 3, 50, 14, 51, 52, 2, 4, 53], [7, 54, 55, 56, 14, 57, 58, 3, 59, 6, 60, 1, 4], []]


In [7]:
index_to_word = {}
word_to_index = {}

for i, sequence in enumerate(sequences):
#print(sequence)
    word_in_sentence = clean_sent[i].split()
#print(word_in_sentence)
    
    for j, value in enumerate(sequence):
        index_to_word[value] = word_in_sentence[j]
        word_to_index[word_in_sentence[j]] = value

print(index_to_word, "\n")
print(word_to_index)

{1: 'information', 4: 'security', 7: 'infosec', 8: 'is', 5: 'a', 15: 'set', 2: 'of', 16: 'tools', 3: 'and', 17: 'practices', 6: 'that', 18: 'protect', 19: 'from', 9: 'unauthorized', 10: 'access', 11: 'use', 12: 'or', 13: 'disclosure', 20: 'retrieval', 21: 'system', 22: 'helps', 23: 'people', 24: 'find', 25: 'data', 26: 'within', 27: 'an', 28: 'organization', 29: 'here', 30: 's', 31: 'some', 32: 'more', 33: 'about', 34: 'both', 35: 'securityinfosec', 36: 'protects', 37: 'in', 38: 'variety', 39: 'forms', 40: 'including', 41: 'digital', 42: 'files', 43: 'paper', 44: 'documents', 45: 'human', 46: 'speech', 47: 'it', 48: 'involves', 49: 'preventing', 50: 'reducing', 14: 'the', 51: 'negative', 52: 'impact', 53: 'incidents', 54: 'also', 55: 'includes', 56: 'documenting', 57: 'processes', 58: 'threats', 59: 'systems', 60: 'affect'} 

{'information': 1, 'security': 4, 'infosec': 7, 'is': 8, 'a': 5, 'set': 15, 'of': 2, 'tools': 16, 'and': 3, 'practices': 17, 'that': 6, 'protect': 18, 'from': 19,

In [8]:
#this code is preparing data for a word embedding model that learns word representations based on their context


vocab_size = len(tokenizer.word_index) + 1 #defines vocabulary size along with +1 due to padding reasons

emb_size = 10 #embedding soze which represents the number of dimensions in tbhe word embeddings
context_size = 2 #the size of the context window, indicating how many words before and after the target word to include in the context

contexts = [] #This list will store the context words for each target word.
targets = [] #This list will store each target word that the model aims to predict based on the context.


for sequence in sequences:  #Iterates over each sentence in sequences
    for i in range(context_size, len(sequence) - context_size): 
        target = sequence[i]
        context = [sequence[i - 2], sequence[i - 1], sequence[i + 1], sequence[i + 2]]
#         print(context)
        contexts.append(context)
        targets.append(target)
#Loops through each word in the sentence, starting from the position context_size and stopping context_size words before the end. 
#This ensures that there are enough words on both sides of the target word for context.

print(contexts, "\n")
print(targets)

[[1, 4, 8, 5], [4, 7, 5, 15], [7, 8, 15, 2], [8, 5, 2, 16], [5, 15, 16, 3], [15, 2, 3, 17], [2, 16, 17, 6], [16, 3, 6, 18], [3, 17, 18, 1], [17, 6, 1, 19], [6, 18, 19, 9], [18, 1, 9, 10], [1, 19, 10, 11], [19, 9, 11, 12], [9, 10, 12, 13], [1, 20, 5, 21], [20, 8, 21, 6], [8, 5, 6, 22], [5, 21, 22, 23], [21, 6, 23, 24], [6, 22, 24, 25], [22, 23, 25, 26], [23, 24, 26, 27], [24, 25, 27, 28], [29, 30, 32, 1], [30, 31, 1, 33], [31, 32, 33, 34], [32, 1, 34, 1], [1, 33, 1, 35], [33, 34, 35, 36], [34, 1, 36, 1], [1, 35, 1, 37], [35, 36, 37, 5], [36, 1, 5, 38], [1, 37, 38, 2], [37, 5, 2, 39], [5, 38, 39, 40], [38, 2, 40, 41], [2, 39, 41, 42], [39, 40, 42, 43], [40, 41, 43, 44], [41, 42, 44, 3], [42, 43, 3, 45], [43, 44, 45, 46], [47, 48, 9, 10], [48, 49, 10, 11], [49, 9, 11, 12], [9, 10, 12, 13], [10, 11, 13, 2], [11, 12, 2, 1], [12, 13, 1, 3], [13, 2, 3, 50], [2, 1, 50, 14], [1, 3, 14, 51], [3, 50, 51, 52], [50, 14, 52, 2], [14, 51, 2, 4], [51, 52, 4, 53], [7, 54, 56, 14], [54, 55, 14, 57], [55

In [9]:
#printing features with target
for i in range(5):
    words = []
    target = index_to_word.get(targets[i])
    for j in contexts[i]:
        words.append(index_to_word.get(j))
    print(words," -> ", target)

['information', 'security', 'is', 'a']  ->  infosec
['security', 'infosec', 'a', 'set']  ->  is
['infosec', 'is', 'set', 'of']  ->  a
['is', 'a', 'of', 'tools']  ->  set
['a', 'set', 'tools', 'and']  ->  of


In [10]:
# Convert the contexts and targets to numpy arrays
X = np.array(contexts) #numpy error are easy to operate in python as it is inbuild library
Y = np.array(targets)

In [11]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, Lambda 

In [12]:
model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=emb_size, input_length=2*context_size),
    Lambda(lambda x: tf.reduce_mean(x, axis=1)),
    Dense(256, activation='relu'),
    Dense(512, activation='relu'),
    Dense(vocab_size, activation='softmax')
])
#sequential model is a model that trains the neural network layer by layer



In [13]:
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
#optimizer 

In [42]:
history = model.fit(X, Y, epochs=150)

Epoch 1/150
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step - accuracy: 0.9886 - loss: 0.3497
Epoch 2/150
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.9808 - loss: 0.3403  
Epoch 3/150
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - accuracy: 0.9773 - loss: 0.3177 
Epoch 4/150
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.9773 - loss: 0.3044
Epoch 5/150
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.9694 - loss: 0.2989
Epoch 6/150
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.9616 - loss: 0.3026
Epoch 7/150
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.9808 - loss: 0.2578
Epoch 8/150
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - accuracy: 0.9808 - loss: 0.2372
Epoch 9/150
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m

In [15]:
#!pip install scikit-learn



In [43]:
from sklearn.decomposition import PCA

embeddings = model.get_weights()[0]

pca = PCA(n_components=2)
reduced_embeddings = pca.fit_transform(embeddings)
#PCA is principle component analysis
#to reduce the problem of overfitting


In [51]:
print("Information security (InfoSec) is a set of tools and practices that protect information from unauthorized access, use, or disclosure. Information retrieval is a system that helps people find data within an organization. Here's some more information about both: Information securityInfoSec protects information in a variety of forms, including digital files, paper documents, and human speech. It involves preventing unauthorized access, use, or disclosure of information, and reducing the negative impact of security incidents. InfoSec also includes documenting the processes, threats, and systems that affect information security. ")

Information security (InfoSec) is a set of tools and practices that protect information from unauthorized access, use, or disclosure. Information retrieval is a system that helps people find data within an organization. Here's some more information about both: Information securityInfoSec protects information in a variety of forms, including digital files, paper documents, and human speech. It involves preventing unauthorized access, use, or disclosure of information, and reducing the negative impact of security incidents. InfoSec also includes documenting the processes, threats, and systems that affect information security. 


In [54]:
# test model: select some sentences from above paragraph
test_sentenses = [
    "information security is a",
    "from unauthorized use or",
    "system that people find",
    "infosec also includes documenting"
]

In [55]:
import numpy as np
for sent in test_sentenses:
    test_words = sent.split(" ")
    x_test =[]
    for i in test_words:
        x_test.append(word_to_index.get(i))
    x_test = np.array([x_test])
    
    pred = model.predict(x_test)
    pred = np.argmax(pred[0])
    print("making a prediction for these words ", test_words, "\nresult = ", index_to_word.get(pred),"\n")



[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step
making a prediction for these words  ['information', 'security', 'is', 'a'] 
result =  infosec 

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step
making a prediction for these words  ['from', 'unauthorized', 'use', 'or'] 
result =  access 

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 46ms/step
making a prediction for these words  ['system', 'that', 'people', 'find'] 
result =  helps 

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 39ms/step
making a prediction for these words  ['infosec', 'also', 'includes', 'documenting'] 
result =  documenting 

