In [1]:
# We'll the steps as before, 1) divide the corpus into sentences 2) tokenize the sentences 3) get_vocab to make the one_hot encoding
# add starts and ends

# First problem
#   The model is seeing certain words more than others (The, in, at, on, and)
#   There's no order, so the model will find it impossible to find next word, the model know which word he has but he doesn't know the order

In [2]:
import wikipedia
import math
import random
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F


In [3]:

topics = [
    "Artificial Intelligence",
    "Climate Change",
    "Quantum Computing",
    "World War II",
    "Ancient Egypt",
    "Space Exploration",
    "Global Health",
    "Economics",
    "Philosophy of Science",
    "Modern Art",
    "Genetics",
    "Renewable Energy",
    "Cybersecurity",
    "Cryptocurrency",
    "Social Media",
    "Cultural Anthropology",
    "Astrophysics",
    "Human Rights",
    "Machine Learning",
    "History of Technology",
    "Biotechnology"
]

In [27]:
def divide_corpus(corpus):
    sentences=corpus.split('.')
    sentences = [s.strip() for s in sentences]
    sentences = [s for s in sentences if len(s) > 0]
    return sentences
def tokenize_sentences(sentences):
    tokenized_sentences = []
    for sentence in sentences:
        sentence = sentence.lower()
        tokenized = sentence.split(' ')
        final_tokenized=[]
        for word in tokenized:
            if len(word)>0:
                if (word[-1]>='a' and word[-1]<='z') or word[-1]=='>':
                    final_tokenized.append(word)
                else:
                    final_tokenized.append(word[:-1])
        tokenized_sentences.append(final_tokenized)
        
    
    return tokenized_sentences

def get_vocabulary(data,min_freq=1):
    word_counts={}  #keys:word and value:count
    vocab={}
    for sentence in data:
        for word in sentence:
            if word in word_counts.keys():
                word_counts[word]+=1
            else:
                word_counts[word]=1
    i=0
    for key,value in word_counts.items():
        if value>=min_freq:
            vocab[key]=i
            i+=1
    return vocab
def add_starts_ends(data,n=1):
    final_data=[]
    for sentence in data:
        final_data.append(['<s>'] * n + sentence + ['<e>'])
    return final_data
def suggest_word(sentence,lm):
    max_prob=-10000
    suggested_word=''
    sentence_log_prob=sum([np.log(lm[(sentence[i],sentence[i+1])]) for i in range(len(sentence)-1) if (sentence[i],sentence[i+1]) in lm.keys()])
    for key,value in lm.items():
        if key[0]==sentence[len(sentence)-1]:
            if np.log(value)+sentence_log_prob>max_prob:
                max_prob=np.log(value)+sentence_log_prob
                suggested_word=key[1]
    return suggested_word

def one_hot_encode(word,vocab):
    word_encoded=np.zeros(len(vocab))
    word_encoded[vocab[word]]=1
    return(word_encoded)
def get_input(sentence,i,vocab):
    sentence_encoded=np.zeros(len(vocab))
    for j in range(i-1,i):
        sentence_encoded+=one_hot_encode(sentence[j],vocab)
    return(sentence_encoded/(j+1))
def prepare_train_data(data,vocab):
    y=[]
    x=[]
    for sentence in data:
        for i in range(1,len(sentence)):
            y.append(one_hot_encode(sentence[i],vocab))
            x.append(get_input(sentence,i,vocab))
    return np.array(x),np.array(y)
def get_random_batch(data,BATCH_SIZE):
    batch=[]
    for _ in range(BATCH_SIZE):
        batch.append(random.choice(data))
    return batch

In [5]:
corpus=''
for topic in topics:
    try:
        page = wikipedia.page(topic)
        corpus+=page.content
        print(topic)
    except:
        print('skipped:',topic)
        continue

Artificial Intelligence
Climate Change
Quantum Computing
World War II
Ancient Egypt
skipped: Space Exploration
Global Health
Economics
Philosophy of Science
Modern Art




  lis = BeautifulSoup(html).find_all('li')


skipped: Genetics
Renewable Energy
Cybersecurity
Cryptocurrency
Social Media
Cultural Anthropology
Astrophysics
Human Rights
skipped: Machine Learning
History of Technology
Biotechnology


In [32]:
data=divide_corpus(corpus)
data=tokenize_sentences(data)
data=add_starts_ends(data)
vocab=get_vocabulary(data,1)
print(len(vocab))

18384


In [33]:
device="cuda"
class LLM(nn.Module):
    def __init__(self,in_features):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(in_features, 1000),
            nn.ReLU(),
            nn.Linear(1000, in_features),
            nn.Softmax()
        )
    def forward(self,x):
        x=x.to(device)
        return self.net(x)
in_features=len(vocab)
model=LLM(in_features).to(device)

In [34]:
EPOCHS=50000
BATCH_SIZE=32
best_accuracy=0
crit=nn.CrossEntropyLoss()
optimizer=torch.optim.Adam(model.parameters(), lr=1e-2,weight_decay=5e-4)

for epoch in range(EPOCHS):
    correct = 0
    batch=get_random_batch(data,BATCH_SIZE)
    x,y=prepare_train_data(batch,vocab)
    x=torch.from_numpy(x).float().to(device)
    y=torch.from_numpy(y).float().to(device)
    #forward
    y_hat=model(x).to(device)
    loss=crit(y,y_hat)
    #backward pass
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    _, predicted = torch.max(y_hat, 1)
    y_indices = torch.argmax(y, dim=1)
    correct = (predicted == y_indices).sum().item()
    if epoch%100==0:
        print(epoch)
        print(correct/len(x))
    
    # accuracy = correct / total
    # if accuracy>=best_accuracy:
    #     best_accuracy=accuracy
    # print(epoch)
    # print('accuracy: ',accuracy)
    # print('best_accuracy: ',best_accuracy)
    del x
    del y
    del batch
    torch.cuda.empty_cache()

0
0.0
100
0.05217391304347826
200
0.06756756756756757
300
0.03128911138923655
400
0.04040404040404041
500
0.05473372781065089
600
0.05071315372424723
700
0.06311360448807854
800
0.043184885290148446
900
0.041720990873533245
1000
0.06967213114754098
1100
0.005008347245409015
1200
0.0440771349862259
1300
0.059654631083202514
1400
0.04644412191582003
1500
0.020348837209302327
1600
0.04923076923076923
1700
0.05860805860805861
1800
0.051209103840682786
1900
0.06002728512960437
2000
0.06044905008635579
2100
0.05443234836702955
2200
0.04664723032069971
2300
0.052795031055900624
2400
0.036741214057507986


KeyboardInterrupt: 

In [38]:
my_sentence='how are'
my_sentence=divide_corpus(my_sentence)
my_sentence=tokenize_sentences(my_sentence)
my_sentence=add_starts_ends(my_sentence)

x,y=prepare_train_data(my_sentence,vocab)
x=x[len(x)-1]
x=torch.from_numpy(x).float().to(device)


In [39]:
y_hat=model(x).to(device)
# _, predicted = torch.max(y_hat, 1)
# y_indices = torch.argmax(y, dim=1)
# y_indices
max_index = y_hat.argmax()

print(max_index)

tensor(15, device='cuda:0')


In [37]:
for key,value in vocab.items():
    if value==15:
        print(key)

<e>
