In [1]:
# We'll the steps as before, 1) divide the corpus into sentences 2) tokenize the sentences 3) get_vocab to make the one_hot encoding
# add starts and ends

In [2]:
import wikipedia
import math
import random
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F


In [3]:

topics = [
    "Artificial Intelligence",
    "Climate Change",
    "Quantum Computing",
    "World War II",
    "Ancient Egypt",
    "Space Exploration",
    "Global Health",
    "Economics",
    "Philosophy of Science",
    "Modern Art",
    "Genetics",
    "Renewable Energy",
    "Cybersecurity",
    "Cryptocurrency",
    "Social Media",
    "Cultural Anthropology",
    "Astrophysics",
    "Human Rights",
    "Machine Learning",
    "History of Technology",
    "Biotechnology"
]

In [4]:
def divide_corpus(corpus):
    sentences=corpus.split('.')
    sentences = [s.strip() for s in sentences]
    sentences = [s for s in sentences if len(s) > 0]
    return sentences
def tokenize_sentences(sentences):
    tokenized_sentences = []
    for sentence in sentences:
        sentence = sentence.lower()
        tokenized = sentence.split(' ')
        final_tokenized=[]
        for word in tokenized:
            if len(word)>0:
                if (word[-1]>='a' and word[-1]<='z') or word[-1]=='>':
                    final_tokenized.append(word)
                else:
                    final_tokenized.append(word[:-1])
        tokenized_sentences.append(final_tokenized)
        
    
    return tokenized_sentences

def get_vocabulary(data,min_freq=1):
    word_counts={}  #keys:word and value:count
    vocab={}
    for sentence in data:
        for word in sentence:
            if word in word_counts.keys():
                word_counts[word]+=1
            else:
                word_counts[word]=1
    i=0
    for key,value in word_counts.items():
        if value>=min_freq:
            vocab[key]=i
            i+=1
    return vocab
def add_starts_ends(data,n=1):
    final_data=[]
    for sentence in data:
        final_data.append(['<s>'] * n + sentence + ['<e>'])
    return final_data
def suggest_word(sentence,lm):
    max_prob=-10000
    suggested_word=''
    sentence_log_prob=sum([np.log(lm[(sentence[i],sentence[i+1])]) for i in range(len(sentence)-1) if (sentence[i],sentence[i+1]) in lm.keys()])
    for key,value in lm.items():
        if key[0]==sentence[len(sentence)-1]:
            if np.log(value)+sentence_log_prob>max_prob:
                max_prob=np.log(value)+sentence_log_prob
                suggested_word=key[1]
    return suggested_word

def one_hot_encode(word,vocab):
    word_encoded=np.zeros(len(vocab))
    word_encoded[vocab[word]]=1
    return(word_encoded)
def get_input(sentence,i,vocab):
    sentence_encoded=np.zeros(len(vocab))
    for j in range(0,i):
        sentence_encoded+=one_hot_encode(sentence[j],vocab)
    return(sentence_encoded/(j+1))

def prepare_train_data(data,vocab):
    y=[]
    x=[]
    for sentence in data:
        for i in range(1,len(sentence)):
            y.append(one_hot_encode(sentence[i],vocab))
            x.append(get_input(sentence,i,vocab))
    return np.array(x),np.array(y)

In [5]:
corpus=''
for topic in topics:
    try:
        page = wikipedia.page(topic)
        corpus+=page.content
        print(topic)
    except:
        print('skipped:',topic)
        continue

Artificial Intelligence
Climate Change
Quantum Computing
World War II
Ancient Egypt
skipped: Space Exploration
Global Health
Economics
Philosophy of Science
Modern Art




  lis = BeautifulSoup(html).find_all('li')


skipped: Genetics
Renewable Energy
Cybersecurity
Cryptocurrency
Social Media
Cultural Anthropology
Astrophysics
Human Rights
skipped: Machine Learning
History of Technology
Biotechnology


In [6]:
data=divide_corpus(corpus)
data=tokenize_sentences(data)
data=add_starts_ends(data)
vocab=get_vocabulary(data,1)
print(len(vocab))

18384


In [19]:
device="cuda"
class LLM(nn.Module):
    def __init__(self,in_features):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(in_features, 64),
            nn.ReLU(),
            nn.Linear(64, in_features),
            nn.Softmax()
        )
    def forward(self,x):
        x=x.to(device)
        return self.net(x)
in_features=len(vocab)
model=LLM(in_features).to(device)

In [20]:
EPOCHS=200
BATCH_SIZE=32
best_accuracy=0
crit=nn.CrossEntropyLoss()
optimizer=torch.optim.Adam(model.parameters(), lr=1e-2,weight_decay=5e-4)

for epoch in range(EPOCHS):
    correct = 0
    total = 0
    for i in range(0,len(data),BATCH_SIZE):
        x,y=prepare_train_data(data[i:i+BATCH_SIZE],vocab)
        x=torch.from_numpy(x).float().to(device)
        y=torch.from_numpy(y).float().to(device)
        #forward
        y_hat=model(x).to(device)
        loss=crit(y,y_hat)
        #backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        _, predicted = torch.max(y_hat, 1)
        y_indices = torch.argmax(y, dim=1)
        correct = (predicted == y_indices).sum().item()
        total+= len(x)
        print(correct/len(x))
    
    accuracy = correct / total
    if accuracy>=best_accuracy:
        best_accuracy=accuracy
    print(epoch)
    print('accuracy: ',accuracy)
    print('best_accuracy: ',best_accuracy)
    del x
    del y

  return self._call_impl(*args, **kwargs)


0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.003105590062111801
0.0015337423312883436
0.0
0.02735562310030395
0.03183023872679045
0.047674418604651166
0.028622540250447227
0.04054054054054054
0.023709902370990237
0.018617021276595744
0.0017064846416382253
0.0
0.0
0.003552397868561279
0.005847953216374269
0.015105740181268883
0.01615508885298869
0.016442451420029897
0.012280701754385965
0.009174311926605505
0.0064516129032258064
0.009259259259259259
0.013975155279503106
0.017241379310344827
0.012987012987012988
0.005190311418685121
0.0029542097488921715
0.0014814814814814814
0.0
0.034371643394199784
0.025559105431309903
0.03965303593556382
0.041025641025641026
0.034594594594594595
0.03975155279503106
0.04341926729986431
0.03902439024390244
0.26229508196721313
0.04519774011299435
0.041025641025641026
0.03864734299516908
0.03902439024390244
0.036488027366020526
0.09220779220779221
0.08172362555720654
0.09066305818673884
0.07692307692307693
0.09603841536614646
0.06823821339950373
0.10979228486646884


KeyboardInterrupt: 

In [57]:
my_sentence='when'
my_sentence=divide_corpus(my_sentence)
my_sentence=tokenize_sentences(my_sentence)
my_sentence=add_starts_ends(my_sentence)

x,y=prepare_train_data(my_sentence,vocab)
x=x[len(x)-1]
x=torch.from_numpy(x).float().to(device)


In [58]:
y_hat=model(x).to(device)
# _, predicted = torch.max(y_hat, 1)
# y_indices = torch.argmax(y, dim=1)
# y_indices
max_index = y_hat.argmax()

print(max_index)

tensor(122, device='cuda:0')


In [49]:
for key,value in vocab.items():
    if value==122:
        print(key)

the
