[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/mayank2498/Skip-Gram-model-using-numpy/blob/master/word2vec_skip_gram.ipynb)

In [8]:
import numpy as np
import string
from nltk.corpus import stopwords 

def softmax(x):
    """Compute softmax values for each sets of scores in x."""
    e_x = np.exp(x - np.max(x))
    return e_x / e_x.sum()

class word2vec(object):
    def __init__(self):
        self.N = 10
        self.X_train = []
        self.y_train = []
        self.window_size = 2
        self.alpha = 0.001
        self.words = []
        self.word_index = {}

    def initialize(self,V,data):
        self.V = V
        self.W = np.random.uniform(-0.8, 0.8, (self.V, self.N))
        self.W1 = np.random.uniform(-0.8, 0.8, (self.N, self.V))
        self.words = data
        for i in range(len(data)):
            self.word_index[data[i]] = i
#        self.W = np.random.randn(self.V,self.N)
#        self.W1 = np.random.randn(self.N,self.V)
    
    def feed_forward(self,X):
        self.h = np.dot(self.W.T,X).reshape(self.N,1)
        self.u = np.dot(self.W1.T,self.h)
        #print(self.u)
        self.y = softmax(self.u)  
        return self.y
        
    def backpropagate(self,x,t):
        e = self.y - np.asarray(t).reshape(self.V,1)
        # e.shape is V x 1
      
        dLdW1 = np.dot(self.h,e.T)
        X = np.array(x).reshape(self.V,1)
        dLdW = np.dot(X, np.dot(self.W1,e).T)
        self.W1 = self.W1 - self.alpha*dLdW1
        self.W = self.W - self.alpha*dLdW
        
    def train(self,epochs):
        for x in range(1,epochs):  
              
            self.loss = 0
            for j in range(len(self.X_train)):
                self.feed_forward(self.X_train[j])
                self.backpropagate(self.X_train[j],self.y_train[j])
                C = 0
                for m in range(self.V):
                    if(self.y_train[j][m]):
                        self.loss += -1*self.u[m][0]
                        C += 1
                self.loss += C*np.log(np.sum(np.exp(self.u)))
            print("epoch ",x, " loss = ",self.loss)
            self.alpha *= 1/( (1+self.alpha*x) )
    def predict(self,word,number_of_predictions):
        if word in self.words:
            index = self.word_index[word]
            X = [0 for i in range(self.V)]
            X[index] = 1
            prediction = self.feed_forward(X)
            output = {}
            for i in range(self.V):
                output[prediction[i][0]] = i
            
            top_context_words = []
            for k in sorted(output,reverse=True):
                top_context_words.append(self.words[output[k]])
                if(len(top_context_words)>=number_of_predictions):
                    break
    
            return top_context_words
        else:
            print("Word not found in dicitonary")


def preprocessing(corpus):
    stop_words = set(stopwords.words('english'))    
    training_data = []
    sentences = corpus.split(".")
    for i in range(len(sentences)):
        sentences[i] = sentences[i].strip()
        sentence = sentences[i].split()
        x = [word.strip(string.punctuation) for word in sentence if word not in stop_words]
        x = [word.lower() for word in x]
        training_data.append(x)
    return training_data
    

def prepare_data_for_training(sentences,w2v):
    data = {}
    for sentence in sentences:
        for word in sentence:
            if word not in data:
                data[word] = 1
            else:
                data[word] += 1
    V = len(data)
    data = sorted(list(data.keys()))
    vocab = {}
    for i in range(len(data)):
        vocab[data[i]] = i
    
    #for i in range(len(words)):
    for sentence in sentences:
        for i in range(len(sentence)):
            center_word = [0 for x in range(V)]
            center_word[vocab[sentence[i]]] = 1
            context = [0 for x in range(V)]
            for j in range(i-w2v.window_size,i+w2v.window_size):
                if i!=j and j>=0 and j<len(sentence):
                    context[vocab[sentence[j]]] += 1
            w2v.X_train.append(center_word)
            w2v.y_train.append(context)
    w2v.initialize(V,data)

    return w2v.X_train,w2v.y_train    

corpus = ""
corpus += "The World Wide Web (WWW), also called the Web, is an information space where documents and other web resources are identified by Uniform Resource Locators (URLs), interlinked by hypertext links, and accessible via the Internet.[1] English scientist Tim Berners-Lee invented the World Wide Web in 1989. He wrote the first web browser in 1990 while employed at CERN in Switzerland.[2][3] The browser was released outside CERN in 1991, first to other research institutions starting in January 1991 and to the general public on the Internet in August 1991. The World Wide Web has been central to the development of the Information Age and is the primary tool billions of people use to interact on the Internet.[4][5][6] Web pages are primarily text documents formatted and annotated with Hypertext Markup Language (HTML).[7] In addition to formatted text, web pages may contain images, video, audio, and software components that are rendered in the user's web browser as coherent pages of multimedia content. Embedded hyperlinks permit users to navigate between web pages. Multiple web pages with a common theme, a common domain name, or both, make up a website. Website content can largely be provided by the publisher, or interactively where users contribute content or the content depends upon the users or their actions. Websites may be mostly informative, primarily for entertainment, or largely for commercial, governmental, or non-governmental organisational purpose "
corpus += "The World Wide Web had a number of differences from other hypertext systems available at the time. The Web required only unidirectional links rather than bidirectional ones, making it possible for someone to link to another resource without action by the owner of that resource. It also significantly reduced the difficulty of implementing web servers and browsers (in comparison to earlier systems), but in turn presented the chronic problem of link rot. Unlike predecessors such as HyperCard, the World Wide Web was non-proprietary, making it possible to develop servers and clients independently and to add extensions without licensing restrictions. On 30 April 1993, CERN announced that the World Wide Web would be free to anyone, with no fees due.[25] Coming two months after the announcement that the server implementation of the Gopher protocol was no longer free to use, this produced a rapid shift away from Gopher and towards the Web. An early popular web browser was ViolaWWW for Unix and the X Windowing System"
corpus += "Connected by the Internet, other websites were created around the world. This motivated international standards development for protocols and formatting. Berners-Lee continued to stay involved in guiding the development of web standards, such as the markup languages to compose web pages and he advocated his vision of a Semantic Web. The World Wide Web enabled the spread of information over the Internet through an easy-to-use and flexible format. It thus played an important role in popularising use of the Internet.[29] Although the two terms are sometimes conflated in popular use, World Wide Web is not synonymous with Internet.[30] The Web is an information space containing hyperlinked documents and other resources, identified by their URIs.[31] It is implemented as both client and server software using Internet protocols such as TCP/IP and HTTP. Berners-Lee was knighted in 2004 by Queen Elizabeth II for services to the global development of the Internet"
corpus += "The terms Internet and World Wide Web are often used without much distinction. However, the two are not the same. The Internet is a global system of interconnected computer networks. In contrast, the World Wide Web is a global collection of documents and other resources, linked by hyperlinks and URIs. Web resources are usually accessed using HTTP, which is one of many Internet communication protocols"
corpus += "Viewing a web page on the World Wide Web normally begins either by typing the URL of the page into a web browser, or by following a hyperlink to that page or resource. The web browser then initiates a series of background communication messages to fetch and display the requested page. In the 1990s, using a browser to view web pages—and to move from one web page to another through hyperlinks—came to be known as 'browsing,' 'web surfing' (after channel surfing), or 'navigating the Web'. Early studies of this new behaviour investigated user patterns in using web browsers. One study, for example, found five user patterns: exploratory surfing, window surfing, evolved surfing, bounded navigation and targeted navigation"

training_data = preprocessing(corpus)
w2v = word2vec()
X,y = prepare_data_for_training(training_data,w2v)
w2v.train(3000)    

    

epoch  1  loss =  7577.906320384632
epoch  2  loss =  7568.958457229223
epoch  3  loss =  7560.0418371460355
epoch  4  loss =  7551.159433092574
epoch  5  loss =  7542.314232987322
epoch  6  loss =  7533.509243463195
epoch  7  loss =  7524.747493604065
epoch  8  loss =  7516.032038533211
epoch  9  loss =  7507.365962722985
epoch  10  loss =  7498.752382898625
epoch  11  loss =  7490.194450414115
epoch  12  loss =  7481.69535298463
epoch  13  loss =  7473.258315667007
epoch  14  loss =  7464.886600988307
epoch  15  loss =  7456.583508129067
epoch  16  loss =  7448.352371076554
epoch  17  loss =  7440.1965556700015
epoch  18  loss =  7432.119455466745
epoch  19  loss =  7424.124486366185
epoch  20  loss =  7416.215079935191
epoch  21  loss =  7408.3946753882665
epoch  22  loss =  7400.666710185774
epoch  23  loss =  7393.034609227418
epoch  24  loss =  7385.501772634034
epoch  25  loss =  7378.071562132225
epoch  26  loss =  7370.747286080481
epoch  27  loss =  7363.532183206254
epoch  2

In [24]:
w2v.predict("",5)

['world', 'pages', 'one', 'entertainment', 'unidirectional']

In [0]:
with open('singles.txt', 'r') as myfile:
    data=myfile.read().replace('\n', '')

In [38]:
data

'25 SEXY MALE, seeks attrac older single lady, for discreet encounters.35YO Security Guard, seeking lady in uniform for fun times.40 yo SINGLE DAD, sincere friendly DTE seeks r/ship with fem age open S/E44yo tall seeks working single mum or lady below 45 fship rship. Nat Open6.2 35 yr old OUTGOING M seeks fem 28-35 for o/door sports - w/e awayA professional business male, late 40s, 6 feet tall, slim build, well groomed, great personality, home owner, interests include the arts travel and all things good, Ringwood area, is seeking a genuine female of similar age or older, in same area or surrounds, for a meaningful long term rship. Looking forward to hearing from you all.ABLE young man seeks, sexy older women. Phone for fun ready to playAFFECTIONATE LADY Sought by generous guy, 40s, mutual fulfillmentARE YOU ALONE or lost in a r/ship too, with no hope in sight? Maybe we could explore new beginnings together? Im 45 Slim/Med build, GSOH, high needs and looking for someone similar. You WON

In [42]:
training_data = preprocessing(data)
w2v = word2vec()
X,y = prepare_data_for_training(training_data,w2v)
w2v.train(1000)

epoch  1  loss =  51207.225405360674
epoch  2  loss =  51132.486354802095
epoch  3  loss =  51059.602371507026
epoch  4  loss =  50988.44978188566
epoch  5  loss =  50918.91040625106
epoch  6  loss =  50850.87153722459
epoch  7  loss =  50784.22590352576
epoch  8  loss =  50718.87162724836
epoch  9  loss =  50654.71218239347
epoch  10  loss =  50591.656362043424
epoch  11  loss =  50529.618261336786
epoch  12  loss =  50468.517283313224
epoch  13  loss =  50408.27817497668
epoch  14  loss =  50348.831101532414
epoch  15  loss =  50290.11176789918
epoch  16  loss =  50232.06159821147
epoch  17  loss =  50174.62798627748
epoch  18  loss =  50117.764632688115
epoch  19  loss =  50061.431987394004
epoch  20  loss =  50005.59781962962
epoch  21  loss =  49950.23793922487
epoch  22  loss =  49895.33709304685
epoch  23  loss =  49840.89005482461
epoch  24  loss =  49786.90291152032
epoch  25  loss =  49733.39451798044
epoch  26  loss =  49680.39803467656
epoch  27  loss =  49627.96237002814
e

In [56]:
w2v.predict("sense",5)

['movies', 'humour', 'going', 'times', 'supporting']