### Word2vec model (`word2vec.py`)

In [6]:
import torch 
import torch.nn as nn 
import torch.nn.functional as F
import lightning as L

  from .autonotebook import tqdm as notebook_tqdm


In [9]:
class Word2vec(L.LightningModule):
    """Word2vec-style model: an embedding table followed by a linear projection.

    The forward pass looks up an embedding for every input index, projects it
    to ``vocab_size`` scores and returns log-probabilities, which the training
    step feeds to a negative log-likelihood loss.

    Args:
        vocab_size: Number of rows in the embedding table and number of
            output scores produced per input index.
        embedding_dim: Width of each embedding vector.
        learning_rate: Learning rate passed to the Adam optimizer.
    """

    def __init__(self, vocab_size, embedding_dim, learning_rate=0.01):
        super().__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.linear = nn.Linear(embedding_dim, vocab_size)

        self.learning_rate = learning_rate

    def forward(self, inputs):
        """Return log-probabilities over the vocabulary for each input index.

        Args:
            inputs: Integer tensor of indices into the embedding table
                (presumably shape ``(batch,)`` -- TODO confirm with the
                data pipeline).

        Returns:
            Tensor with the same leading shape as ``inputs`` and a trailing
            axis of size ``vocab_size`` holding log-probabilities.
        """
        embeds = self.embeddings(inputs)
        output = self.linear(embeds)

        # The vocabulary scores always sit on the last axis of the linear
        # layer's output, so normalise there instead of hard-coding axis 1
        # (identical for (batch,) inputs, and also valid for other ranks).
        log_probablity = F.log_softmax(output, dim=-1)
        return log_probablity

    def training_step(self, batch, batch_idx):
        """Compute and log the NLL loss for one ``(inputs, targets)`` batch.

        Lightning only invokes the hook named ``training_step``; the earlier
        spelling ``training_steps`` was never called by the trainer.
        """
        inputs, targets = batch

        logs_prob = self(inputs)
        loss = F.nll_loss(logs_prob, targets)
        self.log('training loss ', loss)
        return loss

    # Kept so any existing caller using the old, misspelt name keeps working.
    training_steps = training_step

    def configure_optimizers(self):
        """Return an Adam optimizer over all parameters of this module."""
        # ``torch.optim.adam`` is a submodule and is not callable; the
        # optimizer class is ``torch.optim.Adam``.
        optimiser = torch.optim.Adam(self.parameters(), lr=self.learning_rate)
        return optimiser

    def get_word_embedding(self):
        """Return the raw weight tensor of the embedding table."""
        return self.embeddings.weight.data
    


## Playing with Dataset.py

In [18]:
import torch.nn as nn
import lightning as L
import torch
from torch.utils.data import Dataset, DataLoader
from collections import Counter
import numpy as np 
import re # this is for regular expression 
from typing import List, Tuple


### Creating the text corpus

In [8]:
class Corpus:
    """Tokenised contents of a text file plus a frequency-filtered vocabulary.

    On construction the file is read, split into lower-cased ``\\w+`` tokens,
    counted, and reduced to the words that occur at least ``min_count`` times.

    Args:
        file_path: Path of the text file to read.
        min_count: Minimum number of occurrences a word needs to enter the
            vocabulary. Defaults to 5, the threshold ``build_vocab`` uses.

    Attributes set by ``__init__``: ``words`` (all tokens in file order),
    ``word_counts`` (a ``Counter`` over ``words``), ``vocab`` (kept words),
    ``word2idx`` and ``idx2word`` (the two directions of the word/index map).
    """

    def __init__(self, file_path: str, min_count: int = 5):
        self.file_path = file_path
        self.words = self.read_corpus()
        self.word_counts = Counter(self.words)
        self.vocab = self.build_vocab(min_count)
        # Mapping from word to its index in ``vocab``.
        self.word2idx = {word: idx for idx, word in enumerate(self.vocab)}
        # Inverse mapping, for retrieving the word that belongs to an index.
        self.idx2word = {idx: word for word, idx in self.word2idx.items()}

    def read_corpus(self) -> List[str]:
        """Return every ``\\w+`` token of the file, lower-cased, in file order."""
        # UTF-8, not UTF-7: under UTF-7 any non-ASCII byte fails to decode and
        # a literal '+' starts an escape sequence, corrupting ordinary text.
        with open(self.file_path, 'r', encoding='utf-8') as f:
            return [word.lower() for line in f for word in re.findall(r'\w+', line)]

    def build_vocab(self, min_count: int = 5) -> List[str]:
        """Return the words whose count is at least ``min_count``.

        Order follows ``word_counts`` iteration order, i.e. first appearance
        in the corpus.
        """
        return [word for word, count in self.word_counts.items() if count >= min_count]
    
    

In [11]:
class Word2vecDataset(Dataset):
    """Dataset of ``(target_idx, context_idx)`` pairs drawn from a corpus.

    For every corpus word that has a vocabulary index, each word within
    ``window_size`` positions on either side that also has an index yields
    one pair. All pairs are materialised once, at construction time.
    """

    def __init__(self, corpus: Corpus, window_size: int = 5):
        self.corpus = corpus
        self.window_size = window_size
        self.data = self.create_dataset()

    def create_dataset(self) -> List[tuple[int, int]]:
        """Build the full list of ``(target_idx, context_idx)`` pairs."""
        tokens = self.corpus.words
        lookup = self.corpus.word2idx
        radius = self.window_size

        pairs = []
        for position, centre in enumerate(tokens):
            centre_id = lookup.get(centre)
            if centre_id is None:
                # Words without a vocabulary index never act as a target.
                continue

            before = tokens[max(0, position - radius):position]
            after = tokens[position + 1:position + 1 + radius]
            # Neighbours without a vocabulary index are dropped as well.
            pairs.extend(
                (centre_id, neighbour_id)
                for neighbour_id in map(lookup.get, before + after)
                if neighbour_id is not None
            )
        return pairs

    def __len__(self):
        """Return the number of stored pairs."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the pair at ``idx`` as two tensors: target, then context."""
        pair = self.data[idx]
        return torch.tensor(pair[0]), torch.tensor(pair[1])

In [16]:


class Word2vecDataModule(L.LightningDataModule):
    """Lightning data module that builds a ``Word2vecDataset`` from a text file.

    Args:
        file_path: Path of the corpus file handed to ``Corpus``.
        batch_size: Batch size used by the training ``DataLoader``.
        num_workers: Number of ``DataLoader`` worker processes. Defaults to 4,
            the value that was previously hard-coded.
    """

    def __init__(self, file_path: str, batch_size: int = 64, num_workers: int = 4):
        super().__init__()
        self.file_path = file_path
        self.batch_size = batch_size
        self.num_workers = num_workers

    def setup(self, stage=None):
        """Read the corpus and build the dataset; also records ``vocab_size``."""
        corpus = Corpus(self.file_path)
        # Must be ``self.dataset``: ``train_dataloader`` reads that attribute.
        # The earlier spelling ``self.datataset`` left it undefined.
        self.dataset = Word2vecDataset(corpus)
        self.vocab_size = len(corpus.vocab)

    def train_dataloader(self):
        """Return a shuffling ``DataLoader`` over the dataset built in ``setup``."""
        return DataLoader(
            self.dataset,
            batch_size=self.batch_size,
            shuffle=True,
            num_workers=self.num_workers,
        )
    

## Main model: skip-gram and others

In [21]:
import torch.nn as nn
import torch.nn.functional as F
import pytorch_lightning as L

In [22]:
# Skip-gram model implemented as a plain nn.Module:


class SkipGram(nn.Module):
    """Skip-gram network: an embedding lookup followed by a linear layer.

    ``forward`` returns the raw scores of the linear layer; no softmax is
    applied here.
    """

    def __init__(self, vocab_size: int, embedding_dim: int):
        super().__init__()
        # Attribute names (including the spelling ``embeding``) are part of
        # the module's state-dict keys, so they are kept exactly as they were.
        self.embeding = nn.Embedding(
            num_embeddings=vocab_size, embedding_dim=embedding_dim
        )
        self.output = nn.Linear(
            in_features=embedding_dim, out_features=vocab_size
        )

    def forward(self, inputs):
        """Map input indices to one score per vocabulary entry."""
        return self.output(self.embeding(inputs))

In [None]:
class Word2VecLightning(L.LightningModule):
    """Placeholder Lightning module; no model or training logic is defined yet."""

    def __init__(self):
        # The cell previously stopped at a bare ``def __init__():`` with no
        # ``self`` and no body, which does not parse. This is the smallest
        # valid constructor.
        # TODO: add the model, loss and optimizer once the design is settled.
        super().__init__()

In [2]:
from fastapi import FastAPI


ModuleNotFoundError: No module named 'fastapi'

In [7]:
import fastapi
from fastapi import FastAPI

from src.training.train import train_word2vec

app = FastAPI()


@app.post("/train")
async def train_model(file_path: str, embedding_dim: int = 300, batch_size: int = 64, max_epochs: int = 5):
    """Train a word2vec model and report success.

    Args:
        file_path: Corpus path forwarded to ``train_word2vec``.
        embedding_dim: Embedding width forwarded to ``train_word2vec``.
        batch_size: Batch size forwarded to ``train_word2vec``.
        max_epochs: Epoch limit forwarded to ``train_word2vec``.

    Returns:
        A dict with a single ``"message"`` key once training has returned.
    """
    # Imported locally so this cell does not depend on an extra top-level import.
    from fastapi.concurrency import run_in_threadpool

    # NOTE(review): ``file_path`` comes straight from the request and is passed
    # on unvalidated -- confirm this endpoint is only reachable by trusted clients.

    # ``train_word2vec`` is a plain synchronous call; running it directly in
    # this coroutine would block the event loop until training ends, so it is
    # moved to a worker thread. Its return value was never used.
    await run_in_threadpool(train_word2vec, file_path, embedding_dim, batch_size, max_epochs)
    return {"message": "model Trained Succesfully"}



ImportError: cannot import name 'Word2VecLightningModule' from 'src.model.word2vec' (/home/parthshr370/Downloads/Pytorch Practice/word2vec/src/model/word2vec.py)