In [64]:
import fitz
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
from nltk.corpus import wordnet
import string
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import pandas as pd
from gensim.models import Word2Vec, FastText

In [65]:
# Request the four NLTK data packages by name.
# NOTE(review): the recorded output below shows every download failing SSL
# certificate verification, so the later cells presumably rely on copies that
# are already installed on this machine — confirm before running elsewhere.
# The last call is the cell's final expression, so its return value is displayed.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Error loading punkt: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     unable to get local issuer certificate (_ssl.c:1000)>
[nltk_data] Error loading stopwords: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     unable to get local issuer certificate (_ssl.c:1000)>
[nltk_data] Error loading wordnet: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     unable to get local issuer certificate (_ssl.c:1000)>
[nltk_data] Error loading averaged_perceptron_tagger: <urlopen error
[nltk_data]     [SSL: CERTIFICATE_VERIFY_FAILED] certificate verify
[nltk_data]     failed: unable to get local issuer certificate
[nltk_data]     (_ssl.c:1000)>


False

### Preprocess for Skipgram and CBOW

In [66]:
def preprocess_text(text):
    """Turn raw text into a list of lower-cased, lemmatised content tokens.

    Pipeline: lower-case the text, tokenise it with ``word_tokenize``, drop
    English stop words, drop tokens made up solely of punctuation characters,
    drop plain numbers (``"12"``) and list-style numbers (``"3."``), then
    lemmatise every remaining token using its part-of-speech tag.

    Args:
        text: The string to preprocess.

    Returns:
        list[str]: The surviving lemmas, in their original order.
    """
    stop_words = set(stopwords.words('english'))
    punctuation_chars = set(string.punctuation)

    def is_number_like(token):
        # Matches "12" as well as enumeration markers such as "12.".
        return token.isdigit() or (token[:-1].isdigit() and token[-1] == '.')

    def to_wordnet_pos(treebank_tag):
        # Only the first letter of the tag decides the WordNet class;
        # anything unrecognised is treated as a noun.
        by_first_letter = {
            'J': wordnet.ADJ,
            'V': wordnet.VERB,
            'N': wordnet.NOUN,
            'R': wordnet.ADV,
        }
        return by_first_letter.get(treebank_tag[:1], wordnet.NOUN)

    tokens = [
        token
        for token in word_tokenize(text.lower())
        if token not in stop_words
        # Test every character instead of `token in string.punctuation`:
        # the substring test lets multi-character punctuation tokens such as
        # "``", "''" or "..." through because they are not contiguous
        # substrings of string.punctuation.
        and not all(char in punctuation_chars for char in token)
        and not is_number_like(token)
    ]

    lemmatizer = WordNetLemmatizer()
    # The loop variable is called `tag` so it does not shadow the imported
    # pos_tag function.
    lemmas = [
        lemmatizer.lemmatize(token, to_wordnet_pos(tag))
        for token, tag in pos_tag(tokens)
    ]

    # Defensive: never hand empty strings to the embedding models.
    return [lemma for lemma in lemmas if lemma]

In [67]:
def preprocess_alt_text(text):
    """Clean a sequence of sentences into space-joined strings of lemmas.

    For every non-empty sentence the words are POS-tagged, words containing a
    digit and English stop words are dropped, punctuation characters are
    stripped from the remaining words, and each word is lemmatised and
    lower-cased.

    Args:
        text: Iterable of sentence strings. Falsy entries are skipped.

    Returns:
        list[str]: One cleaned string per non-empty input sentence (the
        string may be empty when nothing in the sentence survives).
    """
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()
    punctuation_chars = set(string.punctuation)
    cleaned_sentences = []

    for sentence in text:
        if not sentence:
            continue

        lemmas = []
        for word, treebank_tag in nltk.pos_tag(word_tokenize(sentence)):
            if any(char.isdigit() for char in word) or word.lower() in stop_words:
                continue

            stripped = ''.join(char for char in word if char not in punctuation_chars)
            if not stripped:
                # The token was pure punctuation. Emitting an empty lemma here
                # would leave doubled spaces inside the cleaned sentence.
                continue

            lemma = lemmatizer.lemmatize(stripped, pos=get_wordnet_pos(treebank_tag))
            lemmas.append(lemma.lower())

        # Joining once avoids repeated string concatenation and stray
        # leading/trailing spaces.
        cleaned_sentences.append(' '.join(lemmas))

    return cleaned_sentences

def get_wordnet_pos(treebank_tag):
    """Translate a POS tag into the one-letter code passed to the lemmatiser.

    Only the tag's first letter matters: ``J`` -> ``'a'``, ``V`` -> ``'v'``,
    ``N`` -> ``'n'``, ``R`` -> ``'r'``. Every other tag (including an empty
    one) falls back to ``'n'``.
    """
    code_by_initial = {'J': 'a', 'V': 'v', 'N': 'n', 'R': 'r'}
    return code_by_initial.get(treebank_tag[:1], 'n')

In [68]:
def process_pdf(pdf_path):
    """Extract and preprocess every sentence of a PDF, page by page.

    Args:
        pdf_path: Path handed to ``fitz.open``.

    Returns:
        tuple: ``(preprocessed_sentence, preprocessed_text_corpus, altt_corpus)``

        * ``preprocessed_sentence`` - token list of the last sentence that was
          preprocessed, or ``[]`` when the document yields no sentences.
        * ``preprocessed_text_corpus`` - one list per page containing
          ``(original_sentence, tokens)`` pairs from ``preprocess_text``.
        * ``altt_corpus`` - flat list of the strings produced by
          ``preprocess_alt_text`` across all pages.
    """
    pdf_document = fitz.open(pdf_path)
    preprocessed_text_corpus = []
    altt_corpus = []
    # Pre-bind the name so a PDF without any sentence returns an empty token
    # list instead of raising UnboundLocalError at the return statement.
    preprocessed_sentence = []

    try:
        for page_number in range(pdf_document.page_count):
            page_text = pdf_document[page_number].get_text()
            sentences = nltk.sent_tokenize(page_text)

            altt_corpus += preprocess_alt_text(sentences)

            preprocessed_page_text = []
            for sentence in sentences:
                preprocessed_sentence = preprocess_text(sentence)
                preprocessed_page_text.append((sentence, preprocessed_sentence))
            preprocessed_text_corpus.append(preprocessed_page_text)
    finally:
        # Release the file handle even if tokenising or preprocessing fails.
        pdf_document.close()

    return preprocessed_sentence, preprocessed_text_corpus, altt_corpus

In [69]:
# Source document for the whole notebook.
pdf_path = r"/Users/idealguy/Downloads/fdi.pdf"

preprocessed_sentence, preprocessed_text_corpus, altt_corpus = process_pdf(pdf_path)

# Flatten the per-page structure.
# `corpuss` keeps (tokens, original sentence) pairs for the retrieval functions;
# `corpus` keeps just the token lists that are fed to the embedding models.
corpuss = [
    (tokens, sentence)
    for page_corpus in preprocessed_text_corpus
    for sentence, tokens in page_corpus
]
corpus = [tokens for tokens, _ in corpuss]

### CBOW


In [70]:
from gensim.models import Word2Vec
# Word2Vec model for the "CBOW" section (sg=0): 60-dimensional vectors, a
# context window of 2, and min_count=1 so no word is dropped for being rare.
model_cbow = Word2Vec(corpus, min_count=1, vector_size=60, window=2, sg=0)
# NOTE(review): `corpus` was already passed to the constructor, so gensim has
# presumably trained once before this call and these 250 epochs are additional
# passes — confirm against the gensim documentation.
# As the cell's last expression, the value returned by train() is displayed.
model_cbow.train(corpus, total_examples=len(corpus), epochs=250)

(551697, 716750)

In [71]:
# Number of token lists (one per extracted sentence) fed to the models.
len(corpus)

215

In [72]:
# Shorthand for the CBOW model's word-vector store, used by the cells below.
word_vectors_cbow = model_cbow.wv

In [73]:
# Show every vocabulary entry of the CBOW model together with its vector,
# one blank line between entries.
for word in word_vectors_cbow.index_to_key:
    print(f"Word: {word}", f"Vector: {word_vectors_cbow[word]}", "", sep="\n")

Word: investment
Vector: [-0.4871151   0.07271659  0.188748    0.7203594  -0.6643704  -0.27473113
  0.0078851   1.5190917  -0.5553389   0.22840527  0.67559403  0.59770054
 -0.6221148   0.22544119 -0.45695764  0.3556585   0.40601516  0.27492583
 -0.69609326 -1.0923051  -0.15724169  1.3736537   1.2477167   0.8449092
 -0.3906803   0.09588549 -0.11016608  0.20365141 -0.48531002  0.2818122
 -0.07939587  0.08508082  2.1874106  -0.95943123 -0.0497775   0.70168275
  0.68608713 -0.3763331   0.0143463  -0.16277339 -1.500122   -0.40558654
 -1.3937125  -0.689061    0.4724043  -0.3930512  -1.584945    0.7319834
  0.04223853  1.012714   -0.58224005 -0.244933   -0.66204405  0.09595431
  1.0342098   1.1498212   0.6182733  -0.10792306 -0.5688735   0.41721877]

Word: direct
Vector: [ 0.02501251 -0.31676182 -0.3463965  -0.44669297 -0.6366472  -0.33529568
 -0.18020715  1.1377362  -0.34217578  0.24814752  0.6576975   0.56460583
 -0.43271744 -0.6106909  -0.65328276  0.4665098   0.74455404  0.47304505
 -0.26

In [74]:
# Similarity of one word pair under the CBOW vectors.
# The pair is named once so the printed label always states which words were
# compared, in the same format the FastText cell uses.
word_a, word_b = 'note', 'deal'
similarity = word_vectors_cbow.similarity(word_a, word_b)
print(f"Similarity between '{word_a}' and '{word_b}': {similarity}")

Similarity between word1 and word2: 0.8830834031105042


### Skipgram

In [75]:
# Word2Vec model for the "Skipgram" section (sg=1); the other hyper-parameters
# match the CBOW cell (60 dimensions, window of 2, min_count=1).
model_skip = Word2Vec(corpus, min_count=1, vector_size=60, window=2, sg=1)
# NOTE(review): `corpus` was already passed to the constructor, so gensim has
# presumably trained once before this call and these 200 epochs are additional
# passes — confirm against the gensim documentation.
# As the cell's last expression, the value returned by train() is displayed.
model_skip.train(corpus, total_examples=len(corpus), epochs=200)

(441248, 573400)

In [76]:
# Shorthand for the skip-gram model's word-vector store, used by the cells below.
word_vectors_skip = model_skip.wv

In [77]:
# Show every vocabulary entry of the skip-gram model together with its vector,
# one blank line between entries.
for word in word_vectors_skip.index_to_key:
    print(f"Word: {word}", f"Vector: {word_vectors_skip[word]}", "", sep="\n")

Word: investment
Vector: [ 0.17061973  0.5146419  -0.2438879   0.51802945 -0.6273962  -0.47912252
 -0.02877015  0.6179146  -0.25543547 -0.39212164  0.2915303   0.27118942
 -0.22299276  0.1946531   0.00361227  0.0169816   0.4246313   0.47334936
 -0.62561804 -0.7318009   0.04115974  1.0808787   0.47427657  0.541854
  0.02760559 -0.20991081 -0.37246487  0.63777256 -0.82629234  0.35331175
 -0.34397516  0.02336343  1.6915233  -0.5032076   0.40335333  1.0028546
  0.16448952  0.01598084 -0.35381892 -0.2801803  -1.1771437  -0.16419403
 -0.7220155  -1.0736638   0.38575804 -0.5325383  -0.30684558  0.377469
 -0.1102868   0.8877666  -0.01352727  0.20995615 -0.09182211  0.10190029
  0.5735927   0.5065068   0.41371733 -0.15423548 -0.2942811   0.10120895]

Word: direct
Vector: [ 0.3930781   0.29768524 -0.49980554 -0.10170605 -0.9032988  -0.15871702
  0.19947504  0.6260326  -0.18399747  0.30547154  0.09804441  0.03105796
 -0.02874016 -0.39938942 -0.29634568  0.15434134  0.28143352  0.5991803
 -0.37792

In [78]:
# Similarity of one word pair under the skip-gram vectors.
# The pair is named once so the printed label always states which words were
# compared, in the same format the FastText cell uses.
word_a, word_b = 'note', 'deal'
similarity = word_vectors_skip.similarity(word_a, word_b)
print(f"Similarity between '{word_a}' and '{word_b}': {similarity}")

Similarity between word1 and word2: 0.8520674705505371


### GloVe (stand-in: the cell below trains a Word2Vec CBOW model, not GloVe)

In [79]:
# NOTE(review): although this section is labelled GloVe, the model built here
# is a gensim Word2Vec with sg=0 and the same hyper-parameters as `model_cbow`.
model_glo = Word2Vec(sentences=corpus, min_count=1, vector_size=60, window=2, sg=0)
model_glo.train(corpus, total_examples=len(corpus), epochs=250)

# `word_vectors` is a shared name: the FastText cell rebinds it later on.
word_vectors = model_glo.wv

# Show every vocabulary entry with its vector, one blank line between entries.
for word in word_vectors.index_to_key:
    print(f"Word: {word}", f"Vector: {word_vectors[word]}", "", sep="\n")

Word: investment
Vector: [-0.4871151   0.07271659  0.188748    0.7203594  -0.6643704  -0.27473113
  0.0078851   1.5190917  -0.5553389   0.22840527  0.67559403  0.59770054
 -0.6221148   0.22544119 -0.45695764  0.3556585   0.40601516  0.27492583
 -0.69609326 -1.0923051  -0.15724169  1.3736537   1.2477167   0.8449092
 -0.3906803   0.09588549 -0.11016608  0.20365141 -0.48531002  0.2818122
 -0.07939587  0.08508082  2.1874106  -0.95943123 -0.0497775   0.70168275
  0.68608713 -0.3763331   0.0143463  -0.16277339 -1.500122   -0.40558654
 -1.3937125  -0.689061    0.4724043  -0.3930512  -1.584945    0.7319834
  0.04223853  1.012714   -0.58224005 -0.244933   -0.66204405  0.09595431
  1.0342098   1.1498212   0.6182733  -0.10792306 -0.5688735   0.41721877]

Word: direct
Vector: [ 0.02501251 -0.31676182 -0.3463965  -0.44669297 -0.6366472  -0.33529568
 -0.18020715  1.1377362  -0.34217578  0.24814752  0.6576975   0.56460583
 -0.43271744 -0.6106909  -0.65328276  0.4665098   0.74455404  0.47304505
 -0.26

In [None]:
# Similarity of one word pair under whichever vectors `word_vectors` currently
# refers to (the preceding cell binds it to `model_glo.wv`).
# The pair is named once so the printed label always states which words were
# compared, in the same format the FastText cell uses.
word_a, word_b = 'note', 'deal'
similarity = word_vectors.similarity(word_a, word_b)
print(f"Similarity between '{word_a}' and '{word_b}': {similarity}")

Similarity between word1 and word2: 0.8830834031105042


### FastText

In [None]:
from gensim.models import FastText

# FastText model with sg=1, 60-dimensional vectors and a context window of 5
# (the Word2Vec cells use a window of 2).
model_fast = FastText(sentences=corpus, vector_size=60, window=5, min_count=1, sg=1)
model_fast.train(corpus, total_examples=len(corpus), epochs=250)

# Rebinds the shared `word_vectors` name to the FastText vectors.
word_vectors = model_fast.wv

# Show every vocabulary entry with its vector (no blank line between entries).
for word in word_vectors.index_to_key:
    print(f"Word: {word}", f"Vector: {word_vectors[word]}", sep="\n")

Word: investment
Vector: [-0.38069248  0.11473774  0.8190267  -0.76016676 -0.23514317 -0.24373621
  0.04396214 -0.22949241 -0.08332752 -0.6904003   0.42091712  0.04840703
 -0.20687343  0.56211555  0.347473    0.12687002 -0.98962337  0.21369432
  0.71458876 -0.5986594   0.13329895 -0.22314256  0.3579797   0.87714076
  0.00167103  0.27921    -0.7199165   0.17378198 -0.141579   -0.9021931
 -0.896085   -0.00660871  0.20729978 -0.55766565  0.7437954   0.1390683
 -0.11031242  0.80188644 -0.7414581   0.23351489 -0.3425589   0.13837625
  0.70610905 -0.12097549 -0.49095687  0.4243122  -0.64255637 -0.5634142
 -0.2516871   0.8631385  -0.77727026  0.10530487  0.10396681 -0.26804653
  0.04281136 -0.80709594  1.0240457   0.25382727  0.6516516  -0.17203948]
Word: direct
Vector: [-0.35301626  0.39597484  0.33278868 -0.3482655   0.06974836 -0.19181173
 -0.33865148 -0.23215419  0.12628302 -0.28075624  0.66787356 -0.01762622
 -0.11902406 -0.01679794  0.46568388 -0.17089324 -1.0649737   0.37440753
  0.690

In [None]:
# Similarity of 'note' and 'deal' under whichever vectors `word_vectors`
# currently refers to; the preceding cell binds it to `model_fast.wv`.
similarity = word_vectors.similarity('note', 'deal')
print(f"Similarity between 'note' and 'deal': {similarity}")

Similarity between 'note' and 'deal': 0.7717303037643433


### Sentence similarity

In [None]:
def get_sentence_embedding(sentence_tokens, model):
    """Average the word vectors of a tokenised sentence.

    Args:
        sentence_tokens: Iterable of token strings.
        model: Object exposing ``wv`` with ``key_to_index`` membership and
            ``wv[token]`` vector lookup.

    Returns:
        The element-wise mean of the vectors of all tokens found in the
        model's vocabulary, or ``None`` when no token is known.
    """
    known_vectors = [
        model.wv[token]
        for token in sentence_tokens
        if token in model.wv.key_to_index
    ]

    if not known_vectors:
        return None

    return sum(known_vectors) / len(known_vectors)

In [None]:
# Demo: embed one sentence with the skip-gram model.
# `preprocessed_sentence` is the first value returned by process_pdf, i.e. the
# token list of the last sentence it preprocessed.
sentence_embedding = get_sentence_embedding(preprocessed_sentence, model_skip)
print("Sentence Embedding:", sentence_embedding)

Sentence Embedding: [ 0.40762883 -0.15597929  0.80260754  0.16695146 -0.21771152 -0.46769175
  0.6026646   0.33482438 -0.28496212  0.80141515 -0.25593016 -0.28069454
  0.1860431  -0.12451565  0.21450529 -0.24899702 -0.15297778 -0.64488375
 -1.0431945  -0.815211    0.4146617  -1.2086818   0.08410405  0.6714705
  0.02241136  0.41277534 -0.2623279  -0.04053234  0.07768077 -0.25163448
 -0.18481678 -0.7915076  -0.39863616 -0.47554484  0.6384628   0.31476656
  0.0222251  -0.55002344 -0.9996363   0.3885551   0.23648617 -0.4032835
 -0.95043737 -0.02273306  0.16250905 -0.03549254 -0.6491085   0.13873385
  0.317847    0.11031945 -0.44728383  0.1318047   0.46156254  0.43429965
  0.06544609  0.06763045  0.28932047 -1.2407743   1.1748817  -0.9512168 ]


In [None]:
def cosine_similarity(vector1, vector2):
    """Cosine similarity of two 1-D vectors.

    Note: this name is also imported from ``sklearn.metrics.pairwise``
    elsewhere in the notebook; whichever statement ran last decides which
    implementation the name refers to.

    Args:
        vector1: First vector (array-like accepted by ``np.dot``).
        vector2: Second vector of the same length.

    Returns:
        ``dot(vector1, vector2) / (|vector1| * |vector2|)``, or ``0.0`` when
        either vector has zero norm (the ratio is undefined there and would
        otherwise come back as NaN with a RuntimeWarning).
    """
    norm_product = np.linalg.norm(vector1) * np.linalg.norm(vector2)
    if norm_product == 0:
        return 0.0
    return np.dot(vector1, vector2) / norm_product

### Finding most Relevant Sentence for Word2Vec Model

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

def find_most_relevant_sentences_using_word_embeddings(question, corpus, model, top_n=3):
    """Rank corpus sentences by embedding similarity to a question.

    Both the question and every sentence are represented by the mean of the
    vectors of their in-vocabulary tokens; sentences are ranked by the cosine
    similarity between those means.

    The cosine is computed locally with NumPy. The notebook binds the name
    ``cosine_similarity`` twice (a hand-written 1-D version and the sklearn
    import), so relying on that global would make the result depend on cell
    execution order.

    Args:
        question: Raw question text; it is passed through ``preprocess_text``.
        corpus: Iterable of ``(sentence_tokens, original_sentence)`` pairs.
        model: Object exposing ``wv`` with ``key_to_index`` membership and
            ``wv[token]`` vector lookup.
        top_n: Maximum number of sentences to return.

    Returns:
        A list of at most ``top_n`` ``(original_sentence, similarity)`` tuples,
        best match first, with ``similarity`` as a plain ``float``. When no
        question token is in the vocabulary the string
        ``"Unable to find relevant sentences."`` is returned instead (kept for
        existing callers that compare against it).
    """
    def mean_embedding(tokens):
        # Mean vector of the known tokens, or None if none of them is known.
        vectors = [model.wv[token] for token in tokens if token in model.wv.key_to_index]
        if not vectors:
            return None
        return np.mean(np.asarray(vectors, dtype=np.float64), axis=0)

    def cosine(first, second):
        # A zero-norm mean has no direction; score it 0.0 instead of NaN so
        # the sort below stays well defined.
        denominator = np.linalg.norm(first) * np.linalg.norm(second)
        if denominator == 0:
            return 0.0
        return float(np.dot(first, second) / denominator)

    question_embedding = mean_embedding(preprocess_text(question))
    if question_embedding is None:
        return "Unable to find relevant sentences."

    scored_sentences = []
    for sentence_tokens, original_sentence in corpus:
        sentence_embedding = mean_embedding(sentence_tokens)
        # Sentences without any known token cannot be scored and are skipped.
        if sentence_embedding is not None:
            scored_sentences.append(
                (original_sentence, cosine(question_embedding, sentence_embedding))
            )

    scored_sentences.sort(key=lambda pair: pair[1], reverse=True)
    return scored_sentences[:top_n]


### Finding most Relevant Sentence for BERT

In [None]:
from transformers import AutoTokenizer, AutoModel
import torch

# Checkpoint name handed to both from_pretrained() calls below.
BERT_Model = "bert-base-uncased"
# Loaded once at module level; sent_embedding() reads `tokenizer` and
# `model_bert` as globals.
tokenizer = AutoTokenizer.from_pretrained(BERT_Model)
model_bert = AutoModel.from_pretrained(BERT_Model)

def sent_embedding(sent):
    """Encode one sentence with the global BERT model.

    The sentence is tokenised with truncation and padding to a fixed length
    of 128, run through ``model_bert`` without gradient tracking, and the
    model's ``pooler_output`` is returned as a NumPy array.
    """
    encoded = tokenizer.encode_plus(
        sent,
        max_length=128,
        truncation=True,
        padding='max_length',
        return_tensors='pt',
    )
    with torch.no_grad():
        pooled = model_bert(**encoded).pooler_output
    return pooled.detach().numpy()

def calculate_similarity(sent_embedding1, sent_embedding2):
    """Return the cosine similarity of two sentence embeddings as a Python float.

    Both inputs are converted to tensors first; ``.item()`` requires the
    similarity result to hold exactly one value.
    """
    first = torch.tensor(sent_embedding1)
    second = torch.tensor(sent_embedding2)
    score = torch.nn.functional.cosine_similarity(first, second)
    return score.item()

def find_most_relevant_sentences_using_bert(question, corpus, model, top_n=3):
    """Rank corpus sentences against a question using BERT sentence embeddings.

    Args:
        question: Raw question text, embedded as-is.
        corpus: Iterable of ``(sentence_tokens, original_sentence)`` pairs; the
            tokens are re-joined with spaces before being embedded.
        model: Accepted for signature parity with the word-embedding variant;
            not used here (``sent_embedding`` reads the global model).
        top_n: Maximum number of sentences to return.

    Returns:
        list: Up to ``top_n`` ``(original_sentence, similarity)`` tuples,
        highest similarity first.
    """
    question_embedding = sent_embedding(question)

    scored = [
        (
            original_sentence,
            calculate_similarity(question_embedding, sent_embedding(' '.join(sentence_tokens))),
        )
        for sentence_tokens, original_sentence in corpus
    ]

    ranked = sorted(scored, key=lambda pair: pair[1], reverse=True)
    return ranked[:top_n]


In [None]:
# Ask for a question and a model, then fetch the three best-matching sentences.
user_question = input("Enter your question: ")
# The menu uses real newlines throughout (a doubled backslash would print a
# literal "\n" after the heading); surrounding whitespace in the answer is ignored.
a = input('Choose Model:\n1)CBOW\n2)Skip Gram\n3)Glove\n4)FastText\n5)BERT\n').strip()

# Start from an empty result so an invalid choice reaches the
# "Unable to find relevant sentences." handling later in this cell instead of
# raising NameError or silently reusing the result of a previous run.
top_relevant_sentences = []

if a == '1':
    print('CBOW')
    top_relevant_sentences = find_most_relevant_sentences_using_word_embeddings(user_question, corpuss, model_cbow, top_n=3)
elif a == '2':
    print('Skip Gram')
    top_relevant_sentences = find_most_relevant_sentences_using_word_embeddings(user_question, corpuss, model_skip, top_n=3)
elif a == '3':
    print('Glove')
    top_relevant_sentences = find_most_relevant_sentences_using_word_embeddings(user_question, corpuss, model_glo, top_n=3)
elif a == '4':
    print('Fast Text')
    top_relevant_sentences = find_most_relevant_sentences_using_word_embeddings(user_question, corpuss, model_fast, top_n=3)
elif a == '5':
    print('BERT')
    top_relevant_sentences = find_most_relevant_sentences_using_bert(user_question, corpuss, model_bert, top_n=3)
else:
    print("Invalid Choice")

def capitalize_first_letter(sentence):
    """Upper-case only the first character, leaving the rest untouched.

    Returns an empty string for any falsy input (``""`` or ``None``).
    """
    if not sentence:
        return ""
    head, tail = sentence[0], sentence[1:]
    return head.upper() + tail
    
# Connectors inserted between consecutive result sentences, in order.
required_words = ["In addition", "Moreover"]
required_punctuation = "."
modified_content = ""

# The retrieval functions return either a list of (sentence, score) tuples or,
# when nothing could be matched, a plain message string.
if not top_relevant_sentences or isinstance(top_relevant_sentences, str):
    modified_content = "Unable to find relevant sentences."
else:
    cleaned_sentences = []
    for sentence, _ in top_relevant_sentences:
        # split()/join collapses every run of whitespace, newlines included,
        # into one space. Deleting the newlines first would glue together
        # words that were separated only by a line break in the PDF.
        modified_sentence = capitalize_first_letter(' '.join(sentence.split()))
        if not modified_sentence:
            # Whitespace-only sentence: nothing to show, and indexing its
            # last character would fail.
            continue
        if not modified_sentence.endswith(required_punctuation):
            modified_sentence += required_punctuation
        cleaned_sentences.append(modified_sentence)

    for i, modified_sentence in enumerate(cleaned_sentences):
        modified_content += modified_sentence
        # Only add a separator when another sentence follows, so the text
        # never ends with a dangling connector.
        if i + 1 < len(cleaned_sentences):
            connector = required_words[i] + ' ' if i < len(required_words) else ''
            modified_content += ' ' + connector

print("\nModified Content:")
print(modified_content)


Fast Text

Modified Content:
FDI data Data on FDI flows and stocks are offered by several sources, the most important of which are explained in the Annex. In addition The main statistical sources for FDI are reviewed and the discrepancies are shown for total inward FDI flows and stocks both for emerging and industrial countries. Moreover There are also tables of ratios of FDI flows to Gross Capital Formation and of FDI stocks to GDP.
