### Importing required libraries

In [2]:
from bs4 import BeautifulSoup
import urllib3
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import re
import os

### Wikipedia base URL and Petroleum seed page

In [3]:
# Wikipedia endpoints: the site root, and the article the crawl starts from.
# Relative hrefs scraped from pages are appended to BASE_URL to form full URLs.
BASE_URL = 'https://en.wikipedia.org'
MAIN_URL = "".join((BASE_URL, '/wiki/Petroleum'))

# Counter initialised to zero; none of the functions in this section update it.
total_added = 0

# Keep urllib3's InsecureRequestWarning messages out of the notebook output.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

In [4]:
def get_soup(url):
    """Download ``url`` with an HTTP GET and parse the response body as HTML.

    Args:
        url: Absolute URL of the page to fetch.

    Returns:
        A ``BeautifulSoup`` tree built from the raw response bytes with the
        ``lxml`` parser.
    """
    # A PoolManager is created for every call, so use it as a context manager:
    # leaving the block clears its connection pools instead of keeping the
    # keep-alive socket open until the object is garbage collected.
    with urllib3.PoolManager() as http:
        # The body is preloaded by default, so r.data is still available
        # after the pool has been cleared.
        r = http.request("GET", url)
    return BeautifulSoup(r.data, 'lxml')

### Text preprocessing - stop word & citation removal

In [25]:
# Look a token up in the `appos` mapping and substitute its replacement.
# Tokens that have no entry in `appos` are handed back unchanged.
def negationHandling(word):
    """Return ``appos[word]`` when ``word`` is a key of ``appos``, else ``word``.

    ``appos`` is a module-level mapping that must be defined before this
    function is called.
    """
    return appos[word] if word in appos else word
    
# Stop-word sets keyed by language name. Filled on first use so the NLTK
# corpus is loaded once instead of once per token.
_STOPWORD_SETS = {}


def _stopword_set(language):
    """Return the NLTK stop-word list for ``language`` as a cached frozenset."""
    if language not in _STOPWORD_SETS:
        _STOPWORD_SETS[language] = frozenset(stopwords.words(language))
    return _STOPWORD_SETS[language]


# Check that a word is NOT a stop word.
# A stop word is a word that is commonly present in most documents and does not affect the model.
def isNotStopWord(word):
    """Return True when ``word`` is absent from the English stop-word list.

    The comparison is an exact, case-sensitive membership test against
    ``stopwords.words('english')``; the word is not normalised first.

    Args:
        word: Token to test.

    Returns:
        ``True`` if the token should be kept, ``False`` if it is a stop word.
    """
    return word not in _stopword_set('english')


def preprocessingText(text):
    """Strip bracketed asides, stop words and most punctuation from ``text``.

    Processing steps:
      1. Delete every span that opens with ``(`` or ``[`` and ends at the
         nearest ``)`` or ``]`` on the same line (e.g. citation markers
         such as ``[1]``).
      2. Split the remainder into sentences with ``nltk.sent_tokenize`` and
         each sentence into tokens with ``nltk.word_tokenize``.
      3. Drop tokens for which ``isNotStopWord`` returns false. Tokens are
         not lower-cased, so they are tested exactly as they appear.
      4. Delete the listed punctuation characters from each token and drop
         tokens that become empty.

    Args:
        text: Raw text to clean.

    Returns:
        The surviving tokens of all sentences joined by single spaces
        (an empty string when nothing survives).
    """
    # Raw string: the same pattern as before, without the invalid "\(" / "\["
    # string escapes that Python warns about.
    text = re.sub(r"[\(\[].*?[\)\]]", "", text)

    # Characters deleted from every token. '.', '?' and '!' are absent so
    # sentence terminators survive ('[' and ']' are not listed either).
    punctuations = '"#$%&\'()*+,-/:;<=>@\\^_`{|}~'
    # Built once here instead of once per word.
    strip_table = str.maketrans('', '', punctuations)

    tokens = []
    for sentence in nltk.sent_tokenize(text):
        for word in nltk.word_tokenize(sentence):
            # Remove stop words (tested before punctuation is stripped)
            if not isNotStopWord(word):
                continue
            cleaned = word.translate(strip_table)
            # Skip tokens that consisted only of punctuation
            if cleaned:
                tokens.append(cleaned)

    # Join once at the end rather than rebuilding the string per sentence.
    return ' '.join(tokens)

### Parsing each linked petroleum webpage and writing into a text file

In [26]:
# write the given content into text files named <title>.txt
def write_text_into_file(title, data, modified_data,
                         raw_dir=r'/home/nb01/C_Drive/Knowledge_Graph_Creation/Dataset/PetroleumDataset',
                         processed_dir=r'/home/nb01/C_Drive/Knowledge_Graph_Creation/Dataset/PetroleumPreprocessedDataset'):
    """Save the raw and the preprocessed text of one page as ``<title>.txt``.

    The same file name is used in both directories: ``data`` goes to
    ``raw_dir`` and ``modified_data`` goes to ``processed_dir``. Files are
    written as UTF-8 and overwrite any existing file of that name. The
    working directory of the process is left untouched.

    Args:
        title: Page title used as the file name stem.
        data: Raw page text.
        modified_data: Preprocessed page text.
        raw_dir: Directory receiving the raw text file.
        processed_dir: Directory receiving the preprocessed text file.

    Raises:
        OSError: If a directory cannot be created or a file cannot be written.
    """
    # Titles may contain path separators (e.g. "AC/DC"); left as-is they would
    # be interpreted as sub-directories when the path is built.
    safe_title = title
    for separator in (os.sep, os.altsep):
        if separator:
            safe_title = safe_title.replace(separator, '_')
    filename = safe_title + ".txt"

    for directory, content in ((raw_dir, data), (processed_dir, modified_data)):
        os.makedirs(directory, exist_ok=True)
        # `with` guarantees the handle is closed even if the write fails.
        with open(os.path.join(directory, filename), 'w', encoding="utf-8") as f:
            f.write(content)

    print("Text file " + filename + " created")

In [27]:
# Fetch one linked page and return its paragraph text, raw and preprocessed
def parse_webpage_content(link):
    """Collect the text of every ``<p>`` element on the page at ``link``.

    Args:
        link: Absolute URL of the page to download.

    Returns:
        A ``(data, modified_data)`` tuple. ``data`` holds one line per
        paragraph (its text with surrounding whitespace stripped, followed
        by a newline). ``modified_data`` is the result of
        ``preprocessingText`` applied to ``data`` with a trailing "."
        appended.
    """
    page = get_soup(link)

    # Each paragraph contributes exactly one newline-terminated line.
    data = "".join(p.text.strip() + "\n" for p in page.find_all('p'))

    # The "." is added only to the text handed to the preprocessor; the raw
    # text returned to the caller is left as collected.
    modified_data = preprocessingText(data + ".")
    return data, modified_data

### Parsing all petroleum linked webpage content

In [28]:
def _iter_listed_links(results):
    """Yield ``(title, href)`` for each linked ``<li>`` in the first ``<ul>`` after every result element."""
    for res in results:
        link_list = res.find_next('ul')
        if link_list is None:
            continue
        for li in link_list.find_all('li'):
            anchor = li.a
            # List items without an anchor (or without an href) carry no page to fetch.
            if anchor is None or not anchor.get('href'):
                continue
            yield anchor.text.strip(), anchor['href'].strip()


# Crawl the pages listed behind the seed page's first toolbox link and save their text
def parse_all_petroleums_from_wiki(url, max_links=2):
    """Scrape and save the pages listed behind the first toolbox link of ``url``.

    The seed page's ``<div class="portal" id="p-tb">`` is located and the first
    link of the list that follows it is opened (presumably the "What links
    here" listing for the seed article - confirm against the live page
    layout). Every linked list item found after a ``<div id="mw-content-text">``
    on that listing is then fetched with ``parse_webpage_content`` and stored
    with ``write_text_into_file``, using the link text as the title.

    Args:
        url: Absolute URL of the seed article.
        max_links: Maximum number of linked pages to process in total;
            ``None`` processes every link found. Defaults to 2.

    Returns:
        None. A running count is printed after each page, and the total once
        more at the end.

    Raises:
        IndexError: If the seed page has no toolbox div, or the list after it
            has no items.
    """
    soup = get_soup(url)

    toolboxes = soup.find_all("div", {"class": "portal", "id": "p-tb"})
    if not toolboxes:
        # Same exception type as indexing the empty result, with a usable message.
        raise IndexError('no <div class="portal" id="p-tb"> found on ' + url)
    linked_url = toolboxes[0].find_next('ul').find_all('li')[0].a['href'].strip()

    soup = get_soup(BASE_URL + linked_url)
    results = soup.find_all("div", {"id": "mw-content-text"})

    no_of_petroleum_links = 0
    # A single flat loop, so `break` ends the whole crawl once the limit is
    # reached instead of only leaving the current result container.
    for name, link in _iter_listed_links(results):
        if max_links is not None and no_of_petroleum_links >= max_links:
            break
        data, modified_data = parse_webpage_content(BASE_URL + link)
        write_text_into_file(name, data, modified_data)
        no_of_petroleum_links += 1
        print(no_of_petroleum_links)
    print(no_of_petroleum_links)

In [29]:
def main():
    """Run the crawl starting from the configured seed article (``MAIN_URL``)."""
    seed_url = MAIN_URL
    parse_all_petroleums_from_wiki(seed_url)

In [30]:
# Start the crawl only when this code runs as the main program, not when it is imported.
if __name__ == "__main__":
    main()

Text file Alaska.txt created
1
Text file Alkane.txt created
2
2
