In [1]:

from nltk.corpus import reuters
import re
import spacy
import requests
from bs4 import BeautifulSoup
import re
import random
from spacy.training.example import Example
from spacy import displacy
from spacy.language import Language
import json
import os
from tqdm import tqdm
import numpy as np


## Dataset scraping classes

In [2]:
class Scrapping_data:
    """Scrape oil-related product and company names and store them as JSON.

    Both public methods download web pages with ``requests``, pull names out of
    the HTML with ``BeautifulSoup``, write the normalised names to a JSON file
    in the current working directory and return that file's name.
    """

    # Source pages, kept in one place so they are easy to find and override.
    PRODUCTS_URL = 'https://innovativewealth.com/inflation-monitor/what-products-made-from-petroleum-outside-of-gasoline/'
    COMPANIES_URL = 'https://companiesmarketcap.com/oil-gas/largest-oil-and-gas-companies-by-market-cap/?page={page}'
    # Seconds to wait for a response before giving up (requests has no default).
    REQUEST_TIMEOUT = 30

    def __init__(self):
        pass

    @staticmethod
    def _normalise_names(raw_names):
        """Return lower-cased, trimmed names, dropping ``None`` and blank entries.

        :param raw_names: iterable of strings (or ``None``) as returned by ``tag.string``
        :return: list of plain ``str`` names, in the original order
        """
        cleaned = []
        for name in raw_names:
            # ``tag.string`` is None when a tag has no single text child.
            if name is None:
                continue
            name = str(name).replace('\n', '').strip().lower()
            if name:
                cleaned.append(name)
        return cleaned

    def _fetch_soup(self, url):
        """Download ``url`` and return its HTML parsed with ``html.parser``.

        :raises requests.HTTPError: if the server answers with an error status
        """
        r = requests.get(url, timeout=self.REQUEST_TIMEOUT)
        # Fail loudly instead of silently parsing an error page into an empty list.
        r.raise_for_status()
        return BeautifulSoup(r.text, 'html.parser')

    def scrape_prodructs(self):
        """Scrape petroleum-based product names and save them as JSON.

        Every ``<td>`` of the source page is read, three hardcoded products are
        added, and the normalised list is written to
        ``scrapped_oil_products.json``.

        :return: the name of the JSON file that was written
        """
        soup = self._fetch_soup(self.PRODUCTS_URL)
        _products = [cell.string for cell in soup.find_all('td')]

        # hardcoded products that are added on top of the scraped ones
        _products.extend(['kerosine', 'crude oil', 'fuel'])

        # drop None/blank cells and lowercase everything
        _products = self._normalise_names(_products)

        print(f'products amount= {len(_products)}')
        # saving the data in JSON format
        with open('scrapped_oil_products.json','w',encoding='utf-8') as f:
            json.dump(_products,f)

        return 'scrapped_oil_products.json'

    def scrape_oil_companies(self):
        """Scrape oil and gas company names (result pages 1 to 3) and save them as JSON.

        The text of every ``<div class="company-name">`` is collected and the
        normalised list is written to ``scrapped_oil_companies.json``.

        :return: the name of the JSON file that was written
        """
        _com = []
        # pages 1, 2 and 3 of the ranking
        for page in range(1,4):
            soup = self._fetch_soup(self.COMPANIES_URL.format(page=page))
            for tag in soup.find_all('div',{'class':'company-name'}):
                _com.append(tag.string)

        # lowercase the company names, remove newlines, drop None/blank entries
        _com = self._normalise_names(_com)

        print(f'companies amount= {len(_com)}')
        # saving the data in JSON format
        with open('scrapped_oil_companies.json','w',encoding='utf-8') as f:
            json.dump(_com,f)

        return 'scrapped_oil_companies.json'
        
  

In [None]:
class Scrape_wiki_data(object):
    '''
    Scrape the Wikipedia articles of the largest oil and gas companies and
    store their cleaned text in ``wiki_data.json``.
    '''

    def __init__(self):
        '''
        Run the whole scraping pipeline on construction:
        1: get all the links in the table of the Wikipedia list page
        2: keep only the oil company links
        3: get the cleaned paragraphs of every company page
        4: save the data in a json file
        '''
        self.all_links = self.get_links()
        self.oil_company_links = self.get_oil_company_links(self.all_links)
        self._all_praragraphs = self.get_paragraphs(self.oil_company_links)
        self.save_json(self._all_praragraphs)

    def get_links(self) ->list:
        '''
        Scrape the links from the first "wikitable" of the list page.
        :return: a list with the href of the first link of every table cell
                 (oil companies and their respective countries)
        :raises ValueError: if the page contains no table with class "wikitable"
        '''
        #hardcoded url
        wiki_url = 'https://en.wikipedia.org/wiki/List_of_largest_oil_and_gas_companies_by_revenue'
        print('Fetching main wiki article: %s' % wiki_url)

        r = requests.get(wiki_url, timeout=30)
        print('Done. Extracting table links..')
        #parse the html page using bs4
        soup = BeautifulSoup(r.text,'html.parser')
        table = soup.find('table', 'wikitable')
        if table is None:
            raise ValueError('no table with class "wikitable" found at %s' % wiki_url)

        #gives at most one link per <td> tag (cells without a link are skipped)
        links_with_nations = []
        for cell in table.find_all('td'):
            if cell.a is not None:
                links_with_nations.append(cell.a.get('href'))

        return links_with_nations

    def get_oil_company_links(self,links_with_nations: list)-> list:
        '''
        Filter the oil company links out of the mixed country/company links.
        The input list is not modified.
        :param links_with_nations: hrefs as returned by get_links()
        :return: a list with the links of the oil companies
        '''
        #keep only internal article links; a new list is built instead of popping
        #by saved index, because every pop shifts the positions that follow
        wiki_links = [
            link for link in links_with_nations
            if isinstance(link, str) and link.startswith('/wiki/')
        ]

        #assumes the links alternate country, company (layout of the wiki table),
        #so every second link is a company
        return wiki_links[1::2]

    @staticmethod
    def _drop_blank_sentences(paragraphs):
        '''
        Remove empty or whitespace-only sentences and the paragraphs left empty.
        :param paragraphs: list of paragraphs, each a list of sentence strings
        :return: a new list of non-empty paragraphs with non-blank sentences
        '''
        cleaned = []
        for paragraph in paragraphs:
            sentences = [s for s in paragraph if s and s.strip()]
            if sentences:
                cleaned.append(sentences)
        return cleaned

    def get_paragraphs(self, links_of_oil_companies):
        '''
        Download every linked article and split its <p> tags into sentences.
        :param links_of_oil_companies: relative "/wiki/..." links
        :return: a list of paragraphs, each a list of cleaned sentences
        '''
        all_sentences = []
        #looping through all the links
        for link in tqdm(links_of_oil_companies):
            url = 'https://en.wikipedia.org'+link
            r = requests.get(url, timeout=30)
            soup = BeautifulSoup(r.text,'html.parser')
            for p in soup.find_all('p'):
                all_sentences.append(self.clean(p.get_text()))

        return self._drop_blank_sentences(all_sentences)

    #taken from https://github.com/blueprints-for-text-analytics-python/blueprints-text/blob/master/ch12/Knowledge_Graph.ipynb
    @staticmethod
    def clean(text):
        '''
        Normalise a chunk of raw text and split it into sentences.
        :param text: raw text of a paragraph or article
        :return: list of lower-cased sentences (split on ". ")
        '''
        text = text.replace('&lt;','<') # html escape
        text = re.sub(r'[<>]', '"', text) # quotation marks instead of <>
        text = re.sub(r'[ ]*"[A-Z\.]+"', '', text) # drop stock symbols
        text = re.sub(r'[ ]*\([A-Z\.]+\)', '', text) # drop stock symbols
        text = re.sub(r'\bdlr(s?)\b', r'dollar\1', text, flags=re.I)
        text = re.sub(r'\bmln(s?)\b', r'million\1', text, flags=re.I)
        text = re.sub(r'\bpct\b', r'%', text, flags=re.I)
        text = re.sub(r'"', r'', text) # quotation marks
        text = re.sub(r'\s+', ' ', text) # multiple whitespace by one
        text = re.sub('[()]', '', text) # remaining parentheses
        pattern = r'\[.*?\]'
        text = re.sub(pattern, '', text) # bracketed parts such as [1]
        text = text.lower()

        return text.split('. ')

    def save_json(self, text):
        '''Write ``text`` to wiki_data.json in the current working directory.'''
        with open('wiki_data.json','w',encoding='utf-8') as f:
            json.dump(text,f)
        
        

## Creating the training dataset

In [21]:
class Training_data(object):
    """Build NER training examples labelled PRODUCT / COMPANY.

    On construction the class gathers sentences from the Reuters corpus and
    from ``wiki_data.json`` (scraped first if the file is missing), labels the
    first scraped product or company name found in each sentence, and writes
    the examples to ``trainig_data.json``.

    Each example has the form ``(text, {'entities': [(start, end, label)]})``.
    The list is available as ``self._other_cat_training_data``.
    """

    #initialising the code, needs to receive the nlp object
    def __init__(self, nlp: "Language"):
        #scrape the wiki data only if it is not cached on disk yet
        if not os.path.isfile('wiki_data.json'):
            self.wiki = Scrape_wiki_data()
        self.wikidata = self.load_json('wiki_data.json')

        self.nlp = nlp
        self.scrape = Scrapping_data()

        self._products = self.load_json(self.scrape.scrape_prodructs())
        self._com =  self.load_json(self.scrape.scrape_oil_companies())

        #list of documents, each document being a list of sentences
        self.other_cate_text = self.making_sentences()
        print(f"amount of text in article: {len(self.other_cate_text)}")

        #each wiki entry is read as a list of sentences and added whole;
        #adding the sentences one by one as bare strings would make the
        #loops below walk over single characters
        self.other_cate_text.extend(self.wikidata)
        print(f"amount of text in article + wiki : {len(self.other_cate_text)}")

        self._other_cat_training_data = []

        #first pass labels products, second pass labels companies
        for parse in (self.parse_train_data_products, self.parse_train_data_company):
            #looping through all the paragraphs
            for sentences in self.other_cate_text:
                #looping through all the sentences
                for sentence in sentences:
                    example = parse(str(sentence))
                    #sentences without a known name give no example
                    if example is not None:
                        self._other_cat_training_data.append(example)

        print(f'amount of datapoints: {len(self._other_cat_training_data)}')
        self.save_data(self._other_cat_training_data)

    #save the data
    def save_data(self,data):
        """Write ``data`` to trainig_data.json in the current working directory."""
        with open('trainig_data.json','w',encoding='utf-8') as f:
            json.dump(data,f)

    #making the training sentences out of the categories given in the list
    def making_sentences(self):
        """Return the cleaned Reuters documents of the selected categories.

        :return: list of documents, each a list of sentences
                 (see Scrape_wiki_data.clean)
        """
        interesting_categories = ['crude','castor-oil','gas','fuel','nat-gas','oil']

        other_cate_text = []
        #a document can belong to several categories; add it only once
        seen_fileids = set()
        for category in interesting_categories:
            for fileid in reuters.fileids(categories=[category]):
                if fileid in seen_fileids:
                    continue
                seen_fileids.add(fileid)
                #clean the raw text and split it into sentences (on '. ')
                other_cate_text.append(Scrape_wiki_data.clean(reuters.raw(fileid)))

        return other_cate_text

    @staticmethod
    def _label_first_match(txt, names, label):
        """Label the first name of ``names`` that occurs in ``txt`` as a whole word.

        :param txt: the sentence to search
        :param names: candidate names, tried in order
        :param label: entity label to assign, e.g. 'PRODUCT'
        :return: ``(txt, {'entities': [(start, end, label)]})`` or None if no name occurs
        """
        for name in names:
            #cheap substring test first; the regex only runs on candidates
            if not name or name not in txt:
                continue
            #require word boundaries so 'gas' does not match inside 'gasoline'
            match = re.search(r'(?<!\w)' + re.escape(name) + r'(?!\w)', txt)
            if match:
                return (txt, {'entities': [(match.start(), match.end(), label)]})
        return None

    #parsing the text into a training format
    def parse_train_data_products(self,txt):
        """
        Loops through the product list. If a product occurs in the text, its first
        and last character positions are looked up and a PRODUCT entity is assigned.
        result and structure example -> ('gasoline supplies would also be limited', {'entities': [(0, 8, 'PRODUCT')]})
        at char index 0 until 8 we have the word 'gasoline' which is in the product list.
        Returns None when no product occurs in the text.
        """
        return self._label_first_match(txt, self._products, 'PRODUCT')

    def parse_train_data_company(self,txt):
        """
        Same as parse_train_data_products, but for the company list and with the
        COMPANY label. Returns None when no company occurs in the text.
        """
        return self._label_first_match(txt, self._com, 'COMPANY')

    #load the json from the scraped class
    def load_json(self,file):
        """Return the parsed content of the JSON file ``file``."""
        with open(file,'r',encoding='utf-8')as f:
            data = json.load(f)
        return(data)

## TRAINING LOOP


In [25]:
class Train_model(object):
    """Train a spaCy NER component on (text, annotations) examples."""

    def __init__(self, nlp: "Language", training_data:list):
        """
        :param nlp: the spaCy pipeline to train
        :param training_data: examples of the form (text, {'entities': [(start, end, label)]});
                              empty entries (None, e.g. JSON null) are dropped
        """
        self.nlp = nlp
        #copy and filter: None entries cannot be unpacked into (text, annotations)
        self.training_data = [example for example in training_data if example]

    def create_blank_nlp(self, n_iter: int = 25):
        """Add an 'ner' pipe to the pipeline if needed, train it and return the pipeline.

        :param n_iter: number of passes over the training data
        :return: the trained nlp object
        :raises ValueError: if there are no training examples
        """
        if not self.training_data:
            raise ValueError('training_data contains no examples')

        #attach the ner pipe as the last pipe, or reuse the one already present
        if 'ner' in self.nlp.pipe_names:
            ner = self.nlp.get_pipe('ner')
        else:
            ner = self.nlp.add_pipe('ner',last=True)

        #register every label that occurs in the training data
        for _, annotations in self.training_data:
            for ent in annotations.get('entities'):
                ner.add_label(ent[2])#ent[2] -> 'PRODUCT'or 'COMPANY'

        #initialise the model weights; returns the optimizer used for the updates
        optimizer = self.nlp.initialize()
        for i in range(n_iter):
            #every iteration we'll shuffle the training data
            random.shuffle(self.training_data)
            losses ={}
            for text, annotations in self.training_data:
                doc = self.nlp.make_doc(text)
                ex = Example.from_dict(doc,annotations)
                self.nlp.update([ex],sgd=optimizer, losses=losses)
            print(f'iteration {i} - {losses}')
        return self.nlp

## Main function: build the training data, train the model and save it

In [None]:
def main():
    """Build (or load) the training data, train the NER model and save it to 'random'."""
    nlp = spacy.blank('en')
    #build the training data if it is not on disk yet
    if not os.path.isfile('trainig_data.json'):
        #Train_model needs the list of examples, not the Training_data object
        data = Training_data(nlp)._other_cat_training_data

    #load the data if it is already on disk
    else:
        with open('trainig_data.json','r',encoding='utf-8')as f:
            data= json.load(f)
        print('kleir')

    #sentences without a match are stored as None/null; they are not examples
    data = [example for example in data if example]

    train_model = Train_model(nlp,data)
    model = train_model.create_blank_nlp()
    model.to_disk('random')

if __name__ == '__main__':
    main()


In [27]:
# Compare the entities found by the stock English model and by the custom model.
# displacy is already imported in the first cell.
nlp = spacy.load('en_core_web_lg')
# main() saves the trained pipeline to 'random', so that is the path loaded here.
_nlp = spacy.load('random')
doc1 = nlp('so far this year, distillate demand fell 2.3 % to 3.20 million royal shell inc from 3.28 million in 1986, gasoline demand was 6.63 million bp, off 0.3 % from 6.65 million, and residual fuel demand fell 4.9 % to 1.35 million bpd from 1.42 million, the eia said')
doc = _nlp('so far this year, distillate demand fell 2.3 % to 3.20 million royal shell inc from 3.28 million in 1986, gasoline demand was 6.63 million bp, off 0.3 % from 6.65 million, and crude oil demand fell 4.9 % to 1.35 million bpd from 1.42 million, the eia said')
# custom model first, stock model second
displacy.render(doc,style='ent')
displacy.render(doc1,style='ent')

In [160]:
# Sanity check: print the start and end character offsets of a known term,
# computed the same way as the entity spans in the training data.
txt = "so far this year, distillate demand fell 2.3 % to 3.20 million royal shell inc from 3.28 million in 1986, gasoline demand was 6.63 million bp, off 0.3 % from 6.65 million, and crude oil demand fell 4.9 % to 1.35 million bpd from 1.42 million, the eia said"
string = 'crude oil'
start_char = txt.find(string)
print(start_char, start_char + len(string))


176 185
