<a href="https://colab.research.google.com/github/patrickerson/corpus/blob/main/corpus.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Patrickerson dos Santos Veiga

Classe para modelagem dos dados que serão usados para realizar o web scraping. Logo abaixo estão as instâncias, cada uma com o nome do site, a URL, a tag e a classe do conteúdo, respectivamente.

In [None]:
class WebscrappingModal:
    """Describe one web page to scrape and hold the data extracted from it.

    Parameters
    ----------
    name
        Short identifier of the site.
    url
        Address of the page to download.
    content_tag
        Name of the HTML tag that wraps the content of interest.
    content_class
        Value of the ``class`` attribute of that tag.

    Attributes
    ----------
    df
        Object stored through :meth:`append_df`; ``None`` until then.
    sents_array
        Sentences stored through :meth:`set_sents_array`; empty until then.
    """

    def __init__(self, name, url, content_tag, content_class):
        self.name = name
        self.url = url
        self.content_tag = content_tag
        self.content_class = content_class
        # Initialised here so that reading them before the corresponding
        # setter has been called does not raise AttributeError.
        self.df = None
        self.sents_array = []

    def __repr__(self):
        return f"{type(self).__name__}(name={self.name!r}, url={self.url!r})"

    def append_df(self, df):
        """Store ``df`` on the instance, replacing any previous value."""
        self.df = df

    def set_sents_array(self, sents_array):
        """Store the list of sentences extracted from the page."""
        self.sents_array = sents_array

In [None]:
# Scrape target on abs.gov.au ("Basic Survey Design - Data Processing");
# the content is looked up in <div class="content">.
Abs = WebscrappingModal(
    name="abs",
    url="https://www.abs.gov.au/websitedbs/D3310114.nsf/home/Basic+Survey+Design+-+Data+Processing",
    content_tag="div",
    content_class="content",
)

In [None]:
# Scrape target on helpscout.com (the /company/legal/dpa/ page);
# the content is looked up in a <div> carrying the class string below.
Helpscout = WebscrappingModal(
    name="helpscout",
    url="https://www.helpscout.com/company/legal/dpa/",
    content_tag="div",
    content_class="Contentstyles__ContentDIV-sc-7tdxle-0 dzbACc",
)

In [None]:
# Scrape target on integrate.io (blog post "the-5-types-of-data-processing");
# the content is looked up in an <article> carrying the class string below.
Integrate = WebscrappingModal(
    name="integrate",
    url="https://www.integrate.io/blog/the-5-types-of-data-processing/",
    content_tag="article",
    content_class="container-fluid integrateio-blog-post-content",
)

In [None]:
# Scrape target on peda.net (computer-studies "data-processing" page);
# the content is looked up in an <article> carrying the class string below.
Peda = WebscrappingModal(
    name="peda",
    url="https://peda.net/kenya/ass/subjects2/computer-studies/form-3/data-processing",
    content_tag="article",
    content_class="textmodule document uuid-199b3e82-3256-11e7-bd46-d102fbf45fbc enclose",
)

In [None]:
# Scrape target on simplilearn.com ("what-is-data-processing-article");
# the content is looked up in an <article> carrying the class string below.
Simplilearn = WebscrappingModal(
    name="simplilearn",
    url="https://www.simplilearn.com/what-is-data-processing-article",
    content_tag="article",
    content_class="desig_author empty-text",
)

Classe para junção das models com a controller (Scrapper), contendo os conteúdos que serão analisados posteriormente.

In [None]:
class Middleware:
    """Registry that hands the page models over to the controller.

    ``contents`` lists every page model to be scraped, in the order in
    which they are processed.
    """

    contents = [Simplilearn, Integrate, Peda, Helpscout, Abs]

In [None]:
from bs4 import BeautifulSoup
from requests import get
import spacy
import pandas as pd
from spacy.language import Language
class Scrapper:
    """Load the text of every registered page and split it into sentences.

    The spaCy pipeline is a class attribute, so it is loaded once and shared
    by all instances. A custom component registered under the name
    ``"set_start_setence"`` adds extra sentence boundaries before the parser
    runs.
    """

    nlp = spacy.load("en_core_web_sm")
    contents = Middleware.contents
    encode = 'utf-8'

    # Fragments that carry no text and are dropped from the sentence lists.
    _IGNORED_FRAGMENTS = ("", " ", "\n", ",")

    def __init__(self):
        # ``nlp`` is shared between instances: adding a component whose name
        # is already in the pipeline raises, so only add it the first time.
        if "set_start_setence" not in self.nlp.pipe_names:
            self.nlp.add_pipe("set_start_setence", before="parser")

    def load_contents(self):
        """Load the sentences of every model listed in ``contents``."""
        for content in self.contents:
            self.load_content(content)

    @staticmethod
    @Language.component("set_start_setence")
    def set_start_setence(doc):
        """Mark the token following each delimiter as a sentence start.

        Parameters
        ----------
        doc
            spaCy document being processed by the pipeline.

        Returns
        -------
        The same document, with the additional sentence starts set.
        """
        custom_sent_start = ("!", "?", ",", ";", ".", "\n")
        # The last token is skipped because nothing follows it.
        for token in doc[:-1]:
            if token.text in custom_sent_start:
                doc[token.i + 1].is_sent_start = True
        return doc

    def load_content(self, content):
        """Extract the text of one page and store its sentences on the model.

        Parameters
        ----------
        content : WebscrappingModal
            Page model to load; its ``sents_array`` is replaced.
        """
        text = Content(content).get_text()
        doc = self.nlp(text.strip())
        # Line breaks are removed from each sentence before the empty
        # fragments are filtered out.
        fragments = (sent.text.replace("\n", "") for sent in doc.sents)
        content.set_sents_array(
            [
                fragment
                for fragment in fragments
                if fragment not in self._IGNORED_FRAGMENTS
            ]
        )

    def view_sents_array(self):
        """Print the sentence list of every model in ``contents``."""
        for content in self.contents:
            print(content.sents_array)

    def get_sets_arrays(self):
        """Return the sentence lists of all models, one list per model."""
        return [content.sents_array for content in self.contents]

In [None]:
class Content:
    """Download (with an on-disk cache) and extract the text of one page.

    Parameters
    ----------
    content : WebscrappingModal
        Page model providing ``name``, ``url``, ``content_tag`` and
        ``content_class``.
    """

    parser = "html.parser"
    # Seconds to wait for the server before the download is abandoned.
    timeout = 30

    def __init__(self, content):
        self.content = content

    def _cache_path(self):
        """Return the name of the local file that caches the page HTML."""
        return self.content.name + ".html"

    def _read_cached_html(self):
        """Return the cached HTML; raises FileNotFoundError if not cached."""
        with open(self._cache_path(), "r", encoding="utf-8") as file:
            return file.read()

    def get_text(self):
        """Return the text of the configured element of the page.

        The HTML is read from the local cache file; when that file does not
        exist yet, the page is downloaded first through :meth:`save_html`.
        Every ``<script>`` element is removed before the text is extracted.
        The element is looked up by tag name and, unless ``content_class``
        is the empty string, by class as well. Non-breaking spaces in the
        result are replaced by regular spaces.

        Returns
        -------
        str
            Extracted text.

        Raises
        ------
        ValueError
            If no element matches the configured tag/class.
        """
        try:
            html_text = self._read_cached_html()
        except FileNotFoundError:
            self.save_html()
            html_text = self._read_cached_html()

        soup = BeautifulSoup(html_text, self.parser)
        for script in soup.select("script"):
            script.extract()

        tag = self.content.content_tag
        css_class = self.content.content_class
        if css_class == "":
            find = soup.find(tag)
        else:
            find = soup.find(tag, class_=css_class)

        if find is None:
            # Fail with an explicit message instead of an AttributeError
            # on ``None.text``.
            raise ValueError(
                f"No <{tag}> element with class {css_class!r} found "
                f"for content {self.content.name!r}"
            )
        return find.text.replace("\xa0", " ")

    def save_html(self):
        """Download the page and store it in the local cache file.

        Caching the HTML makes development easier and avoids firewall
        blocks caused by repeated requests.

        Raises
        ------
        requests.HTTPError
            If the server answers with an error status; nothing is cached
            in that case, so an error page is never reused as content.
        """
        response = get(self.content.url, timeout=self.timeout)
        response.raise_for_status()
        response.encoding = "utf-8"
        with open(self._cache_path(), "w", encoding="utf-8") as file:
            file.write(response.text)

In [None]:
# Build the scraper; its constructor adds the custom sentence-start
# component to the spaCy pipeline.
scrapper = Scrapper()


In [None]:
# Read every registered page (from the local .html cache, downloading it
# first when missing) and store its sentences on the page model.
scrapper.load_contents()

In [None]:
# Print the sentence list of every page.
scrapper.view_sents_array()

['Whether you use the internet to learn about a certain topic,', 'complete financial transactions online,', 'order food,', 'etc.', 'data is being generated every single second.', 'The use of social media,', 'online shopping and video streaming services have all added to the increase in the amount of data.', 'A study by Domo estimates that 1.7MB data is created every second for every human being on the planet in 2020.', 'And in order to utilize and get insights from such a huge amount of data - data processing comes into play.', 'Moving forward,', 'let us understand what is data processing.', 'What Is Data Processing?', 'Data in its raw form is not useful to any organization.', 'Data processing is the method of collecting raw data and translating it into usable information.', 'It is usually performed in a step-by-step process by a team of data scientists and data engineers in an organization.', 'The raw data is collected,', 'filtered,', 'sorted,', 'processed,', 'analyzed,', 'stored,', '

# Segunda entrega
-----------

In [None]:
nlp = spacy.load("en_core_web_sm")


def lemmatize(text):
    """Turn a sentence into the sequence of its lemmas.

    Punctuation tokens and the delimiter tokens listed below are left out.

    Parameters
    ----------
    text : str
        Text to be converted into a sequence of lemmas.

    Returns
    -------
    list of str
        Lemmas of the remaining tokens, in order.
    """
    skipped_texts = ["!", "?", ",", ";", ".", "\n"]
    lemmas = []
    for token in nlp(text):
        if token.is_punct or token.text in skipped_texts:
            continue
        lemmas.append(token.lemma_)
    return lemmas


Transforma os arrays de sentenças em um único array

In [None]:
# Flatten the per-page sentence lists into a single list, keeping page order.
# get_sets_arrays() is called once instead of once per outer iteration.
sents_array = [
    sentence
    for page_sentences in scrapper.get_sets_arrays()
    for sentence in page_sentences
]

Instancia um DataFrame com o array de sentenças

In [None]:

# One row per sentence, labelled 0..n-1.
index = list(range(len(sents_array)))
data = {"sents": sents_array}
df = pd.DataFrame(data, index=index)

Aplica a função lemmatize para a coluna sents, criando um array de lexemas para cada sentença

In [None]:
# Lemmatize every sentence; each cell of the new column holds a list of lemmas.
tokens_lemmatize = df["sents"].apply(lemmatize)
df["tokens"] = tokens_lemmatize

Cria uma coluna no DataFrame para cada lexema, com todos os valores inicializados em 0

In [None]:
# Collect, in order of first appearance, every lemma that is not a column yet.
new_columns = list(
    dict.fromkeys(
        word
        for tokens in tokens_lemmatize
        for word in tokens
        if word not in df.columns
    )
)
# Add all zero-filled count columns with a single concat: inserting them one
# at a time fragments the frame and makes pandas emit a PerformanceWarning.
df = pd.concat(
    [df, pd.DataFrame(0, index=df.index, columns=new_columns)],
    axis=1,
)

  after removing the cwd from sys.path.


Adiciona +1 para cada ocorrência do lexema na sentença

In [None]:
# The enumeration index doubles as the row label of each sentence.
for row_label, lemmas in enumerate(tokens_lemmatize):
    for lemma in lemmas:
        df.at[row_label, lemma] += 1

In [None]:
# Bag-of-words matrix: everything except the raw sentence and token columns.
bow = df.drop(["tokens", "sents"], axis=1)

In [None]:
# Display the bag-of-words matrix (one row per sentence).
bow

Unnamed: 0,whether,you,use,the,internet,to,learn,about,a,certain,...,proportion,proportional,500,580,20,loan,housing,lending,probably,average
0,1,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1938,0,0,0,1,0,0,0,0,2,1,...,1,0,0,0,0,0,0,0,0,0
1939,0,0,0,2,0,2,0,0,2,0,...,0,1,1,1,0,0,0,0,0,0
1940,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1941,0,0,0,5,0,3,0,0,1,0,...,0,0,1,1,1,0,0,0,0,0
