# Web Scraping


In [105]:
import requests
import json
from bs4 import BeautifulSoup
import numpy as np
from collections import Counter
from tensorflow.data import Dataset 
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Embedding, Dropout, GRU
from tensorflow.keras.losses import sparse_categorical_crossentropy

**Training an RNN-based model takes at least 1M characters, i.e. roughly 5,000 comments.**

In [2]:
# Article whose comment section is scraped first.
page_url = 'https://www.pudelek.pl/bal-tvn-2022-czy-to-xena-wojownicza-ksiezniczka-nie-to-tylko-malgorzata-rozenek-w-zlotej-zbroi-zdjecia-6813461672434400a/'
# Without a timeout a stalled connection would block the notebook indefinitely.
page = requests.get(page_url, timeout=30)
# Fail loudly on 4xx/5xx so that an error page is never parsed as if it were the article.
page.raise_for_status()

In [3]:
# Parse the raw bytes of the downloaded page with the built-in HTML parser.
# (soup.prettify() returns the indented markup, handy when inspecting the page structure.)
soup = BeautifulSoup(markup=page.content, features='html.parser')

In [4]:
# Value of the class attribute searched for in the page's <div> elements.
clas = "sc-1mskw74-0 sc-7eqdwf-0 sc-q1w81m-0 cyNurK"
comm = soup.find_all("div", attrs={"class": clas})
# Print the text of the fifth match, then the number of matches found in the static HTML.
print(comm[4].get_text(), len(comm), sep="\n")

Bardziej przypomina kostium syreny...
19


**The HTML file does not contain all comments: pagination is interactive and does not change the URL address, so the comments have to be fetched from the API through GraphQL.**

In [46]:
query = """query comments(
    $productId: String!
      $materialId: String!
        $parentId: String
          $offset: Int = 0
            $first: Int = 10
              $sort: CommentsSortType = LATEST

                  ) {
                      comments: comments(
                            productId: $productId
                                contentId: $materialId
                                    parentId: $parentId
                                        offset: $offset
                                            first: $first
                                                sort: $sort
                                                  ) {
                                                        ...commentEdges
                                                            pageInfo {
                                                                    hasNextPage
                                                                          hasPreviousPage
                                                                                endCursor
                                                                                      totalCount
                                                                                              }
                                                                                                 }

                                                                                                      }
                                                                                                                                  
                                                                                                   fragment commentEdges on CommentsConnection {
                                                                                                         edges {
                                                                                                             node {
                                                                                                                      text
                                                                                                                          nick
                                                                                                                               id
                                                                                                                                   replies
                                                                                                                                       }
                                                                                                                                          }
                                                                                                                                             }"""
# One request returns at most 100 comments, so the top-level comments are
# downloaded page by page (offsets 0, 100, ..., 2400 at most).
main_comm = []
for off in range(0, 2500, 100):
  data = {"query": query, "variables": {"productId": "5980322589471361", "materialId": "6813461672434400", "offset": off, "first": 100, "sort": "LATEST"}}
  # A timeout keeps a stalled connection from hanging the notebook;
  # HTTP errors are raised right away instead of surfacing later as a KeyError.
  response = requests.post('https://data-api.wp.pl/graphql', json=data, timeout=30)
  response.raise_for_status()
  payload = response.json()
  main_comm.append(payload)
  # An empty page means every comment has been fetched: stop instead of
  # sending the remaining requests for nothing.
  if not payload['data']['comments']['edges']:
    break
# Debugging aids:
#   json.dumps(response.json(), indent=2, ensure_ascii=False)
#   print(str(main_comm[1])[-500:])

In [47]:
# Flatten the downloaded pages into one text corpus. Every comment becomes
#     'text': '<comment text>'
#     'nick': '<author nick>'
# followed by an empty line.
# The two fields are read directly from the node and rendered with repr().
# For ordinary comments this yields exactly the text that slicing str(node)
# produced, but it no longer breaks (IndexError / truncated text) when a
# comment itself contains "{" or ", 'nick'".
records = []
for mc in main_comm:
    for edge in mc['data']['comments']['edges']:
        a = edge['node']
        records.append("'text': {!r}\n'nick': {!r}\n\n".format(a['text'], a['nick']))

# A single join avoids re-copying the growing string for every comment.
comments = ''.join(records)

In [None]:
#print(comments[:500])

In [24]:
# Number of main comments collected so far.
# This counts occurrences of the substring 'nick', which appears once in the label of every record;
# a comment text or nickname that itself contains "nick" is counted as well.
comments.count('nick')

1642

In [25]:
# Size of the corpus in characters after collecting the main comments.
len(comments)

186625

**Getting replies to main comments**

In [48]:
# Ids of the top-level comments that have at least one reply.
parentId = [
    edge['node']['id']
    for page_of_comments in main_comm
    for edge in page_of_comments['data']['comments']['edges']
    if edge['node']['replies'] > 0
]

In [49]:
# Download the first page (up to 100) of replies for every top-level comment
# that has replies and append them to the corpus in the 'text'/'nick' layout.
# `flag` counts the threads that did not fit into this first page.
flag = 0
# The loop variable is not called `id`, so the builtin id() stays usable.
for parent_id in parentId:
  data = {"query": query, "variables": {"productId": "5980322589471361", "materialId": "6813461672434400", "parentId": parent_id, "offset": 0, "first": 100, "sort": "LATEST"}}
  response_rep = requests.post('https://data-api.wp.pl/graphql', json=data)
  replies = response_rep.json()
  thread = replies['data']['comments']
  edges = thread['edges']

  for edge in edges:
    a = edge['node']
    # Fields are read directly from the node; repr() gives the same text as
    # slicing str(node) did, without failing on "{" or ", 'nick'" in a comment.
    comments += "'text': {!r}\n'nick': {!r}\n\n".format(a['text'], a['nick'])

  # The original tested a['replies'] > 100 on each *reply* node, which says
  # nothing about the size of the thread being downloaded. A thread needs a
  # second pass when the API reports a next page (the query asks for
  # pageInfo.hasNextPage) or when the page came back full.
  page_info = thread.get('pageInfo') or {}
  if page_info.get('hasNextPage') or len(edges) >= 100:
    flag += 1

In [50]:
# Second pass, needed only when some thread did not fit into its first page of 100 replies.
if flag > 0:
  for parent_id in parentId:
    # Keep paging from offset 100 until the thread is exhausted. Requesting
    # only offset 100 (as before) silently dropped everything past the
    # 200th reply of a thread.
    for reply_offset in range(100, 2500, 100):
      data = {"query": query, "variables": {"productId": "5980322589471361", "materialId": "6813461672434400", "parentId": parent_id, "offset": reply_offset, "first": 100, "sort": "LATEST"}}
      response_rep = requests.post('https://data-api.wp.pl/graphql', json=data)
      replies = response_rep.json()
      edges = replies['data']['comments']['edges']

      for edge in edges:
        a = edge['node']
        # Direct field access instead of slicing str(node): same output for
        # ordinary comments, no failure on "{" or ", 'nick'" inside a comment.
        comments += "'text': {!r}\n'nick': {!r}\n\n".format(a['text'], a['nick'])

      # A page that is not full is the last page of this thread.
      if len(edges) < 100:
        break

In [29]:
# Size of the corpus in characters once the replies have been appended.
len(comments)

255364

In [33]:
# Show the last 500 characters of the corpus to check the layout of the appended records.
print(comments[-500:])

g przydałby się pani Gosi....'
'nick': 'Bbhjj'

'text': 'tyle łapek w górę , tylko za same emotikonki , to chyba kupione!!!'
'nick': 'jak nic!😂😁'

'text': 'Bez tej zbroi byłoby znośnie..ale ona chyba lubi wyglądać przaśnie'
'nick': 'Meh'

'text': 'Obstawiam że przegrała zaklad'
'nick': 'Kasia'

'text': 'Do omg. \nZa co lubisz „Małgosię”? \nTak pytam z ciekawości. '
'nick': 'Lola'

'text': 'lubię bardzo Małgosię ale ktoś jej zrobił mega psikusa, chyba ze to był bal przebierańców '
'nick': 'omg'




In [30]:
# Number of comments in the corpus, counted as occurrences of the substring 'nick'
# (one per record label; any "nick" inside a comment text or nickname is counted too).
comments.count('nick')

2169

**One article did not provide a sufficient number of characters for the neural network, so several other articles on the same topic need to be scraped as well.**



1.   `materialId`: 6812649644006144: https://www.pudelek.pl/widzowie-surowo-oceniaja-kolejny-wystep-malgorzaty-rozenek-w-ddtvn-mniej-o-sobie-bo-nas-pani-zameczy-6812649644006144a
2.   `materialId`: 6810881828096608: https://www.pudelek.pl/malgorzata-rozenek-na-galowo-pozuje-po-wyjsciu-z-dzien-dobry-tvn-6810881828096608a
3. `materialId`: 6810847091022464: https://www.pudelek.pl/malgorzata-rozenek-zaliczyla-wpadke-w-dzien-dobry-tvn-internauci-bezlitosnie-komentuja-nie-sprawdza-sie-w-roli-prowadzacej-6810847091022464a
4. `materialId`: 6806626165414656: https://www.pudelek.pl/wzruszona-malgorzata-rozenek-chlipie-po-debiucie-w-dzien-dobry-tvn-zdjecia-6806626165414656g



In [51]:
# materialId values of the additional articles whose comments are scraped next.
material_ids = [
    '6812649644006144',
    '6810881828096608',
    '6810847091022464',
    '6806626165414656',
]

In [56]:
def _format_comment(node):
  """Render one comment node as the two-line 'text'/'nick' record used in the corpus.

  The fields are read directly and rendered with repr(); for ordinary comments
  this is the same text that slicing str(node) produced, but it does not fail
  when a comment contains "{" or ", 'nick'".
  """
  return "'text': {!r}\n'nick': {!r}\n\n".format(node['text'], node['nick'])


def _fetch_page(material_id, offset, parent_id=None):
  """Request one page of up to 100 comments and return the decoded JSON response.

  When parent_id is given it is sent as the query's parentId variable
  (used here to download the replies to that comment); otherwise the
  variable is omitted, as in the top-level requests.
  """
  variables = {"productId": "5980322589471361", "materialId": material_id, "offset": offset, "first": 100, "sort": "LATEST"}
  if parent_id is not None:
    variables["parentId"] = parent_id
  # Timeout and status check: never hang on a stalled connection and never
  # try to decode an HTTP error page.
  response = requests.post('https://data-api.wp.pl/graphql', json={"query": query, "variables": variables}, timeout=30)
  response.raise_for_status()
  return response.json()


def _edges(payload):
  """Return the list of comment edges contained in one decoded response."""
  return payload['data']['comments']['edges']


for material_id in material_ids:
  # Top-level comments, 100 per request; stop at the first empty page.
  main_comm = []
  for off in range(0, 2500, 100):
    payload = _fetch_page(material_id, off)
    main_comm.append(payload)
    if not _edges(payload):
      break

  main_nodes = [edge['node'] for mc in main_comm for edge in _edges(mc)]
  records = [_format_comment(node) for node in main_nodes]

  # Ids of the top-level comments that have at least one reply.
  parentId = [node['id'] for node in main_nodes if node['replies'] > 0]

  # Replies: page through every thread until a page comes back not full.
  # (The previous `flag` check looked at the reply count of the replies
  # themselves, so threads with more than 100 replies were cut off.)
  for parent_id in parentId:
    for reply_offset in range(0, 2500, 100):
      reply_edges = _edges(_fetch_page(material_id, reply_offset, parent_id))
      records.extend(_format_comment(edge['node']) for edge in reply_edges)
      if len(reply_edges) < 100:
        break

  # Append this article's records to the corpus with a single concatenation.
  comments += ''.join(records)

In [57]:
# Number of comments in the corpus after adding the other articles, counted as occurrences of
# the substring 'nick' (any "nick" inside a comment text or nickname is counted too).
comments.count('nick')

7877

In [58]:
# Size of the complete corpus in characters.
len(comments)

1145077

In [63]:
# Show the last 500 characters of the complete corpus.
print(comments[-500:])

ie rozumie jak to jest - nie musimy rozumieć czemu ktoś marzył akurat o tym. '
'nick': 'Karina'

'text': 'Jakby Tobie wpadło tyle hajsu na konto za nic też byś płakała.Ja płaczę bo mi szybciej waluta wyjeżdża niż wjeżdża.'
'nick': 'Jus'

'text': 'Dla niej to praca. Człowiek stresuje się, gdy w pracy ma robić coś nowego, a często gdy stres minie, to człowiek musi się wypłakać. Nie wiem, na bezrobociu siedzisz, że cię to dziwi, że ktoś może się z pracy cieszyć czy nią stresować?'
'nick': 'michu'




In [60]:
# from google.colab import drive
# drive.mount('/content/drive')
# %cd /content/drive/MyDrive/Colab Notebooks/

Mounted at /content/drive
/content/drive/MyDrive/Colab Notebooks


In [61]:
# text_file = open("komentarze.txt", "w")
# n = text_file.write(comments)
# text_file.close()

# RNN for text generation

In [None]:
# comments = open("komentarze.txt", "r").read()

## Text processing

In [108]:
# Vocabulary: every distinct character of the corpus, in sorted order,
# together with the vocabulary size.
chars = sorted(set(comments))
chars_num = len(chars)
print(chars)
print(chars_num)

['\n', ' ', '!', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '=', '?', '@', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '[', '\\', ']', '_', '`', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '~', '«', '·', '»', '¿', 'Ó', 'à', 'ä', 'ç', 'ì', 'ñ', 'ò', 'ó', 'õ', 'ö', 'ý', 'Ą', 'ą', 'Ć', 'ć', 'Ę', 'ę', 'ķ', 'ĺ', 'Ł', 'ł', 'Ń', 'ń', 'Ś', 'ś', 'Ź', 'ź', 'Ż', 'ż', 'ž', 'π', '–', '—', '’', '‚', '“', '”', '„', '…', '⁉', '●', '☀', '☆', '☹', '☺', '♀', '♂', '♠', '♥', '⛔', '✌', '❌', '❓', '❗', '❤', '⭐', '️', '🅰', '🇧', '🇬', '🌈', '🌷', '🌸', '🌺', '🌻', '🌼', '🍊', '🍌', '🍑', '🍷', '🍸', '🍹', '🍼', '🏇', '🏻', '🏼', '🏿', '🐩', '🐷', '👀', '👋', '👌', '👍', '👎', '👏', '👶', '👹', '👻', '👽', '💁', '💃', '💐', '💓', '💕', '💚', '💦', '💧', '💩', '💪', '💫', '💮', '💰', '📚', '🕳', '🕺', '😀', '😁

In [68]:
# Character -> integer code: each character is mapped to its position in `chars`.
dictionary = dict(zip(chars, range(len(chars))))

In [71]:
# Integer code -> character: indexing this array with a code (or an array of codes) gives the characters back.
reverse_dictionary = np.asarray(chars)

In [72]:
# The whole corpus as an array of integer codes, one per character.
comments_encoded = np.array(list(map(dictionary.__getitem__, comments)))

In [74]:
# First 100 characters of the corpus, for comparison with their codes in the next cell.
comments[:100]

"'text': 'W tym się tylko stoi i pozuje rozumiem... bo chodzić się nie da na pewno!!!'\n'nick': 'lll'\n"

In [73]:
# Integer codes of the first 100 characters of the corpus.
comments_encoded[:100]

array([  7,  81,  66,  85,  81,   7,  26,   1,   7,  53,   1,  81,  86,
        74,   1,  80,  70, 109,   1,  81,  86,  73,  72,  76,   1,  80,
        81,  76,  70,   1,  70,   1,  77,  76,  87,  82,  71,  66,   1,
        79,  76,  87,  82,  74,  70,  66,  74,  14,  14,  14,   1,  63,
        76,   1,  64,  69,  76,  65,  87,  70, 107,   1,  80,  70, 109,
         1,  75,  70,  66,   1,  65,  62,   1,  75,  62,   1,  77,  66,
        84,  75,  76,   2,   2,   2,   7,   0,   7,  75,  70,  64,  72,
         7,  26,   1,   7,  73,  73,  73,   7,   0])

### Creating batches

In [78]:
# Print the first 1000 characters to see how long a few typical records are.
print(comments[:1000])

'text': 'W tym się tylko stoi i pozuje rozumiem... bo chodzić się nie da na pewno!!!'
'nick': 'lll'

'text': 'Tandetnie i tanio jak zawsze. '
'nick': 'Kasia '

'text': 'Blaszany drwal - samica z ochroniarzem.'
'nick': 'taaa'

'text': 'impreza charytatywna a ta sobie jaja robi...'
'nick': 'hie hie'

'text': 'KUKU NA MUNIU&gt; Następnym razem proponuję odziać się w złoty pas cnoty ... wykończony kryształkami. Powodzenia ... '
'nick': 'LEW'

'text': 'KUKU NA MUNIU. Następnym razem proponuję jako kreację złoty pas cnoty. '
'nick': 'Romontos'

'text': 'Kopia dody i Beyonce w kreacjach Thierrego Muglera. Gocha, on już nie żyje, nie będziesz jego muzą!  :)\n'
'nick': 'aniunia '

'text': 'Xena made in poland'
'nick': 'Bronia'

'text': 'Cena made in polandia'
'nick': 'Bronia'

'text': 'mialo byc swiatowo, wyszlo faszyn from Raszyn'
'nick': 'Doris'

'text': 'Ciekawe za czyją kasę te bale fundacja wyprawia i tylko dla wybranych. '
'nick': 'Gość '

'text': 'Coś strasznego, ona naprawdę ma coś nie 

In [129]:
# Two consecutive records copied from the corpus; the next cell measures the length of this sample.
typical_scheme = """'text': 'KUKU NA MUNIU&gt; Następnym razem proponuję odziać się w złoty pas cnoty ... wykończony kryształkami. Powodzenia ... '
'nick': 'LEW'

'text': 'KUKU NA MUNIU. Następnym razem proponuję jako kreację złoty pas cnoty. '
'nick': 'Romontos'"""

In [130]:
# Length in characters of the two-record sample above.
len(typical_scheme)

243

In [131]:
# Used twice below: batch_size + 1 is the length of every character sequence cut from the corpus,
# and batch_size is also the number of sequences grouped into one training batch.
batch_size = 256

In [132]:
# Number of complete sequences that fit in the corpus. Each sequence holds batch_size + 1 characters:
# splitting it into an input and a target shifted by one character uses up one extra character.
len(comments)//(batch_size+1)

4455

In [133]:
# Dataset that yields the corpus one integer code at a time.
dataset = Dataset.from_tensor_slices(tensors=comments_encoded)

# To preview the first 200 characters:
# for i in dataset.take(200):
#      print(reverse_dictionary[i.numpy()])

In [134]:
# Cut the stream of codes into consecutive sequences of batch_size + 1 codes;
# an incomplete sequence at the end of the corpus is dropped.
sequences = dataset.batch(batch_size=batch_size + 1, drop_remainder=True)

In [135]:
def inputs_and_targets(batch):
    """Split one sequence into an (input, target) pair shifted by one position.

    The first element of the returned tuple is `batch` without its last item,
    the second is `batch` without its first item, so item k of the second is
    the item that follows item k of the first.
    """
    shifted_by_one = batch[1:]
    all_but_last = batch[:-1]
    return all_but_last, shifted_by_one

In [136]:
# Turn every sequence into an (input, target) pair.
dataset = sequences.map(map_func=inputs_and_targets)

In [137]:
# Show the first (input, target) pair, each as integer codes and decoded back to text,
# with a blank separator printed between the two.
for input, target in dataset.take(1):
    for position, tensor in enumerate((input, target)):
        if position:
            print('\n')
        codes = tensor.numpy()
        print(codes)
        print(''.join(reverse_dictionary[codes]))

[  7  81  66  85  81   7  26   1   7  53   1  81  86  74   1  80  70 109
   1  81  86  73  72  76   1  80  81  76  70   1  70   1  77  76  87  82
  71  66   1  79  76  87  82  74  70  66  74  14  14  14   1  63  76   1
  64  69  76  65  87  70 107   1  80  70 109   1  75  70  66   1  65  62
   1  75  62   1  77  66  84  75  76   2   2   2   7   0   7  75  70  64
  72   7  26   1   7  73  73  73   7   0   0   7  81  66  85  81   7  26
   1   7  50  62  75  65  66  81  75  70  66   1  70   1  81  62  75  70
  76   1  71  62  72   1  87  62  84  80  87  66  14   1   7   0   7  75
  70  64  72   7  26   1   7  41  62  80  70  62   1   7   0   0   7  81
  66  85  81   7  26   1   7  32  73  62  80  87  62  75  86   1  65  79
  84  62  73   1  13   1  80  62  74  70  64  62   1  87   1  76  64  69
  79  76  75  70  62  79  87  66  74  14   7   0   7  75  70  64  72   7
  26   1   7  81  62  62  62   7   0   0   7  81  66  85  81   7  26   1
   7  70  74  77  79  66  87  62   1  64  69  62  7

In [138]:
# Shuffling draws from a buffer of this many elements rather than
# loading and shuffling the entire dataset in memory at once.
buffer_size = 10000

# Shuffle the (input, target) pairs, then group them into batches of
# batch_size pairs; a final incomplete batch is dropped.
shuffled = dataset.shuffle(buffer_size)
dataset = shuffled.batch(batch_size, drop_remainder=True)

## Model

**Creating a model based on DeepMoji architecture.**

### Choosing the embedding dimension

In [139]:
# Count every character of the corpus and keep, most frequent first,
# those that occur more than 500 times; the cell displays how many there are.
counter = Counter(comments)
a = {char: count for char, count in counter.most_common() if count > 500}
len(a)

75

In [140]:
# Size of the embedding vectors; equals the number of characters found above to occur more than 500 times.
embed_dim = 75

### Loss function tuning

In [141]:
def sparse_cat_loss(y_true, y_pred):
  """Sparse categorical cross-entropy computed on raw logits.

  y_pred is passed with from_logits=True, i.e. it is treated as unnormalised
  scores rather than probabilities.
  """
  loss = sparse_categorical_crossentropy(y_true, y_pred, from_logits=True)
  return loss

### Creating the model

**Simple model performance verification**

In [142]:
# Baseline network: character embedding -> one GRU layer -> one score per vocabulary character.
# The last layer has no activation because the loss is computed with from_logits=True.
# NOTE(review): the GRU is stateful while the training dataset is shuffled, so consecutive
# batches are not contiguous text - confirm that carrying the state over is intended.
model = Sequential([
    Embedding(chars_num, embed_dim, batch_input_shape=[batch_size, None]),
    GRU(1024, return_sequences=True, stateful=True, recurrent_initializer='glorot_uniform'),
    Dense(chars_num),
])
model.compile(optimizer='adam', loss=sparse_cat_loss)

In [144]:
# Train the baseline model for 10 passes over the batched dataset.
model.fit(x=dataset, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f147866da50>

In [146]:
# import pandas as pd
# from tensorflow.keras.models import load_model
# from google.colab import drive
# drive.mount('/content/drive')
# %cd /content/drive/MyDrive/Colab Notebooks/GIT_AI/
# model.save('simpeNLP.h5')
# # model = load_model('....h5')
# hist_json_file = 'simpeNLP.json' 
# with open(hist_json_file, mode='w') as f:
#      pd.DataFrame(model.history.history).to_json(f)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/MyDrive/Colab Notebooks/GIT_AI


In [145]:
# Deeper variant: character embedding -> two stacked LSTM layers -> one score per vocabulary character.
# No softmax on the last layer: the loss is computed with from_logits=True, so raw logits are expected.
model_DM = Sequential([
    Embedding(chars_num, embed_dim, batch_input_shape=[batch_size, None]),
    LSTM(1024, return_sequences=True, stateful=True, recurrent_initializer='glorot_uniform'),
    LSTM(1024, return_sequences=True, stateful=True, recurrent_initializer='glorot_uniform'),
    Dense(chars_num),
])
model_DM.compile(optimizer='adam', loss=sparse_cat_loss)

In [148]:
# Train the two-layer LSTM model for 3 passes over the batched dataset.
model_DM.fit(x=dataset, epochs=3)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7f1480935f50>

In [150]:
# import pandas as pd
# from tensorflow.keras.models import load_model
# from google.colab import drive
# drive.mount('/content/drive')
# %cd /content/drive/MyDrive/Colab Notebooks/GIT_AI/

# model_DM.save('DM_NLP.h5')

# # model = load_model('....h5')

# hist_json_file = 'DM_NLP.json' 
# with open(hist_json_file, mode='w') as f:
#      pd.DataFrame(model_DM.history.history).to_json(f)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/MyDrive/Colab Notebooks/GIT_AI


## Comments Generator