In [None]:
import requests
from bs4 import BeautifulSoup
import csv
import os
import shutil
import numpy as np
import cv2
from google.colab.patches import cv2_imshow
import urllib.request as ur #used for request images for avoiding problems with cv2 read

# Separate images into folders: when True every article gets its own
# sub-directory BASE_PATH/images/<article title>; when False all images are
# written directly into BASE_PATH/images
separate =   False #@param {type: "boolean"}
# Base path where images and csv file are going to be stored
# (the 'images' directory and 'captions.csv' are created under it)
BASE_PATH = '/content' #@param {type: "string"}

def wikipedia_get_images(title, writer, size=(64, 64)):
  """Download the captioned thumbnail images of one Wikipedia article.

  The article HTML is fetched through the MediaWiki "parse" API, every
  ``div.thumbinner`` element is inspected, and each thumbnail that has a
  non-empty caption is downloaded, resized to ``size`` and stored as a PNG
  under ``BASE_PATH/images`` (inside a per-article sub-directory when the
  module-level ``separate`` flag is true). For every image that was actually
  written, an ``[image_id, caption]`` row is appended through ``writer``.

  Args:
    title: Name of the Wikipedia article.
    writer: Object with a ``writerow(list)`` method (e.g. ``csv.writer``).
    size: ``(width, height)`` passed to ``cv2.resize``; defaults to the
      original 64x64 thumbnails.

  Raises:
    requests.RequestException: If the API request for the article fails.
      Failures of individual image downloads are reported and skipped.
  """
  # MediaWiki Action API endpoint (English Wikipedia)
  URL = "https://en.wikipedia.org/w/api.php"

  PARAMS = {
    "action": "parse",
    "format": "json",
    "page": title
  }

  # A '/' inside the title would be interpreted as a directory separator
  safe_title = title.replace('/', '_')
  if separate:
    target_dir = BASE_PATH+'/images/'+safe_title
  else:
    target_dir = BASE_PATH+'/images'

  # One session for the API call and all image downloads; closed on exit
  with requests.Session() as session:
    # Identify the client explicitly instead of relying on the library default
    # NOTE(review): consider adding contact details to this string
    session.headers.update(
        {"User-Agent": "wikipedia-image-caption-scraper/1.0 (Colab notebook)"})

    request_json = session.get(url=URL, params=PARAMS, timeout=30).json()
    if "parse" not in request_json:
      # The API answered without a parsed page (e.g. unknown article)
      return

    # Take the html content of the json article
    page = request_json["parse"]["text"]["*"]
    soup = BeautifulSoup(page, 'html.parser')

    # Get only the divs where there are images
    thumb_divs = soup.find_all("div", {"class": "thumbinner"})

    # Create the directory where the images of the article are saved
    # (tolerates the same article appearing twice in the list)
    if separate and not os.path.exists(target_dir):
      os.mkdir(target_dir)

    image_count = 0
    # Iterate all the divs with images
    for div in thumb_divs:
      img = div.find("img")
      caption_div = div.find("div")
      if img is None or caption_div is None:
        continue

      image_url = img.get('src')
      # The caption is the text of the first nested div
      caption = caption_div.text
      # Only captioned images are useful; check before downloading anything
      if not image_url or not caption:
        continue

      # Thumbnail sources are protocol-relative ('//upload...'); use https
      if image_url.startswith('//'):
        image_url = 'https:'+image_url

      try:
        image_response = session.get(image_url, timeout=30)
      except requests.RequestException as error:
        print('Skipping '+image_url+': '+str(error))
        continue

      image_np = np.asarray(bytearray(image_response.content), dtype="uint8")
      if image_np.size == 0:
        # cv2.imdecode cannot handle an empty buffer
        continue
      image_cv2 = cv2.imdecode(image_np, cv2.IMREAD_UNCHANGED)
      if image_cv2 is None:
        # Not a decodable image (e.g. an error page)
        continue

      img_stretch = cv2.resize(image_cv2, size)
      # Generate the image id
      imageid = safe_title+str(image_count)
      # Write the imageid-caption pair only when the file was really saved
      if cv2.imwrite(target_dir+'/'+imageid+'.png', img_stretch):
        writer.writerow([imageid, caption])
        image_count += 1


# Array of the articles where images are taken
words_list = []
import csv
# Place a file articles.csv with the number of articles from which
# images will be downloaded (one article name on each line)
articles_path = BASE_PATH+'/articles.csv'
if os.path.exists(articles_path):
  # Open the same file whose existence was just checked
  with open(articles_path, newline='') as csvfile:
    for row in csvfile:
      article = row.strip()
      # Skip blank lines so no empty article title is requested
      if article:
        words_list.append(article)
else:
  # Fallback list used when no articles.csv is provided
  words_list = ['cat', 'dog', 'bird', 'cattle', 'monkey']
print(words_list)

# Start from an empty images directory on every run
if os.path.exists(BASE_PATH+'/images'):
  shutil.rmtree(BASE_PATH+'/images')
os.mkdir(BASE_PATH+'/images')

# Generate csv for imageid-caption pairs.
# newline='' is what the csv module expects for files it writes; utf-8 keeps
# non-ASCII caption text intact; 'with' closes the file even if a download fails
with open(BASE_PATH+'/captions.csv', 'w', newline='', encoding='utf-8') as file:
  header = ['image_id', 'caption']
  writer = csv.writer(file)
  writer.writerow(header)
  # Iterate the array of articles' name
  for title in words_list:
    # Download every image of the article
    wikipedia_get_images(title, writer)



['bacon', 'beef', 'chicken', 'cooked meat', 'duck', 'ham', 'kidneys', 'lamb', 'liver', 'potato', 'salami', 'sausages', 'pork', 'veal', 'apple', 'apricot', 'banana', 'blackberry', 'blackcurrant', 'blueberry', 'cherry', 'coconut', 'fig', 'gooseberry', 'grape', 'grapefruit', 'kiwi', 'lemon', 'lime', 'mango', 'melon', 'orange', 'pizza', 'burger', 'burrito', 'kebab', 'pasta', 'salad', 'peach', 'pear', 'pineapple', 'plum', 'pomegranate', 'raspberry', 'redcurrant', 'rhubarb', 'strawberry', 'anchovy', 'cod', 'haddock', 'herring', 'kipper', 'mackerel', 'pilchard', 'plaice', 'salmon', 'sardine', 'sole', 'trout', 'tuna', 'biscuits', 'chocolate', 'crisps', 'hummus', 'nuts', 'olives', 'peanuts', 'sweets', 'walnuts', 'ketchup', 'mayonnaise', 'mustard', 'pepper', 'salt', 'vinaigrette', 'vinegar', 'cereal', 'cornflakes', 'honey', 'jam', 'marmalade', 'muesli', 'porridge', 'toast', 'kangaroo', 'eggs', 'lobster', 'fish', 'turkey', 'fast food', 'onion fries', 'french fries', 'fried chicken', 'taco', 'nood

In [None]:
# Copy to Drive
from google.colab import drive
drive.mount('/content/drive')
!cp -r /content/images /content/drive/MyDrive/"Colab Notebooks"/TFG/dataset
!cp /content/captions.csv /content/drive/MyDrive/"Colab Notebooks"/TFG/dataset

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
