# Diafilm parsing



## Fetch diafilms

And save to html pages for later parsing

In [None]:
import csv
import requests
import time

In [None]:
csv_file = None

with open('data/diafilms-v1.csv', mode ='r') as file:   
    csv_file = csv.DictReader(file)

    for row in csv_file:
        time.sleep(0.5)
        print(f"Downloading {row['name']} page from {row['url']}")
        r = requests.get(row["url"])
        with open(f'html/index-{row["id"]}.html', 'w') as f:
            f.write(r.text)
        

# Parse webpages 

In [None]:
import glob
import re
from bs4 import BeautifulSoup


In [None]:

# Read all pages in html folder
pages = glob.glob("html/*.html")

site_url = 'https://diafilmy.su'
new_data = []

for p in range(len(pages)):
    with open(pages[p], 'r') as page:

        dict = {
            "id": re.findall('\d+',page.name)[0],
            "img": [],
            "img-cover": "",
            "description": "",
            "categories": []
        }

        soup = BeautifulSoup(page.read(), 'html.parser')
        title = soup.find(id='news-title').get_text()
        img_cover = soup.find(property='og:image').get('content')
        desc = soup.find(property='og:description').get('content')
        slide = soup.find(class_='cycle-slideshow')
        txt_with_category = soup.find(class_='berrorstxt')

        # Set cover image 
        dict["img-cover"] = img_cover

        # Set description
        dict["description"] = desc

        # Set categories
        for a in txt_with_category.find_all(href=re.compile('diafilmy.su/diafilmy/')):
            dict["categories"].append(a.get_text().strip())
        
        # Get gallery images 
        for img in slide.find_all('img'):
            dict["img"].append(site_url + img.get('src'))

        new_data.append(dict)

print(new_data[5])


## Save data to json 

In [None]:
import json 


In [None]:
with open('data/post-metadata.json', 'w+') as f:
    json.dump(new_data, f)

## GlueData™


In [None]:
# read and parse JSON file
new_json = open('data/post-metadata.json', mode='r')
data = new_json.read()
obj = json.loads(data)

# read and parse CSV file and use list instead of dictionary
csv_file = open('data/diafilms-v1.csv', mode='r')
csvr = csv.reader(csv_file)
csvr = list(csvr)

# ID Lookup working test         
index = 5235
json_obj = obj[index]
csv_lookup = csvr[int(json_obj["id"])]
# print(json_obj,csv_lookup)

for i, name in enumerate(csvr):
    # Index 0 in csv is fieldnames anyways 
    # so len(csvr) - 1 = len(obj)
    if i == 0:
        print("ID \t Fieldname")
        for j in range(len(name)):
            print(f"{j} \t {name[j]}")
    else:
        try: 
            i_lookup = int(obj[i-1]["id"])
            obj[i-1]["name"] = csvr[i_lookup][1]
            obj[i-1]["url"] = csvr[i_lookup][2]
            obj[i-1]["studio"] = csvr[i_lookup][3]
            obj[i-1]["year"] = csvr[i_lookup][4]
            obj[i-1]["color"] = csvr[i_lookup][5]
            obj[i-1]["type"] = csvr[i_lookup][6]
            obj[i-1]["frames"] = csvr[i_lookup][7]
            obj[i-1]["index"] = csvr[i_lookup][8]
            obj[i-1]["number"] = csvr[i_lookup][9]
            obj[i-1]["author"] = [
                x.strip() for x in csvr[i_lookup][10].split(',')]
            obj[i-1]["artist"] = [
                x.strip() for x in csvr[i_lookup][11].split(',')]
            obj[i-1]["designer"] = [
                x.strip() for x in csvr[i_lookup][12].split(',')]
            obj[i-1]["editor"] = [
                x.strip() for x in csvr[i_lookup][13].split(',')]
            obj[i-1]["artistic-editor"] = [
                x.strip() for x in csvr[i_lookup][14].split(',')]
            obj[i-1]["photographer"] = [
                x.strip() for x in csvr[i_lookup][15].split(',')]
            obj[i-1]["film"] = csvr[i_lookup][16]
            obj[i-1]["quality"] = csvr[i_lookup][17]
        except:
            print("BAD INDEX", i)


In [None]:
img_count = 0
for i in range(len(obj)):
    for j in range(len(obj[i]["img"])):
        img_count +=1

print("Number of images", img_count)

## Save to new file 

In [None]:
with open('data/post-metadata-all.json', 'w+') as f:
    json.dump(obj, f)
with open('data/post-metadata-all-mini.json', 'w+') as f:
    obj_mini = obj[0:100]
    json.dump(obj_mini, f)

# Database Import

In [1]:
import django_for_jupyter
import json
import csv


In [2]:
# read and parse full JSON file
new_json = open('data/post-metadata-all.json', mode='r')
data = new_json.read()
obj = json.loads(data)

obj[112]['name']

'Основы композиции в художественном конструировании'

In [3]:
from diafilms.models import Film, Image, Frame, FilmCover, Category, Tag

In [15]:
Film.objects.all().delete()
Image.objects.all().delete()
Frame.objects.all().delete()
FilmCover.objects.all().delete()
Category.objects.all().delete()
Tag.objects.all().delete()

(0, {})

## Import into Models

In [None]:


for i in obj:
    if not Film.objects.filter(id=i['id']).exists():
        print(f'Adding #{i["id"]} - {i["name"]}')
        f = Film(
            id= int('0'+i['id']),
            name = i['name'],
            url = i['url'],
            studio = i['studio'],
            year = int('0'+i['year']),
            color = i['color'],
            type = i['type'],
            index = i['index'],
            number = i['number'],
            film = i['film'],
            quality = i['quality'],
            description = i['description'],
        )

        # Create dict for Foreign keys

        categories = {
            'author': i['author'],
            'artist': i['artist'],
            'designer': i['designer'],
            'editor': i['editor'],
            'artistic_editor': i['artistic-editor'],
            'photographer': i['photographer'],
            'categories': i['categories'],
        }

        for c, cat in categories.items():
            select_cat = None
            if not Category.objects.filter(name=c).exists():
                # print(f'Creating new category {c}')
                select_cat = Category(
                    name = c,
                )
            else:
                select_cat = Category.objects.filter(name=c)[0]

            select_cat.save()

            for tag in cat:
                select_tag = None
                if not Tag.objects.filter(name=tag).exists():
                    # print(f'Creating new tag {tag} for category {c}')
                    select_tag = Tag(
                        name = tag,
                        category = select_cat,
                    )
                else:
                    select_tag = Tag.objects.filter(name=tag)[0]
                select_tag.save()
            

            f.category = select_cat

        f.save()

        # Create frames
        for key in range(len(i['img'])):
            fr = Frame(
                url = i['img'][key],
                external = True,
                film = f,
                sequence = key,
            )
            fr.save()
        
        # Ccheck if image exists for cover image
        img = None
        try:
            img = Image.objects.filter(url=i['img-cover'])[0]
        except:
            img = Image(url = i['img-cover'])
            img.save()
            print('image not found:', i['img-cover'])
        
        # Create cover image
        film_cover = FilmCover(film=f, image=img)
        film_cover.save()




In [None]:
test = Film.objects.filter(id=2344)[0]

print(test.name)

## Faster `bulk_create()` Version

### Create Films

Creates a sorted array where index 0 is empty

In [10]:
films = [0]*int(len(obj))
# Create films
for i in obj:
    index = int('0'+i['id'])-1
    f = Film(
        id= index,
        name = i['name'],
        url = i['url'],
        studio = i['studio'],
        year = int('0'+i['year']),
        color = i['color'],
        type = i['type'],
        index = i['index'],
        number = i['number'],
        film = i['film'],
        quality = i['quality'],
        description = i['description'],
    )

    films[index] = f

### Create Tags and Categories

In [11]:

categories = ['author','artist','designer','editor','artistic-editor','photographer','categories']
categories = {categories[k]: {'obj': {}, 'tags': {}} for k in range(len(categories))}

for c, cat in categories.items():
    new_c = Category(name = c)
    categories[c]['obj'] = new_c


for i in obj:
    index = int('0'+i['id'])-1
    for c, cat in categories.items():
        # new_cat = Category(name = c)
        for tag in i[c]:
            if tag not in categories[c]['tags'] and tag != '':
                t = Tag(name = tag,category = cat['obj'])
                categories[c]['tags'][tag] = t

for c, cat in categories.items():
    print(f"{c} has {len(cat['tags'])} tags.")

print(films[123])

author has 2432 tags.
artist has 952 tags.
designer has 122 tags.
editor has 214 tags.
artistic-editor has 41 tags.
photographer has 208 tags.
categories has 8 tags.
Умей действовать в очаге поражения


### Create Frames

In [12]:
frames = []
for i in obj:
    for key in range(len(i['img'])):
        index = int('0'+i['id'])-1
        fr = Frame(
            url = i['img'][key],
            external = True,
            film = films[index],
            sequence = key,
        )
        frames.append(fr)

print(frames[12313].film)
        

Паучок пилот


### Create Image covers

In [13]:
covers = [0]*int(len(obj))
for i in obj:
    index = int('0'+i['id'])-1
    covers[index] = Image(url = i['img-cover'])

print(covers[123].url)

https://diafilmy.su/uploads/posts/2011-03/1299943252_5.jpg


In [None]:
Film.objects.bulk_create(objs=films, batch_size=500)

In [14]:
Film.objects.bulk_create(objs=films, batch_size=500)

ревнование и режим экономии, бережное отношение к народному добру>,
 <Film: Москва и москвичи>,
 <Film: Стимулирование научно технического прогресса>,
 <Film: Наша социалистическая экономика. Экономическая стратегия партии>,
 <Film: В гостях у юной хозяйки>,
 <Film: Общие закономерности и особенности развития социализма в различных странах>,
 <Film: Современная эпоха - эпоха перехода от капитализма к социализму>,
 <Film: Одиссея юного капитана>,
 <Film: Фотоэкран №4 1974г.>,
 <Film: Лисичка-сестричка и серый волк>,
 <Film: Жил-был ёжик>,
 <Film: У солнышка в гостях>,
 <Film: Белозубка>,
 <Film: Сампо-Лопаренок>,
 <Film: Ну, погоди! 1-й выпуск>,
 <Film: Социалистический путь природопользования>,
 <Film: Два вола с горошину>,
 <Film: Происхождение человека>,
 <Film: Огненный цветок>,
 <Film: Приёмы рубки, правки и гибки металла. Часть 1>,
 <Film: Товарное производство при капитализме>,
 <Film: Большая семья>,
 <Film: Ну, погоди! 7-й выпуск>,
 <Film: Артиллерист-чапаевец>,
 <Film: Фотоэкр