# Diafilm parsing



## Fetch diafilms

Download each diafilm page listed in the CSV and save it as an HTML file for later parsing.

In [None]:
import csv
import requests
import time

In [None]:
# Download the page of every diafilm listed in the CSV and store it as
# html/index-<id>.html so the parsing step can run offline.
REQUEST_DELAY_SECONDS = 0.5    # pause between requests to avoid hammering the site
REQUEST_TIMEOUT_SECONDS = 30   # without a timeout a stalled connection hangs forever

# newline='' is what the csv module asks for so quoted line breaks parse correctly.
with open('data/diafilms-v1.csv', mode='r', newline='') as file:
    csv_file = csv.DictReader(file)

    for row in csv_file:
        time.sleep(REQUEST_DELAY_SECONDS)
        print(f"Downloading {row['name']} page from {row['url']}")
        try:
            r = requests.get(row["url"], timeout=REQUEST_TIMEOUT_SECONDS)
            # Do not store 4xx/5xx error pages: the parser expects a real film page.
            r.raise_for_status()
        except requests.RequestException as error:
            # One bad URL must not abort the whole scrape; report it and move on.
            print(f"Failed to download {row['url']}: {error}")
            continue
        with open(f'html/index-{row["id"]}.html', 'w') as f:
            f.write(r.text)
        

# Parse webpages 

In [None]:
import glob
import re
from bs4 import BeautifulSoup


In [None]:

# Read all pages in html folder and extract one metadata record per page.
pages = glob.glob("html/*.html")

site_url = 'https://diafilmy.su'
new_data = []

# Links pointing into the site's category tree; compiled once, dot escaped so it
# only matches a literal "diafilmy.su".
category_link = re.compile(r'diafilmy\.su/diafilmy/')

for page_path in pages:
    with open(page_path, 'r') as page:
        soup = BeautifulSoup(page.read(), 'html.parser')

    slide = soup.find(class_='cycle-slideshow')
    txt_with_category = soup.find(class_='berrorstxt')

    # The record is deliberately NOT named `dict`: that would shadow the builtin
    # for every later cell running in the same kernel.
    record = {
        # First run of digits in the path, i.e. the <id> of html/index-<id>.html
        "id": re.findall(r'\d+', page_path)[0],
        # Gallery image paths are site-relative, so prefix them with the site URL
        "img": [site_url + img.get('src') for img in slide.find_all('img')],
        "img-cover": soup.find(property='og:image').get('content'),
        "description": soup.find(property='og:description').get('content'),
        "categories": [
            a.get_text().strip()
            for a in txt_with_category.find_all(href=category_link)
        ],
    }

    new_data.append(record)

# Show one parsed record as a sanity check (index 5 is arbitrary); skipped when
# fewer pages were parsed so a small run does not end in an IndexError.
if len(new_data) > 5:
    print(new_data[5])


## Save data to JSON

In [None]:
import json 


In [None]:
# Persist the scraped records so later cells can reload them without re-parsing.
with open('data/post-metadata.json', 'w+') as out_file:
    out_file.write(json.dumps(new_data))

## GlueData™: merge the CSV metadata into the scraped records


In [12]:
# Merge the CSV catalogue into the scraped records: every record in `obj` gains
# the CSV fields of the row carrying the same id.

# (CSV column index, JSON key, is a comma-separated list of names)
CSV_FIELDS = [
    (1, "name", False),
    (2, "url", False),
    (3, "studio", False),
    (4, "year", False),
    (5, "color", False),
    (6, "type", False),
    (7, "frames", False),
    (8, "index", False),
    (9, "number", False),
    (10, "author", True),
    (11, "artist", True),
    (12, "designer", True),
    (13, "editor", True),
    (14, "artistic-editor", True),
    (15, "photographer", True),
    (16, "film", False),
    (17, "quality", False),
]


def _as_int(value):
    """Return int(value), or None when value is not a valid integer."""
    try:
        return int(value)
    except (TypeError, ValueError):
        return None


# read and parse JSON file (context manager so the handle is closed again)
with open('data/post-metadata.json', mode='r') as new_json:
    obj = json.load(new_json)

# read and parse CSV file and use list instead of dictionary
with open('data/diafilms-v1.csv', mode='r', newline='') as csv_file:
    csvr = list(csv.reader(csv_file))

# Row 0 of the CSV holds the field names; print them with their column index.
if csvr:
    print("ID \t Fieldname")
    for j, fieldname in enumerate(csvr[0]):
        print(f"{j} \t {fieldname}")

# Index the data rows by the value of their id column. Looking rows up by list
# position only works while the ids happen to equal the row numbers.
rows_by_id = {}
for row in csvr[1:]:
    row_id = _as_int(row[0]) if row else None
    if row_id is not None:
        rows_by_id[row_id] = row

for i, record in enumerate(obj, start=1):
    row = rows_by_id.get(_as_int(record.get("id")))
    if row is None:
        # No CSV row carries this record's id.
        print("BAD INDEX", i)
        continue
    try:
        for column, key, is_list in CSV_FIELDS:
            value = row[column]
            record[key] = (
                [x.strip() for x in value.split(',')] if is_list else value
            )
    except IndexError:
        # The CSV row has fewer columns than expected.
        print("BAD INDEX", i)


ID 	 Fieldname
0 	 id
1 	 name
2 	 url
3 	 studio
4 	 year
5 	 color
6 	 type
7 	 frames
8 	 index
9 	 number
10 	 author
11 	 artist
12 	 designer
13 	 editor
14 	 artistic-editor
15 	 photographer
16 	 film
17 	 quality


In [None]:
# Total number of gallery images across all records.
img_count = sum(len(record["img"]) for record in obj)

print("Number of images", img_count)

## Save to new file 

In [None]:
# Write the full dataset plus a sample holding only the first 100 records.
obj_mini = obj[0:100]
outputs = (
    ('data/post-metadata-all.json', obj),
    ('data/post-metadata-all-mini.json', obj_mini),
)
for path, payload in outputs:
    with open(path, 'w+') as f:
        json.dump(payload, f)

# Database Import

In [1]:
import django_for_jupyter
import json
import csv
import re
from django.contrib.auth import get_user_model
from diafilms.models import Film, Image, Frame, FilmCover
from posts.models import TagCategory, Tag, GroupCategory
# !pip install transliterate
from transliterate import translit

# Get User: every imported film is attributed to this account.
User = get_user_model()
user = User.objects.get(username='diafilm')

# read and parse full JSON file (context manager so the handle is closed again)
with open('data/post-metadata-all.json', mode='r') as new_json:
    obj = json.load(new_json)

# Print one film name as a sanity check (index 112 is arbitrary); skipped when
# the loaded file has fewer records so a small dataset does not raise IndexError.
if len(obj) > 112:
    print(obj[112]['name'])

# Film.objects.all().delete()
# Image.objects.all().delete()
# Frame.objects.all().delete()
# FilmCover.objects.all().delete()

def translitSlug(string):
    """Build a slug from `string`.

    The text is passed through `translit(..., 'ru', reversed=True)`, lowercased,
    and its whitespace-separated words are joined with hyphens.
    """
    transliterated = translit(string, 'ru', reversed=True)
    words = transliterated.lower().split()
    return '-'.join(words)


Основы композиции в художественном конструировании


## Import into Models

In [4]:
# List every record that has no category at all.
uncategorised = (film for film in obj if len(film['categories']) == 0)
for film in uncategorised:
    print(film['id'], film['name'], film['categories'])

# defualt_gr = GroupCategory(
#     name='Диафильмы^2',
#     slug='diafilms2',
#     description='Диафильмы^2',
# )

# defualt_gr.save()
# test_gr = GroupCategory.objects.filter(name='Диафильмы^2')
# print(test_gr)


<QuerySet [<GroupCategory: Диафильмы^2>]>


In [2]:
# Import every record of `obj` that is not in the database yet: the Film itself,
# its groups (site categories), its people tags, its frames and its cover image.
from django.db import transaction

# Fallback group. get_or_create keeps the cell re-runnable, whereas create()
# would insert the group again on every execution.
# NOTE(review): nothing below attaches this group to a film (the fallback for
# films without categories is disabled), so it is currently only created.
defualt_gr, _default_created = GroupCategory.objects.get_or_create(
    name='Диафильмы^2',
    defaults={
        'slug': 'diafilms2',
        'description': 'Диафильмы^2',
    },
)

# TagCategory name -> key of the record holding the list of people for it.
TAG_CATEGORY_KEYS = {
    'author': 'author',
    'artist': 'artist',
    'designer': 'designer',
    'editor': 'editor',
    'artistic_editor': 'artistic-editor',
    'photographer': 'photographer',
}

for i in obj:
    # Films that already exist are left untouched.
    if Film.objects.filter(id=i['id']).exists():
        continue

    print(f'Adding #{i["id"]} - {i["name"]}')

    # Fall back to the film name when the scraped description is empty.
    text_not_empty = i['description']
    if len(text_not_empty) == 0:
        text_not_empty = i['name']

    # One transaction per film: if any step fails, the Film row is rolled back
    # too. Otherwise a half-imported film would stay in the database and the
    # exists() check above would skip it on every later run.
    with transaction.atomic():
        f = Film.objects.create(
            author=user,
            # The '0' prefix turns an empty string into 0 instead of raising.
            id=int('0' + i['id']),
            name=i['name'],
            url=i['url'],
            studio=i['studio'],
            year=int('0' + i['year']),
            color=i['color'],
            film_type=i['type'],
            index=i['index'],
            number=i['number'],
            film_name=i['film'],
            quality=i['quality'],
            text=text_not_empty,
        )

        # Groups: one GroupCategory per site category, created on first use.
        for group in i['categories']:
            gr, group_created = GroupCategory.objects.get_or_create(
                name=group,
                defaults={'slug': translitSlug(group)},
            )
            if group_created:
                print(f'INFO: Creating new PostCategory {group}')
            f.groups.add(gr)

        # Tags: one Tag per person, grouped under a TagCategory per role.
        for c, record_key in TAG_CATEGORY_KEYS.items():
            select_cat, _cat_created = TagCategory.objects.get_or_create(
                name=c,
                defaults={'slug': translitSlug(c)},
            )
            for tag in i[record_key]:
                if tag == "":
                    # Empty CSV cells split into [''], which is not a person.
                    continue
                select_tag, _tag_created = Tag.objects.get_or_create(
                    name=tag,
                    category=select_cat,
                    defaults={'slug': translitSlug(tag)},
                )
                f.tags.add(select_tag)
            # NOTE(review): this only sets an attribute on the in-memory object;
            # `f` is never saved afterwards, so nothing is persisted. Confirm
            # whether Film is meant to have a category at all.
            f.category = select_cat

        # Create frames, numbered in gallery order starting at 0.
        for sequence, frame_url in enumerate(i['img']):
            Frame.objects.create(
                url=frame_url,
                external=True,
                film=f,
                sequence=sequence,
            )

        # Reuse an existing Image for the cover when one has the same URL.
        # first() returns None when there is no match, so no exception has to
        # be swallowed to detect that case.
        img = Image.objects.filter(url=i['img-cover']).first()
        if img is None:
            img = Image.objects.create(url=i['img-cover'])
            print('image not found:', i['img-cover'])

        # Create cover image
        FilmCover.objects.create(
            film=f,
            image=img,
        )


Adding #4032 - Скульптор Федот Шубин


ValueError: "<Film: Скульптор Федот>" needs to have a value for field "id" before this many-to-many relationship can be used.

## Check

In [11]:
# Print every film, followed by one "<category> <name>" line per attached tag.
film_test = Film.objects.all()

for film in film_test:
    heading = ('\n', film.id, film.name)
    print(*heading)
    for tag in film.tags.all():
        print(tag.category, tag.name)



 5472 Почему рыбы стали нарядными
author Мишутин Н.
artist Афанасьева Н.
editor Семибратова Т.
artistic_editor Морозов А.
categories Сказки

 5292 Три мушкетера. Часть 4
author Дюма А.
artist Белей И.
editor Попова Л.
categories Повести и рассказы

 5071 Почему на севере ночь длинная?
author Наумов Е.
artist Кульков В.

 4975 Защита подземных кабелей от электрокоррозии
categories Образовательные и учебные
author Васильев В.
author Мамичев В.
designer Гиричева Ж.
editor Махмудбек Ю.
photographer Панова А.

 4351 Берберийский лев
artistic_editor Морозов А.
categories Повести и рассказы
author Мошковский А.
artist Лемкуль Ф.
editor Гуревич Л.

 4226 Русская народная игрушка
author Алёхин А.
designer Степанов Н.
editor Книжникова Л.
photographer Перевезенцева Т.
categories Образовательные и учебные

 4208 Новогодняя ёлка
editor Книжникова Л.
categories Образовательные и учебные
artistic_editor Морозов А.
author Суворова Г.
artist Самойлов В.

 4032 Скульптор Федот Шубин
author Нейман М.
d