# Домашняя работа

Сделайте mapper и reducer, чтобы посчитать среднее и дисперсию оценок за фильм.

Реализация через цикл (предпогаем, что мы не знаем сколько у нас фильмов):

In [1]:
!pip install opendatasets



In [1]:
import opendatasets as od
import pandas as pd
import time
from pathlib import Path

In [2]:
dataset_path = Path('imdb-user-reviews', 'song_lyrics.csv')
if not dataset_path.is_file():
    od.download('https://www.kaggle.com/datasets/sadmadlad/imdb-user-reviews')

Skipping, found downloaded files in ".\imdb-user-reviews" (use force=True to force download)


In [3]:
import json
from functools import reduce


n, mean, M2 = 0, 0.0, 0
for path in Path('imdb-user-reviews').glob('**/*'):
    if path.is_file() and path.suffix == '.json':
        with open(path, 'r') as f:
            info = json.load(f)
        score = float(info['movieIMDbRating'])
        n += 1
        delta = score - mean
        mean += delta / n
        M2 += delta * (score - mean)

print(mean, (M2 / n) ** (1/2))

8.03 1.0517128885774865


In [71]:
def mapper(path):
    score = 0
    if path.is_file() and path.suffix == '.json':
        with open(path, 'r') as f:
            info = json.load(f)
        return(1,float(info['movieIMDbRating']),None)
    return None


def reducer(score1, score2):
    if score1 is None and score2 is None:
        return None
    elif score1 is None:
        return score2
    elif score2 is None:
        return score1
    else:
        scores = []
        if score1[2] == None:
            n, mean, M2 = 0, 0.0, 0
            scores.append(score1[1])
        else:
            n, mean, M2 = score1
        scores.append(score2[1])
        
        for score in scores:
            n += 1
            delta = score - mean
            mean += delta / n
            M2 += delta * (score - mean)
        return n, mean, M2

In [72]:
print(*map(mapper, Path('imdb-user-reviews').glob('**/*')))

None None None None None None None None None None (1, 8.4, None) None (1, 8.8, None) None (1, 7.4, None) None (1, 8.4, None) None (1, 5.2, None) None (1, 8.9, None) None (1, 8.3, None) None (1, 8.0, None) None (1, 9.0, None) None (1, 7.9, None) None


In [76]:
%%time
n, mean, M2 = reduce(reducer, map(mapper, Path('imdb-user-reviews').glob('**/*')))
print(n, mean, (M2 / n) ** (1/2))

10 8.03 1.0517128885774865
CPU times: total: 0 ns
Wall time: 4.99 ms


# Parallel

In [80]:
from joblib import Parallel, delayed

In [83]:
%%time
N = 2
n, mean, M2 = reduce(reducer, Parallel(n_jobs=N)(delayed(mapper)(path) for path in Path('imdb-user-reviews').glob('**/*')))
print(mean, (M2 / n) ** (1/2))

8.03 1.0517128885774865
CPU times: total: 78.1 ms
Wall time: 1.27 s


In [81]:
%%time
N = 8
n, mean, M2 = reduce(reducer, Parallel(n_jobs=N)(delayed(mapper)(path) for path in Path('imdb-user-reviews').glob('**/*')))
print(mean, (M2 / n) ** (1/2))

8.03 1.0517128885774865
CPU times: total: 0 ns
Wall time: 28.9 ms


In [82]:
%%time
N = 16
n, mean, M2 = reduce(reducer, Parallel(n_jobs=N)(delayed(mapper)(path) for path in Path('imdb-user-reviews').glob('**/*')))
print(mean, (M2 / n) ** (1/2))

8.03 1.0517128885774865
CPU times: total: 46.9 ms
Wall time: 115 ms


# Multiprocessing

In [84]:
import multiprocessing
multiprocessing.cpu_count()

4

In [102]:
def chunkify(mapInfo, number_of_chunks=2):
    step = len(mapInfo)
    if step != 0:
        for i in range(0, len(mapInfo), step):
            yield mapInfo[i : i + step]
    else:
        yield mapInfo  # Генераторы

def chunks_mapper(chunk):
    mapped_chunk = map(mapper, chunk)
    mapped_chunk = zip(chunk, mapped_chunk)
    return reduce(reducer, mapped_chunk)


In [109]:
%%time
data_chunks = chunkify([*Path('imdb-user-reviews').glob('**/*')],number_of_chunks=2)
print(*data_chunks)
#step 1:
mapped = map(chunks_mapper, data_chunks)
print(*mapped)
#step 2:
# reduced = reduce(reducer, mapped)
# print(reduced)

[WindowsPath('imdb-user-reviews/Avengers Endgame'), WindowsPath('imdb-user-reviews/Forrest Gump'), WindowsPath('imdb-user-reviews/John Wick Chapter 3  Parabellum'), WindowsPath('imdb-user-reviews/Joker'), WindowsPath('imdb-user-reviews/Morbius'), WindowsPath('imdb-user-reviews/Pulp Fiction'), WindowsPath('imdb-user-reviews/SpiderMan No Way Home'), WindowsPath('imdb-user-reviews/The Avengers'), WindowsPath('imdb-user-reviews/The Dark Knight'), WindowsPath('imdb-user-reviews/Thor Ragnarok'), WindowsPath('imdb-user-reviews/Avengers Endgame/metadata.json'), WindowsPath('imdb-user-reviews/Avengers Endgame/movieReviews.csv'), WindowsPath('imdb-user-reviews/Forrest Gump/metadata.json'), WindowsPath('imdb-user-reviews/Forrest Gump/movieReviews.csv'), WindowsPath('imdb-user-reviews/John Wick Chapter 3  Parabellum/metadata.json'), WindowsPath('imdb-user-reviews/John Wick Chapter 3  Parabellum/movieReviews.csv'), WindowsPath('imdb-user-reviews/Joker/metadata.json'), WindowsPath('imdb-user-reviews