# Домашняя работа

Сделайте mapper и reducer, чтобы посчитать среднее и дисперсию оценок за фильм.

Реализация через цикл (предпогаем, что мы не знаем сколько у нас фильмов):

In [3]:
!pip install opendatasets



In [4]:
import opendatasets as od
import pandas as pd
import time
from pathlib import Path

In [5]:
dataset_path = Path('imdb-user-reviews', 'song_lyrics.csv')
if not dataset_path.is_file():
    od.download('https://www.kaggle.com/datasets/sadmadlad/imdb-user-reviews')

Skipping, found downloaded files in ".\imdb-user-reviews" (use force=True to force download)


In [6]:
import json
from functools import reduce


n, mean, M2 = 0, 0.0, 0
for path in Path('imdb-user-reviews').glob('**/*'):
    if path.is_file() and path.suffix == '.json':
        with open(path, 'r') as f:
            info = json.load(f)
        score = float(info['movieIMDbRating'])
        n += 1
        delta = score - mean
        mean += delta / n
        M2 += delta * (score - mean)

print(mean, (M2 / n) ** (1/2))

8.03 1.0517128885774865


In [7]:
def mapper(path):
    score = 0
    if path.is_file() and path.suffix == '.json':
        with open(path, 'r') as f:
            info = json.load(f)
        return(1,float(info['movieIMDbRating']),None)
    return None


def reducer(score1, score2):
    if score1 is None and score2 is None:
        return None
    elif score1 is None:
        return score2
    elif score2 is None:
        return score1
    else:
        scores = []
        if score1[2] == None:
            n, mean, M2 = 0, 0.0, 0
            scores.append(score1[1])
        else:
            n, mean, M2 = score1
        scores.append(score2[1])
        
        for score in scores:
            n += 1
            delta = score - mean
            mean += delta / n
            M2 += delta * (score - mean)
        return n, mean, M2

In [8]:
print(*map(mapper, Path('imdb-user-reviews').glob('**/*')))

None None None None None None None None None None (1, 8.4, None) None (1, 8.8, None) None (1, 7.4, None) None (1, 8.4, None) None (1, 5.2, None) None (1, 8.9, None) None (1, 8.3, None) None (1, 8.0, None) None (1, 9.0, None) None (1, 7.9, None) None


In [9]:
%%time
n, mean, M2 = reduce(reducer, map(mapper, Path('imdb-user-reviews').glob('**/*')))
print(n, mean, (M2 / n) ** (1/2))

10 8.03 1.0517128885774865
CPU times: total: 0 ns
Wall time: 5.01 ms


# Parallel

In [10]:
from joblib import Parallel, delayed

In [11]:
%%time
N = 2
n, mean, M2 = reduce(reducer, Parallel(n_jobs=N)(delayed(mapper)(path) for path in Path('imdb-user-reviews').glob('**/*')))
print(mean, (M2 / n) ** (1/2))

8.03 1.0517128885774865
CPU times: total: 0 ns
Wall time: 1.06 s


In [12]:
%%time
N = 8
n, mean, M2 = reduce(reducer, Parallel(n_jobs=N)(delayed(mapper)(path) for path in Path('imdb-user-reviews').glob('**/*')))
print(mean, (M2 / n) ** (1/2))

8.03 1.0517128885774865
CPU times: total: 78.1 ms
Wall time: 2.12 s


In [13]:
%%time
N = 16
n, mean, M2 = reduce(reducer, Parallel(n_jobs=N)(delayed(mapper)(path) for path in Path('imdb-user-reviews').glob('**/*')))
print(mean, (M2 / n) ** (1/2))

8.03 1.0517128885774865
CPU times: total: 31.2 ms
Wall time: 335 ms


# Chunkify (Почему-то работает не так как надо)

In [14]:
import multiprocessing
multiprocessing.cpu_count()

4

In [34]:
def chunkify(mapInfo, number_of_chunks=2):
    list_of_strings = [x for x in mapInfo if x.is_file()]
    list_of_strings=[str(x).replace('WindowsPath(','').replace('\\','/') for x in list_of_strings]
    step = len(list_of_strings) // number_of_chunks
    if step != 0:
        for i in range(0, len(list_of_strings), step):
            yield list_of_strings[i : i + step]
    else:
        yield list_of_strings  # Генераторы
        

def chunks_mapper(chunk):
    mapped_chunk = list(map(lambda x: mapper(Path(x)), chunk))
    print(*mapped_chunk)
    return reduce(reducer, mapped_chunk, None)


In [35]:
%%time
data_chunks = chunkify(Path('imdb-user-reviews').glob('**/*'),number_of_chunks=3)
#step 1:
mapped = map(chunks_mapper, data_chunks)
#step 2:
reduced = reduce(reducer, mapped,None)
print(reduced)

(1, 8.4, None) None (1, 8.8, None) None (1, 7.4, None) None
(1, 8.4, None) None (1, 5.2, None) None (1, 8.9, None) None
(1, 8.3, None) None (1, 8.0, None) None (1, 9.0, None) None
(1, 7.9, None) None
(6, 8.072222222222223, 1.5764814814814825)
CPU times: total: 15.6 ms
Wall time: 7.96 ms
