# MapReduce

### Goal: find the 5 most frequent initials in baby names

### Step 1: distribute data


In [1]:
import pandas as pd

# Read the year-2000 name records. The file is treated as having no header
# row: the column labels are supplied here and the first line is kept as data.
df = pd.read_csv(
    '../01_Week_1_Pandas/yob2000.txt',
    header=None,
    names=['name', 'gender', 'count'],
)
df.shape

(29769, 3)

In [2]:
# create partitions: four consecutive row ranges of the full frame
# (a stop of None means "through the last row")
df1, df2, df3, df4 = (
    df.iloc[start:stop]
    for start, stop in ((0, 7500), (7500, 15000), (15000, 22500), (22500, None))
)

# shape of every partition, in order
tuple(part.shape for part in (df1, df2, df3, df4))

((7500, 3), (7500, 3), (7500, 3), (7269, 3))

In [3]:
# count how often each first letter occurs in the first two partitions
s1, s2 = (
    part['name'].str[0].value_counts()
    for part in (df1, df2)
)

In [10]:
# Display the per-initial counts computed for the first partition (df1).
s1

A    975
K    788
M    757
J    617
S    609
C    537
L    410
D    406
T    382
R    311
B    281
E    250
N    242
H    135
I    131
G    129
Z    100
P     97
Y     94
F     77
V     70
O     35
W     22
Q     19
X     18
U      8
Name: name, dtype: int64

### Step 2: write a mapper function
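Count all initials in one partition and return the counts (here a pandas Series, which plays the role of a dict mapping initial to count)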

In [11]:
def mapper(df: pd.DataFrame) -> pd.Series:  # type annotations
    """Map step: tally the first letters of the 'name' column of one partition.

    Parameters
    ----------
    df : pd.DataFrame
        A partition that has a string column called 'name'.

    Returns
    -------
    pd.Series
        One entry per distinct initial, holding how many rows of the
        partition have a name starting with that letter (ordered as
        ``value_counts`` returns them, most frequent first).
    """
    initials = df['name'].str.get(0)  # first character of every name
    return initials.value_counts()
    
# also see: mypy (a pylint for type annotations)

### Step 3: write a reducer function
Take two mapped results (the Series returned by the mapper) and merge them by adding up the counts per initial

In [12]:
def reduce(a: pd.Series, b: pd.Series) -> pd.Series:
    """Reduce step: merge two initial-count Series by summing per initial.

    A plain ``a + b`` aligns on the index and produces NaN for every label
    that occurs in only one operand, so an initial absent from one partition
    would lose its count. Missing labels are therefore treated as 0.

    Parameters
    ----------
    a, b : pd.Series
        Counts indexed by initial, e.g. the output of ``mapper``.

    Returns
    -------
    pd.Series
        Counts for the union of both indexes. When both inputs have an
        integer dtype the result has an integer dtype as well.
    """
    merged = a.add(b, fill_value=0)

    # Aligning two different indexes goes through float (NaN placeholders);
    # convert back so that counts stay integers, as they do for equal indexes.
    both_integer = (
        pd.api.types.is_integer_dtype(a.dtype)
        and pd.api.types.is_integer_dtype(b.dtype)
    )
    if both_integer and not pd.api.types.is_integer_dtype(merged.dtype):
        merged = merged.astype('int64')
    return merged

###  Step 4:  Call everything
Call the functions so that you obtain one combined result (a single Series of counts) in the end.

In [13]:
# Tree-shaped reduction: map and merge each pair of partitions first,
# then merge the two partial results into the final counts.
reduce(
    *(
        reduce(mapper(left), mapper(right))
        for left, right in ((df1, df2), (df3, df4))
    )
)

A    3316
B    1100
C    1838
D    2237
E     996
F     344
G     612
H     598
I     476
J    2791
K    2695
L    1354
M    2481
N    1046
O     273
P     390
Q     148
R    1316
S    2240
T    2025
U      66
V     269
W     199
X      66
Y     426
Z     467
Name: name, dtype: int64