# First names processing

## pyspark initialization

In [1]:
# Initialize pyspark
import os

def is_spark_initialized():
    """Tell whether a usable ``sc`` variable already exists in the global namespace.

    Returns:
        bool: True when the global name ``sc`` is defined and truthy,
        False when it is undefined or falsy.
    """
    try:
        return bool(sc)
    except NameError:
        # `sc` has not been defined yet. Only this case is treated as
        # "not initialized"; any other exception (e.g. KeyboardInterrupt)
        # is allowed to propagate instead of being silently swallowed.
        return False

if not is_spark_initialized():
    # No SparkContext yet (the notebook was not launched from pyspark):
    # run the pyspark shell script found under SPARK_HOME in this namespace.
    # /!\ Network interface must be active in order to initialize SparkContext
    os.environ["SPARK_LOCAL_IP"] = "127.0.0.1"
    execfile(
        os.path.join(os.environ["SPARK_HOME"], "python", "pyspark", "shell.py")
    )

Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /__ / .__/\_,_/_/ /_/\_\   version 2.1.0
      /_/

Using Python version 2.7.10 (default, Dec  5 2015 11:51:52)
SparkSession available as 'spark'.


In [2]:
# Check sc variable
# is_spark_initialized() is used instead of testing `sc` directly: evaluating an
# undefined `sc` raises NameError, so the warning below could never be printed.
if is_spark_initialized():
    print("pyspark for Spark %s is initialized: %s" % (sc.version, sc.uiWebUrl))
else:
    print("/!\\ pyspark not initialized")

pyspark for Spark 2.1.0 is initialized: http://127.0.0.1:4040


## First names

### Data source

Data source used: http://www.data.gouv.fr/fr/datasets/fichier-des-prenoms-edition-2016/

In [3]:
# Directory holding the raw data set, directory receiving the Spark output,
# and name of the raw file.
ds_root = "data/"
ds_staging = "staging/"
ds_filename = "dpt2015.txt"

# Export both directories as environment variables for the shell cells below.
os.environ.update({
    "FIRSTNAMES_ROOT": ds_root,
    "FIRSTNAMES_STAGING": ds_staging,
})

# Raw file as an RDD of unicode strings.
dpt2015 = sc.textFile(os.path.join(ds_root, ds_filename), use_unicode=True)

### Helpers

In [4]:
from unidecode import unidecode


def decode_firstname(firstname):
    """Normalize a first name: pass it through unidecode, encode the result
    as ASCII, then lowercase it."""
    transliterated = unidecode(firstname)
    ascii_name = transliterated.encode("ascii")
    return ascii_name.lower()


def decode_gender(gender):
    """Convert the raw gender token into its integer code."""
    gender_code = int(gender)
    return gender_code


def accumulate_gender(acc, gender):
    """
    Merge a gender code into the code accumulated so far.

    Codes: 1 = male, 2 = female, 3 = both.

    - no accumulated value yet (falsy acc) => gender
    - acc equal to gender                  => acc
    - anything else                        => 3 (both)
    """
    if not acc:
        return gender
    if acc == gender:
        return acc
    return 3

### Prepare data source

1. Remove header.
2. Split each line into tokens.
3. Remove unwanted records: rows whose first name is `_PRENOMS_RARES` or is only one character long.

In [5]:
# line => str[]
dpt2015_no_header = (
    dpt2015
    # 1. Drop the header line.
    .filter(lambda line: not line.startswith("sexe\tpreusuel"))
    # 2. Tokenize on whitespace.
    # NOTE(review): split() breaks on spaces as well as tabs; if a field can
    # contain a space, split("\t") would be needed -- confirm against the data.
    .map(lambda line: line.split())
    # 3. Keep only records with a usable first name. The length guard comes first
    #    so that blank or truncated lines are dropped instead of raising IndexError.
    .filter(lambda tokens: len(tokens) > 1
            and tokens[1] != "_PRENOMS_RARES"
            and len(tokens[1]) > 1)
)

### Generate first names

In [6]:
# str[] => (firstname, None)
# Every record is keyed by its decoded first name; reducing by key then leaves
# one (firstname, None) pair per distinct first name.
firstnames = (
    dpt2015_no_header
    .map(lambda tokens: (decode_firstname(tokens[1]), None))
    .reduceByKey(lambda left, right: None)
)

In [None]:
%%!
# Remove the output of a previous run before it is written again.
# -f: stay silent when the directory does not exist yet (first run).
# The variable is quoted so a path containing spaces is not word-split.
rm -rf -- "$FIRSTNAMES_STAGING"firstnames.txt

In [8]:
# Write the distinct first names, sorted, as a single part file.
# numPartitions=1 is passed to sortBy itself: sortBy chooses its own number of
# output partitions, so a coalesce(1) placed before it does not guarantee a
# single output file (e.g. when spark.default.parallelism is configured).
(
    firstnames
    .sortBy(lambda pair: pair[0], numPartitions=1)
    .map(lambda pair: pair[0])
    .saveAsTextFile(os.path.join(ds_staging, "firstnames.txt"))
)

### Generate first names with gender

In [9]:
# str[] => (firstname, gender)
# Key each record by its decoded first name with its integer gender code, then
# merge the codes of identical first names with accumulate_gender.
firstnames_with_gender = (
    dpt2015_no_header
    .map(lambda tokens: (decode_firstname(tokens[1]), decode_gender(tokens[0])))
    .reduceByKey(accumulate_gender)
)

In [None]:
%%!
# Remove the output of a previous run before it is written again.
# -f: stay silent when the directory does not exist yet (first run).
# The variable is quoted so a path containing spaces is not word-split.
rm -rf -- "$FIRSTNAMES_STAGING"firstnames_with_gender.txt

In [11]:
# Write one "<firstname>\t<gender>" line per first name, sorted by first name,
# as a single part file (part-00000 is the only file copied afterwards).
# numPartitions=1 is passed to sortBy itself: sortBy chooses its own number of
# output partitions, so a coalesce(1) placed before it does not guarantee a
# single output file (e.g. when spark.default.parallelism is configured).
(
    firstnames_with_gender
    .sortBy(lambda pair: pair[0], numPartitions=1)
    .map(lambda pair: "%s\t%d" % (pair[0], pair[1]))
    .saveAsTextFile(os.path.join(ds_staging, "firstnames_with_gender.txt"))
)

In [None]:
%%!
# Copy the single part file of the staged output to the data directory.
# Both variables are quoted so paths containing spaces are not word-split.
cp -- "$FIRSTNAMES_STAGING"firstnames_with_gender.txt/part-00000 "$FIRSTNAMES_ROOT"firstnames.txt

## Cleaning

In [None]:
%%!
# Final cleanup: remove both staging output directories.
# -f: stay silent when one of them is already gone.
# The variable is quoted so a path containing spaces is not word-split.
rm -rf -- "$FIRSTNAMES_STAGING"firstnames.txt
rm -rf -- "$FIRSTNAMES_STAGING"firstnames_with_gender.txt