In [2]:
import os

import pyspark
from pyspark.sql import SparkSession
# Connection string handed to the MongoDB Spark connector.
# The MONGO_URI environment variable takes precedence so that real credentials
# never have to be committed with the notebook; the fallback is the original
# development value, which keeps existing setups working unchanged.
mongo_uri = os.environ.get(
    "MONGO_URI",
    "mongodb://admin:mongopw@mongo:27017/admin?authSource=admin",
)

# Local-mode Spark session. The connector is requested through
# spark.jars.packages, and the same URI is used for both input and output.
spark = (
    SparkSession.builder
    .master("local")
    .appName("jupyter-pyspark")
    .config("spark.mongodb.input.uri", mongo_uri)
    .config("spark.mongodb.output.uri", mongo_uri)
    .config("spark.jars.packages", "org.mongodb.spark:mongo-spark-connector_2.12:3.0.1")
    .getOrCreate()
)
sc = spark.sparkContext
sc.setLogLevel("ERROR")  # only ERROR-level log messages reach the cell output

In [77]:
# 1. Read the plantation and individual records, then inner-join them on the sheet id.
plantations_path = "file:///home/jovyan/datasets/british-empire/plantations/plantations.json"
individuals_path = "file:///home/jovyan/datasets/british-empire/individuals/*.json"

# The plantation-side key is renamed so the two join keys keep distinct column names.
plan = spark.read.json(plantations_path).withColumnRenamed("Sheet Id*", "P Sheet Id*")
ind = spark.read.json(individuals_path)

comb = plan.join(ind, on=plan["P Sheet Id*"] == ind["Sheet Id*"], how="inner")
comb.printSchema()

root
 |-- Date of Registry (If Applicable): string (nullable = true)
 |-- Location (Parish): string (nullable = true)
 |-- Main Production: string (nullable = true)
 |-- Manager (If Applicable): string (nullable = true)
 |-- Number of Enslaved People: string (nullable = true)
 |-- Owner: string (nullable = true)
 |-- Plantation Name: string (nullable = true)
 |-- Sex of Owner: string (nullable = true)
 |-- P Sheet Id*: string (nullable = true)
 |-- Signature: string (nullable = true)
 |-- Age: string (nullable = true)
 |-- Corrections: string (nullable = true)
 |-- Couleur (Color): string (nullable = true)
 |-- Emplois (Employment): string (nullable = true)
 |-- Family (Y/N): string (nullable = true)
 |-- Female Parent (Individual ID): string (nullable = true)
 |-- Gender (M/F): string (nullable = true)
 |-- Individual ID: string (nullable = true)
 |-- Male Parent (Individual ID): string (nullable = true)
 |-- Marques (Marks): string (nullable = true)
 |-- Nom (First Name): string (nul

In [78]:
# 2. Number of joined rows for each distinct "Number of Enslaved People" value.
enslaved_counts = comb.groupBy(comb["Number of Enslaved People"]).count()
enslaved_counts.show()

+-------------------------+-----+
|Number of Enslaved People|count|
+-------------------------+-----+
|                      234|  234|
|                      200|  200|
|                      250|  250|
|                      139|  139|
|                      203|  203|
|                      164|  164|
|                       68|   68|
|                      338|  338|
|                      247|  247|
|                      150|  150|
|                      153|  153|
|                      143|  143|
|                      134|  134|
|                      312|  312|
|                      151|  151|
+-------------------------+-----+



In [79]:
# 3a. Unpivot the colour lookup file into (color, value) rows, where value is
# the array of spellings stored under that colour's column.
from pyspark.sql.functions import col, explode, expr, lower

color = (
    spark.read
    .option("multiline", True)
    .json("file:///home/jovyan/datasets/british-empire/lookup-data/color.json")
)
color.printSchema()
color.createOrReplaceTempView("colors")

# stack() unpivot technique:
# https://harshitjain.home.blog/2019/09/27/pivot-and-unpivot-a-spark-dataframe/
# Variant 1: DataFrame API.
unpivot_expr = expr("stack(4,'Black',Black,'Griffe',Griffe,'Mestif',Mestif,'Mulatre',Mulatre) as (color, value)")
color.select(unpivot_expr).show()

# Variant 2: the same unpivot as Spark SQL against the "colors" temp view.
query = '''
select stack(4,'Black',Black,'Griffe',Griffe,'Mestif',Mestif,'Mulatre',Mulatre) as (color, value)
from colors
'''
spark.sql(query).show()

root
 |-- Black: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- Griffe: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- Mestif: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- Mulatre: array (nullable = true)
 |    |-- element: string (containsNull = true)

+-------+--------------------+
|  color|               value|
+-------+--------------------+
|  Black|[négre, négresse,...|
| Griffe|[griffe, capre, c...|
| Mestif|[mestif, mestive,...|
|Mulatre|[mulatre, mulatre...|
+-------+--------------------+

+-------+--------------------+
|  color|               value|
+-------+--------------------+
|  Black|[négre, négresse,...|
| Griffe|[griffe, capre, c...|
| Mestif|[mestif, mestive,...|
|Mulatre|[mulatre, mulatre...|
+-------+--------------------+



In [99]:
# Flatten the lookup: unpivot to (color, value), then explode the value array
# so each row pairs a canonical colour ("Color*") with one spelling ("color").
unpivoted = color.select(
    expr("stack(4,'Black',Black,'Griffe',Griffe,'Mestif',Mestif,'Mulatre',Mulatre) as (color, value)")
)
c2 = unpivoted.select(
    col("color").alias("Color*"),
    explode("value").alias("color"),
)
c2.show(15)

+------+---------------+
|Color*|          color|
+------+---------------+
| Black|          négre|
| Black|       négresse|
| Black|          black|
| Black|          negro|
| Black|          negre|
| Black|           noir|
| Black|       negresse|
| Black|   negre [sic?]|
| Black|  negre infirme|
| Black|    negre rouge|
| Black|negre rougeatre|
| Black|     negre[sse]|
| Black|        negrese|
| Black|       negresee|
| Black|        negress|
+------+---------------+
only showing top 15 rows



In [71]:
# Lower-case the recorded colour, then look it up in the flattened colour table.
# The left join keeps every recorded value, matched or not.
recorded_color = lower("Couleur (Color)").alias("Couleur (Color)")
comb2 = comb.select(recorded_color)
comb3 = comb2.join(
    c2,
    on=c2["color"] == comb2["Couleur (Color)"],
    how="left",
)
comb3.show(30)

+---------------+-------+-------+
|Couleur (Color)| Color*|  color|
+---------------+-------+-------+
|          negro|  Black|  negro|
|          negro|  Black|  negro|
|          negro|  Black|  negro|
|          negro|  Black|  negro|
|          negro|  Black|  negro|
|          negro|  Black|  negro|
|          negro|  Black|  negro|
|          negro|  Black|  negro|
|          negro|  Black|  negro|
|          negro|  Black|  negro|
|          negro|  Black|  negro|
|            cap| Griffe|    cap|
|          negro|  Black|  negro|
|          negro|  Black|  negro|
|        capress| Griffe|capress|
|            cap| Griffe|    cap|
|          negro|  Black|  negro|
|          negro|  Black|  negro|
|          negro|  Black|  negro|
|          negro|  Black|  negro|
|          negro|  Black|  negro|
|          negro|  Black|  negro|
|          negro|  Black|  negro|
|          negro|  Black|  negro|
|        mulatto|Mulatre|mulatto|
|          negro|  Black|  negro|
|          neg

In [98]:
# 4a? Count how many rows reference each male parent, per sheet.
parent_columns = [
    "Sheet Id*",
    "Individual ID",
    "Gender (M/F)",
    "Male Parent (Individual ID)",
    "Female Parent (Individual ID)",
]
comb4 = comb.select(*parent_columns)
comb4.show(10)
comb4.createOrReplaceTempView("parents")

# Rows whose male-parent field is the empty string are excluded before grouping.
query = '''
select `Sheet Id*`, `Male Parent (Individual ID)`, count(*) as male_children_count
    from parents
    where `Male Parent (Individual ID)` != '' 
    group by `Sheet Id*`, `Male Parent (Individual ID)`
'''
spark.sql(query).show()

+--------------------+-------------+------------+---------------------------+-----------------------------+
|           Sheet Id*|Individual ID|Gender (M/F)|Male Parent (Individual ID)|Female Parent (Individual ID)|
+--------------------+-------------+------------+---------------------------+-----------------------------+
|1_BIptA69xxh9uhPz...|            1|           M|                           |                             |
|1_BIptA69xxh9uhPz...|            2|           F|                           |                             |
|1_BIptA69xxh9uhPz...|            3|           M|                           |                             |
|1_BIptA69xxh9uhPz...|            4|           F|                           |                             |
|1_BIptA69xxh9uhPz...|            5|           F|                          3|                             |
|1_BIptA69xxh9uhPz...|            6|           M|                           |                             |
|1_BIptA69xxh9uhPz...|      

[Stage 211:>                                                        (0 + 1) / 1]

+--------------------+---------------------------+-------------------+
|           Sheet Id*|Male Parent (Individual ID)|male_children_count|
+--------------------+---------------------------+-------------------+
|1_BIptA69xxh9uhPz...|                         80|                  2|
|1QfG8wLP-MP-RE_ec...|                        124|                  1|
|1zGxny4JUmRR-j_vk...|               Julien (175)|                  2|
|1SE-Odib2kV-Gd8Sx...|                          2|                  1|
|1puOKEhbVxaekntK7...|                        102|                  2|
|1puOKEhbVxaekntK7...|                         33|                  1|
|1QfG8wLP-MP-RE_ec...|                          1|                  2|
|1QfG8wLP-MP-RE_ec...|                        162|                  1|
|1_BIptA69xxh9uhPz...|                        269|                  1|
|1SE-Odib2kV-Gd8Sx...|                         52|                  2|
|1QfG8wLP-MP-RE_ec...|                         61|                  4|
|12uGq

                                                                                