In [1]:
import pandas as pd 
# Show (effectively) every column when a wide DataFrame is displayed.
pd.set_option("display.max_columns", 1000)

### Person data

In [236]:
# Preview the first two rows of the one-line-per-person extract (pandas).
df_person = (
    pd.read_parquet("scrape_wikidata/processed_data/step_1_one_line_per_person/page000_0_to_2000.parquet")
    .head(2)
)
df_person

Unnamed: 0,human,country_citizen,given_name,family_name,dob,place_birth,birth_coordinates,birth_country,humanlabel,humanaltlabel,given_namelabel,family_namelabel,humandescription,place_birthlabel,country_citizenlabel,sex_or_genderlabel,birth_countrylabel,birth_name,residence,residence_coordinates,pseudonym,ethnicity,ethnicitylabel,residencelabel,residence_countrylabel
47,Q10531,Q766 | Q145,Q14203378,Q21449749,1958-02-01T00:00:00Z,Q1394786,Point(-77.655833333 18.493611111),Q766,Luther Blissett,,Luther,Blissett,"English association football player, coach and...",Falmouth,Jamaica | United Kingdom,male,Jamaica,,,,,,,,
62,Q38082,Q174193 | Q145,Q2958359 | Q768196,Q5287926 | Q2666084,1832-01-27T00:00:00Z,Q990170,Point(-2.631998 53.340406),Q145,Lewis Carroll,"Rev. C. L. Dodgson, Charles Dodgson, Charles L...",Charles | Lewis,Dodgson | Carroll,"English writer, logician, Anglican deacon and ...",Daresbury,United Kingdom of Great Britain and Ireland | ...,male,United Kingdom,Charles Lutwidge Dodgson,Q21,Point(-1.0 53.0),Lewis Carroll,Q42406,English people,England,United Kingdom


### Names

In [3]:
# Preview the first two rows of one page of the raw name-variant data (pandas).
df_names = (
    pd.read_parquet("scrape_wikidata/raw_data/names/stbtsa_page_0_0_to_4999.parquet")
    .head(2)
)
df_names

Unnamed: 0,given_name,original_name,alt_name,name_variant_type
0,Q24969611,Sherie,Sherry,said_to_be_the_same_as
1,Q24969585,LaShawn,LeSean,said_to_be_the_same_as


### Person postcode lookup

In [4]:
# Load the point -> nearby-postcodes lookup and show only the rows belonging
# to the people currently held in df_person.
df_postcode = pd.read_parquet(
    "scrape_wikidata/processed_data/step_2_person_postcode_lookups/page000_0_to_2000.parquet"
)
is_previewed_person = df_postcode["person"].isin(df_person["human"])
df_postcode[is_previewed_person]

Unnamed: 0,point,person,nearby_postcodes
1,Point(-2.631998 53.340406),Q38082,"[WA4 4FS, WA4 4FX, WA4 4AD, WA4 4AB, WA4 6ST]"
2,Point(-1.0 53.0),Q38082,"[NG14 7EN, NG14 7PA, NG14 7EP, NG14 7DP, NG14 ..."


### Addresses

In [268]:
# Preview the first three rows of the postcode -> address-array dataset (pandas).
df_addresses = (
    pd.read_parquet("scrape_wikidata/processed_data/step_5_addresses/addresses_as_array/")
    .head(3)
)
df_addresses


Unnamed: 0,postcode,address_array
0,AL1 1DH,"[{""house_number_paon"":""92"",""street"":""HOLYWELL ..."
1,AL1 1TG,"[{""house_number_paon"":""9"",""street"":""NEW BARNES..."
2,AL1 2JE,"[{""house_number_paon"":""4"",""street"":""GLENGALL P..."


## Master data

In [None]:
from pyspark.context import SparkContext
from pyspark.sql import SparkSession
# Reuse the running SparkContext if there is one, otherwise start it.
sc = SparkContext.getOrCreate()
# Obtain the session through the builder rather than calling the SparkSession
# constructor directly: getOrCreate() returns the already-active session when
# this cell is re-run instead of wrapping the context in a new session object.
spark = SparkSession.builder.getOrCreate()

In [243]:
# Spark copy of the person extract, registered for use from Spark SQL.
# NOTE: this assignment replaces whatever df_person was bound to before.
df_person = spark.read.parquet(
    "scrape_wikidata/processed_data/step_1_one_line_per_person/page000_0_to_2000.parquet"
)
df_person.createOrReplaceTempView("df_person")

In [244]:
# Spark view over the whole raw names directory (not just a single page file).
# NOTE: this assignment replaces whatever df_names was bound to before.
df_names = spark.read.parquet(
    "scrape_wikidata/raw_data/names/"
)
df_names.createOrReplaceTempView("df_names")

In [245]:
# Spark view of the (point, person) -> nearby_postcodes lookup.
df_point_postcode = spark.read.parquet(
    "scrape_wikidata/processed_data/step_2_person_postcode_lookups/page000_0_to_2000.parquet"
)
df_point_postcode.createOrReplaceTempView("df_point_postcode")

In [275]:
# Spark view of the postcode -> address_array dataset.
df_addresses_array = spark.read.parquet(
    "scrape_wikidata/processed_data/step_5_addresses/addresses_as_array"
)
df_addresses_array.createOrReplaceTempView("df_addresses_array")

# Spot check: look a single postcode up in the registered view.
spot_check = spark.sql("select * from df_addresses_array where postcode = 'WD4 9HW'")
spot_check.show()

+--------+-------------+
|postcode|address_array|
+--------+-------------+
+--------+-------------+



In [308]:
def split_field(col_name, num_cols=3):
    """Build a SQL select-list fragment that fans a delimited column out.

    The fragment splits ``col_name`` on the ' | ' separator and exposes the
    first ``num_cols`` pieces as ``<col_name>_1`` .. ``<col_name>_<num_cols>``.

    Args:
        col_name: Name of the column to split; interpolated into the SQL as-is.
        num_cols: How many numbered output columns to generate.

    Returns:
        The comma-separated fragment as a string (empty when ``num_cols`` is 0).
    """
    fragments = []
    # SQL array positions are zero-based, the generated column suffixes are one-based.
    for zero_based in range(num_cols):
        suffix = zero_based + 1
        fragments.append(
            f"split({col_name}, ' \\\\| ')[{zero_based}] as {col_name}_{suffix}"
        )
    return ", ".join(fragments)

In [341]:
# Fan the ' | '-delimited name columns out into numbered columns (up to three
# given names, up to two family names) while keeping each unsplit original.
# Every split block is preceded by its own source column: the last one is
# family_namelabel (selecting family_name a second time would both duplicate
# that column name and drop the unsplit label).
sql = f"""
select 
human, 
given_name, {split_field('given_name')},
given_namelabel, {split_field('given_namelabel')},
family_name, {split_field('family_name',2)},
family_namelabel, {split_field('family_namelabel',2)}
from df_person

"""
df_split_names = spark.sql(sql)
df_split_names.createOrReplaceTempView("df_split_names")


In [342]:
# Build one lookup view per given-name slot (gn_1, gn_2, gn_3): for each person,
# the slot's name label plus the de-duplicated list of alternative names found
# in df_names for that slot's given-name identifier.
for slot in (1, 2, 3):
    variants_query = f"""
    select human, given_namelabel_{slot}, array_distinct(collect_list(n.alt_name)) as given_name_{slot}_variants
    from df_split_names as m
    left join df_names as n
    on m.given_name_{slot} = n.given_name
    group by human, given_namelabel_{slot}
    """
    spark.sql(variants_query).createOrReplaceTempView(f"gn_{slot}")

In [253]:
# Get points

# SQL expression for the de-duplicated array of all of a person's birth and
# residence coordinates. coalesce(..., array()) makes a missing column
# contribute an empty array instead of NULL.
point_array = """

array_union(
    coalesce(split(birth_coordinates, ' \\\\| '), array()) , 
    coalesce(split(residence_coordinates, ' \\\\| '), array())
    
)"""

# Expose the first three points as point_1..point_3. Array positions are
# zero-based, so the third point is element [2] (index [3] would skip it and
# read the fourth element instead).
sql = f"""
select human, place_birth, birth_coordinates, residence_coordinates,  {point_array}[0] as point_1, {point_array}[1] as point_2, {point_array}[2] as point_3
from df_person


"""
#  where human in ('Q105940615', 'Q52162298')
df_point_split = spark.sql(sql)

df_point_split.createOrReplaceTempView("df_point_split")
df_point_split.limit(2).toPandas()

Unnamed: 0,human,place_birth,birth_coordinates,residence_coordinates,point_1,point_2,point_3
0,Q10531,Q1394786,Point(-77.655833333 18.493611111),,Point(-77.655833333 18.493611111),,
1,Q38082,Q990170,Point(-2.631998 53.340406),Point(-1.0 53.0),Point(-2.631998 53.340406),Point(-1.0 53.0),


In [364]:
# For point_1 and point_2, register a view pp_<n> that holds, per person:
#   * two postcodes picked at random from the point's nearby_postcodes
#     (chosen_* and alt_* are drawn independently, so they can coincide), and
#   * one address picked at random from the address array of each postcode.
# RAND() is unseeded, so the picks differ from run to run.
for point_no in (1, 2):
    postcode_query = f"""

    with chosen_postcodes as (
    select human, point_{point_no}, 

    nearby_postcodes[FLOOR(RAND() * FLOOR(size(nearby_postcodes)))] as chosen_postcode_{point_no},
    nearby_postcodes[FLOOR(RAND() * FLOOR(size(nearby_postcodes)))] as alt_postcode_{point_no}

    from df_point_split as h
    left join df_point_postcode as p
    on h.point_{point_no} = p.point and h.human = p.person
    ) 

    select c.*, 
    aa.address_array[FLOOR(RAND() * FLOOR(size(aa.address_array)))] as chosen_address_point_{point_no}_a,
    ab.address_array[FLOOR(RAND() * FLOOR(size(ab.address_array)))] as chosen_address_point_{point_no}_b 

    from
    chosen_postcodes as c


    left join df_addresses_array as aa

    on aa.postcode == c.chosen_postcode_{point_no}

    left join df_addresses_array as ab

    on ab.postcode == c.alt_postcode_{point_no}


    """

    spark.sql(postcode_query).createOrReplaceTempView(f"pp_{point_no}")


In [373]:


# Assemble the master table: start from df_person and left-join, on human,
# the given-name variant views (gn_1..gn_3) and the postcode/address views
# (pp_1, pp_2). dob is truncated to its first 10 characters, and the
# delimited alt-label / citizenship strings are split into arrays.
# NOTE(review): `limit 1000` has no `order by`, so which rows are kept is
# not specified by the query itself.
sql = """
select p.human, 
substr(p.dob,1,10) as dob, 
p.humanlabel,
split(p.humanaltlabel, ", ") as humanaltlabel,
p.given_name,
gn_1.given_namelabel_1,  gn_1.given_name_1_variants,
gn_2.given_namelabel_2,  gn_2.given_name_2_variants,
gn_3.given_namelabel_3,  gn_3.given_name_3_variants,

split(p.country_citizenlabel, ' \\\\| ') as country_citizenship,
place_birthlabel as birth_place,
birth_countrylabel as birth_country,
sex_or_genderlabel as gender,



residencelabel as residence_place,
residence_countrylabel as residence_country,


pp_1.chosen_postcode_1, pp_1.alt_postcode_1, pp_1.chosen_address_point_1_a, pp_1.chosen_address_point_1_b,
pp_2.chosen_postcode_2, pp_2.alt_postcode_2, pp_2.chosen_address_point_2_a, pp_2.chosen_address_point_2_b

from df_person as p

left join  gn_1
on gn_1.human = p.human

left join  gn_2
on gn_2.human = p.human

left join  gn_3
on gn_3.human = p.human

left join pp_1
on pp_1.human = p.human

left join pp_2
on pp_2.human = p.human


limit 1000


"""

# Collect the (at most 1000-row) result to pandas and display 10 random rows.
df_final = spark.sql(sql).toPandas()
df_final.sample(10)

Unnamed: 0,human,dob,humanlabel,humanaltlabel,given_name,given_namelabel_1,given_name_1_variants,given_namelabel_2,given_name_2_variants,given_namelabel_3,given_name_3_variants,country_citizenship,birth_place,birth_country,gender,residence_place,residence_country,chosen_postcode_1,alt_postcode_1,chosen_address_point_1_a,chosen_address_point_1_b,chosen_postcode_2,alt_postcode_2,chosen_address_point_2_a,chosen_address_point_2_b
166,Q351004,1958-12-11,Chris Hughton,"[Christopher ""Chris"" Hughton, Christopher Hugh...",Q339346,Chris,"[Kris, Chres, Christos, Khris, Christakis, Takis]",,[],,[],[United Kingdom],Stratford,United Kingdom,male,,,E15 4EX,E15 4EX,"{""house_number_paon"":""2"",""street"":""GLASIER COU...","{""house_number_paon"":""2"",""street"":""GLASIER COU...",,,,
261,Q2482522,1961-04-12,Magda Szubanski,,Q12795429,Magda,"[Magda, Magdalen, Madelaine, Majida, Magdalene...",,[],,[],"[United Kingdom, Australia]",Liverpool,United Kingdom,female,,,L3 0AZ,L3 0AQ,"{""flat_unit_saon"":""APARTMENT 1205"",""house_numb...","{""house_number_paon"":""7"",""street"":""JESSE HARTL...",,,,
527,Q7149394,1963-01-13,Paul Blackwell,,Q4925623,Paul,"[Pol, Пол, Pau, Pavel, Pawel, Páll, Paavo, Pab...",,[],,[],[United Kingdom],Mancot,United Kingdom,male,,,CH5 2FF,CH5 2DD,,"{""house_number_paon"":""6"",""street"":""FACTORY ROA...",,,,
589,Q5021365,1942-02-23,Stanley Cohen,[Stan Cohen],Q3541269,Stanley,[Stan],,[],,[],[United Kingdom],Johannesburg,South Africa,male,,,,,,,,,,
188,Q3809276,1928-12-01,John Francis Lane,,Q4925477,John,"[Johnny, Juan, Hans, Ivo, Ioan, Giovanni, Jack...",,[],,[],[United Kingdom],Whitstable,United Kingdom,male,,,,,,,,,,
452,Q16939894,1929-08-23,Anthony James Merrill Spencer,,Q12241622,Anthony,"[Tony, Thony, Ентоні, Anthony, Antón, Antonion...",,[],,[],[United Kingdom],Birmingham,United Kingdom,male,,,B5 5AE,B4 7PS,,,,,,
126,Q3295463,1971-06-20,Martin Hollis,,Q18002399,Martin,"[Mertinas, Môrcën, Mårten, Q18574672, Márton, ...",,[],,[],[United Kingdom],Leicester,United Kingdom,male,,,LE2 7JN,LE2 7JN,"{""house_number_paon"":""32"",""street"":""HAZEL STRE...","{""house_number_paon"":""34"",""street"":""HAZEL STRE...",,,,
767,Q21463001,1842-01-01,Vivian Crome,,Q650494,Vivian,"[Vivi, Vivan, Bibiano, Viviano, Viviann, Vivia...",,[],,[],[United Kingdom],,,male,,,,,,,,,,
114,Q15965511,1874-02-28,Edgar Ferdinand Cyriax,[Edgar F. Cyriax],Q2660560,Edgar,"[Eddie, Otger, Édgar, Edgarus, Edgár, Edgars, ...",,[],,[],[United Kingdom],Greater London,United Kingdom,male,,,EC4N 4SF,EC4N 4SF,,,,,,
299,Q6276805,1992-01-25,Jordan McKechnie,,Q14021944,Jordan,"[Jordan, Giordano]",,[],,[],[United Kingdom],Scotland,United Kingdom,male,,,,,,,,,,
