In [None]:
from pyspark.context import SparkContext
from pyspark.sql import SparkSession
import pandas as pd 
from pyspark.sql.functions import expr
# Let pandas render up to 1000 columns so wide previews are not truncated.
pd.set_option("display.max_columns", 1000)

# Obtain the SparkContext (an existing one is reused if present) and wrap it
# in the SparkSession that every later cell uses as `spark`.
sc = SparkContext.getOrCreate()
spark = SparkSession(sc)

### Person data

In [None]:
# Load the scraped person data (the path suggests one row per person, first
# page of results -- confirm against the scraping step).
df_person = spark.read.parquet("scrape_wikidata/processed_data/step_1_one_line_per_person/page000_0_to_2000.parquet")

# Lower-case the name columns so later joins and comparisons are
# case-insensitive. "birth_name" used to appear twice in this list; lower() is
# idempotent, so the duplicate only added a redundant projection and has been
# removed.
# NOTE(review): if the second entry was meant to be a different column, add it here.
name_columns = ["humanlabel", "humanaltlabel", "birth_name", "given_namelabel", "family_namelabel"]
for name_col in name_columns:
    df_person = df_person.withColumn(name_col, expr(f'lower({name_col})'))

# Expose the frame to Spark SQL under the same name as the Python variable.
df_person.createOrReplaceTempView("df_person")

# Debugging aid -- inspect a single person (Q38082 is Lewis Carroll):
# spark.sql("select * from df_person where human = 'Q38082'").toPandas()

# Show two random rows as a sanity check.
df_person.toPandas().sample(2)

### Names

In [None]:
from master_data.names import get_df_given_names_with_freqs, get_df_family_names_with_freqs
# Build the given-name and family-name lookup frames.
df_given_names = get_df_given_names_with_freqs(spark)
df_family_names = get_df_family_names_with_freqs(spark)

# Register each lookup as a temp view named after its Python variable, then
# preview the first two rows of each (given names first, family names second).
name_lookups = {
    "df_given_names": df_given_names,
    "df_family_names": df_family_names,
}
for view_name, lookup_df in name_lookups.items():
    lookup_df.createOrReplaceTempView(view_name)
for lookup_df in name_lookups.values():
    display(lookup_df.limit(2).toPandas())

### Person postcode lookup

In [None]:
from master_data.utils import get_person_nearby_postcodes_lookup
# Build the person -> nearby-postcodes lookup and make it queryable from
# Spark SQL as "df_point_postcode".
df_point_postcode = get_person_nearby_postcodes_lookup(spark)
df_point_postcode.createOrReplaceTempView("df_point_postcode")

# Preview the first two rows.
postcode_preview = df_point_postcode.limit(2).toPandas()
postcode_preview

## Master data

#### Get list of given names and family names

In [None]:
from master_data.names import get_df_filter, name_split
# Split each person's name into separate columns (presumably the
# given_name_1..3 / family_name_1..2 columns that the master-data query
# selects -- confirm against master_data.names.name_split).
df_person = name_split(df_person, spark)

# A temp view stays bound to the DataFrame it was registered with, so the
# "df_person" view created before the split would not see the new columns.
# Re-register it so the SQL that reads `df_person` sees the split result.
df_person.createOrReplaceTempView("df_person")

df_person.limit(5).toPandas()

In [None]:
# SQL fragment: split the citizenship label into an array, map the long-form
# UK label to 'United Kingdom', then drop duplicate entries.
# The four backslashes in this (non-raw) Python literal become two in the SQL
# text; presumably Spark's SQL string parser reduces them to one so the regex
# matches a literal ' | ' separator -- verify against
# spark.sql.parser.escapedStringLiterals.
remove_dupe_country_citizen = """
array_distinct(
    transform(
        split(h.country_citizenlabel, ' \\\\| '), 
            x -> case 
                    when x = 'United Kingdom of Great Britain and Ireland' then 'United Kingdom' 
                    else x 
                    end
    )
)
"""

# Master-data query. Starting from the df_person view (alias h) it:
#   * left-joins df_given_names three times (n1-n3) and df_family_names twice
#     (n4-n5) on the lower-cased name to pick up each name's alt_names;
#   * left-joins df_point_postcode (pc) on the person id to attach
#     nearby_postcodes, and takes postcode/lat/lng from nearby_postcodes[0][0]
#     as the "fake" location columns.
# All joins are left joins, so people without a match are kept.
# NOTE(review): the query expects given_name_1..3 / family_name_1..2 to exist
# on the df_person view -- confirm the view reflects the name_split output.
sql = f"""
select 
h.human, 

h.humanlabel,
split(h.humanaltlabel, ", ") as humanaltlabel,
substr(h.dob,1,10) as dob, 


{remove_dupe_country_citizen}  as country_citizenship,


place_birthlabel as birth_place,
birth_countrylabel as birth_country,
sex_or_genderlabel as gender,



residencelabel as residence_place,
residence_countrylabel as residence_country,

pc.nearby_postcodes[0][0].postcode as fake_postcode,
pc.nearby_postcodes[0][0].lat as fake_lat,
pc.nearby_postcodes[0][0].lng as fake_lng,

pc.nearby_postcodes,
h.given_name_1 as given_name_1,
n1.alt_names as alt_given_name_1,
h.given_name_2 as given_name_2,
n2.alt_names as alt_given_name_2,
h.given_name_3 as given_name_3,
n3.alt_names as alt_given_name_3,
h.family_name_1 as family_name_1,
n4.alt_names as alt_family_name_1,
h.family_name_2 as family_name_2,
n5.alt_names as alt_family_name_2

from df_person as h

left join df_given_names as n1
on lower(h.given_name_1) = n1.original_name

left join df_given_names as n2
on lower(h.given_name_2) = n2.original_name

left join df_given_names as n3
on lower(h.given_name_3) = n3.original_name

left join df_family_names as n4
on lower(h.family_name_1) = n4.original_name

left join df_family_names as n5
on lower(h.family_name_2) = n5.original_name

left join df_point_postcode as pc
on h.human = pc.person

"""
# Run the query and register the result as the "df_final" view (registered
# before the repartition below, so the view refers to the un-repartitioned frame).
df_final = spark.sql(sql)
df_final.createOrReplaceTempView("df_final")  
# Collapse to a single partition before writing, then overwrite any existing
# output in the target directory.
df_final = df_final.repartition(1)
df_final.write.mode('overwrite').parquet("scrape_wikidata/clean_data/master_data/")
# The result is previewed in the following cell.

In [None]:
# Preview the first ten rows of the master data as a pandas DataFrame.
df_final.limit(10).toPandas()