In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import SQLContext
from pyspark.sql.types import *
from pyspark.sql import functions as f
import re

import findspark
findspark.init()

In [2]:
re_name = '(\/[gm]\..+\t<http:\/\/rdf\.freebase\.com\/ns\/type\.object\.name>\t\".*\"@en)'
re_person = '(\/[gm]\..+\t<http:\/\/rdf\.freebase\.com\/ns\/people\.person\..*>\t)'
re_dec_person = '(\/[gm]\..+\t<http:\/\/rdf\.freebase\.com\/ns\/people\.deceased_person\..*>\t)'
re_nationality = '(\/[gm]\..+\t<http:\/\/rdf\.freebase\.com\/ns\/people\.person\..*>\t)'
re_nationality_value = '(<http:\/\/rdf\.freebase\.com\/key\/wikipedia\.en_title>)'

pathFile = '../data/freebase-head-10000000.gz'

In [3]:
sc = SparkSession.builder.master('local[*]').appName('IR Person entity, FREEBASE').config("spark.driver.memory", "15g").getOrCreate()

22/11/12 14:33:16 WARN Utils: Your hostname, martinmacbook.local resolves to a loopback address: 127.0.0.1; using 10.10.60.222 instead (on interface en0)
22/11/12 14:33:16 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/11/12 14:33:17 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [4]:
freebase = sc.sparkContext.textFile(pathFile)
filtered_data = freebase \
    .filter(lambda x: re.search(re_name,x) or re.search(re_person,x) or re.search(re_dec_person,x) or re.search(re_nationality_value, x)) \
    .distinct() \
    .map(lambda x: re.sub('(http\:\/\/rdf.freebase.com\/ns\/)|(\^\^.*\.)|(\@.*\.)|\<|\>|\"|(\t\.)',"",x)) \
    .map(lambda x: x.split('\t')) 

In [5]:
filtered_data.collect()



[['g.112yf8_qt', 'people.person.nationality', 'm.07ylj'],
 ['g.112yf8_qt', 'people.deceased_person.date_of_death', '2001-10-13'],
 ['g.112yf8_qt', 'people.person.date_of_birth', '1933-01-25'],
 ['g.112yfhv9p', 'people.person.date_of_birth', '1887-04-14'],
 ['g.112yfhv9p', 'people.deceased_person.date_of_death', '1963-01-15'],
 ['g.112yfj_hx', 'people.person.date_of_birth', '1990-07-07'],
 ['g.112yfjql4', 'people.deceased_person.date_of_death', '1810'],
 ['g.112yfjql4', 'people.person.date_of_birth', '1735'],
 ['g.112yfl85l', 'people.person.date_of_birth', '1961-08-12'],
 ['g.112yfmpf8', 'people.person.date_of_birth', '1865'],
 ['g.112yfmpf8', 'people.deceased_person.date_of_death', '1936'],
 ['g.112yfpbtx', 'people.person.date_of_birth', '1922-08-09'],
 ['g.112yfqr32', 'people.deceased_person.date_of_death', '1925'],
 ['g.112yfqr32', 'people.person.date_of_birth', '1854'],
 ['g.112yfsjp0', 'people.person.nationality', 'm.0fv4v'],
 ['g.112yfsjp0', 'people.person.height_meters', '1.69'],

In [6]:
schema = StructType([StructField('subject', StringType(), True),
                    StructField('predicate', StringType(), True),
                    StructField('object', StringType(), True, metadata = {"maxlength":2048})])

In [7]:
names = sc.createDataFrame(filtered_data.filter(lambda x: "type.object.name" in x[1]), schema)
births = sc.createDataFrame(filtered_data.filter(lambda x: "people.person.date_of_birth" in x[1]), schema)
deaths = sc.createDataFrame(filtered_data.filter(lambda x: "people.deceased_person.date_of_death" in x[1]), schema)
nationality = sc.createDataFrame(filtered_data.filter(lambda x: "people.person.nationality" in x[1]), schema)
height_meters = sc.createDataFrame(filtered_data.filter(lambda x: "people.person.height_meters" in x[1]), schema)
nationality_value = sc.createDataFrame(filtered_data.filter(lambda x: "type.object.name" in x[1]), schema)

In [8]:
births = births.withColumn("note", f.lit(""))
deaths = deaths.withColumn("note", f.lit(""))
nationality = nationality.withColumn("note", f.lit(""))
height_meters = height_meters.withColumn("note", f.lit(""))
nationality_value = nationality_value.withColumn("note", f.lit(""))

In [9]:
names.registerTempTable("names")
births.registerTempTable("births")
deaths.registerTempTable("deaths")
nationality.registerTempTable("nationality")
height_meters.registerTempTable("height_meters")
nationality_value.registerTempTable("nationality_value")



In [10]:
sql_context = SQLContext(sc.sparkContext)



In [11]:
people = sql_context.sql("""
    select names.subject as id, names.object as name,
    case
        when births.object is not null then (cast(births.object as date)) 
        when deaths.object is not null and births.object is null then (cast(deaths.object as date) - 100*365)
        when deaths.object is null and births.object is null then ''
    end as birth,
    case
        when deaths.object is not null then (cast(deaths.object as date))
        when births.object is not null and deaths.object is null then (cast(births.object as date) + 100*365)
        when deaths.object is null and births.object is null then ''
    end as death,
    ifnull(nationality.object, '') as nationality_ref,
    COALESCE(nationality_value.object, '') as nationality,
    ifnull(height_meters.object, '') as height_meters,
    ifnull(births.note, 'Datum narodenia nemusi byt spravny.') as birth_validation,
    ifnull(deaths.note, 'Datum umrtia nemusi byt spravny.') as death_validation
    from names
    left join births on names.subject = births.subject
    left join deaths on names.subject = deaths.subject
    left join nationality on names.subject = nationality.subject
    left join height_meters on names.subject = height_meters.subject
    inner join nationality_value on nationality.subject = nationality_value.subject
    where births.object is not null or deaths.object is not null or nationality.object is not null or height_meters.object is not null or nationality_value.object is not null
    """)

people = people.drop_duplicates()

In [12]:
people.count()



2438

In [13]:
people.head()



Row(id='m.02pzbtf', name='Wolfgang de Beer', birth='1964-01-02', death='2063-12-08', nationality_ref='m.0345h', nationality='Wolfgang de Beer', height_meters='1.81', birth_validation='', death_validation='Datum umrtia nemusi byt spravny.')

In [14]:
people.repartition(1).write.mode("overwrite").format('com.databricks.spark.csv') \
    .option("mapreduce.fileoutputcommitter.marksuccessfuljobs","false").save('outputs', header = 'true')

