### Load Data

#### Connect to Spark & Elasticsearch, gather raw data

In [None]:
import findspark
import os

from pyspark import SparkContext
from pyspark.sql import SparkSession, SQLContext

# Register the elasticsearch-spark connector jar with the PySpark launcher.
# This is set before the SparkContext below is created.
os.environ["PYSPARK_SUBMIT_ARGS"] = (
    "--jars jars/elasticsearch-spark-20_2.11-6.5.1.jar pyspark-shell"
)

# Initialise findspark before any Spark object is instantiated.
findspark.init()

# Low-level entry points: the context and the SQL wrapper around it.
sc = SparkContext(appName="esTest")
sqlContext = SQLContext(sc)

# Session-level entry point.
# NOTE(review): a SparkContext already exists at this point, so getOrCreate()
# presumably reuses it -- confirm whether master/appName below take effect.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("meetup")
    .config("spark.some.config.option", "some-value")
    .getOrCreate()
)

In [None]:
from json import loads, dumps

# Connection settings handed to the elasticsearch-hadoop input format.
es_read_conf = {
    "es.nodes": "elastic",
    "es.port": "9200",
    "es.resource": "meetup-rawdata-*/default",
}

# Read the matching indices as an RDD of (key, document) pairs.
raw_data = sc.newAPIHadoopRDD(
    "org.elasticsearch.hadoop.mr.EsInputFormat",
    "org.apache.hadoop.io.NullWritable",
    "org.elasticsearch.hadoop.mr.LinkedMapWritable",
    conf=es_read_conf,
)

# Drop the key of every pair and round-trip the document through JSON.
raw_data = raw_data.values().map(lambda doc: loads(dumps(doc)))

# Build the DataFrame from the resulting records.
df = sqlContext.createDataFrame(raw_data)

#### Select only last response for each rsvp_id

In [None]:
from pyspark.sql.window import Window
from pyspark.sql.functions import row_number

# Within each rsvp_id, order the rows by mtime descending.
rsvpWindowSpec = Window.partitionBy(df.rsvp_id).orderBy(df.mtime.desc())

# Number the rows inside every window, keep only the first one per rsvp_id,
# and sort the result by rsvp_id.
df = (
    df.withColumn("rowId", row_number().over(rsvpWindowSpec))
    .filter("rowId = 1")
    .orderBy("rsvp_id")
)

#### Establish timezone, day_of_week_local, hour_local, minute_local of event.event_time based on venue.venue_geo

In [None]:
import sys
# IPython shell escape (not plain Python): run pip with the interpreter of the
# running kernel to install the tzwhere and pytz packages.
!{sys.executable} -m pip install tzwhere pytz

In [None]:
import pytz

from datetime import datetime
from tzwhere import tzwhere

from pyspark.sql.functions import udf
from pyspark.sql.types import StringType, StringType

# Per-process cache for the tzwhere lookup object. Constructing it is costly,
# so it is created lazily on first use instead of once per converted row.
_TZWHERE_INSTANCE = None


def _get_tzwhere():
    """Return the shared tzwhere lookup object, creating it on first use."""
    global _TZWHERE_INSTANCE
    if _TZWHERE_INSTANCE is None:
        _TZWHERE_INSTANCE = tzwhere.tzwhere()
    return _TZWHERE_INSTANCE


def utc_epoch_to_local_by_coords(epoch_utc, lat, lon):
    """Convert a UTC epoch timestamp to local-time fields at given coordinates.

    Args:
        epoch_utc: UTC epoch timestamp, in seconds or (when it has exactly
            13 digits) in milliseconds. Anything accepted by ``int()``.
        lat: Latitude of the location. Anything accepted by ``float()``.
        lon: Longitude of the location. Anything accepted by ``float()``.

    Returns:
        A string ``"<month>_<day>_<weekday>_<hour>_<minute>"`` describing the
        local wall-clock time at the location (``weekday`` follows
        ``datetime.weekday()``, Monday == 0), or ``None`` when any input is
        ``None`` or no timezone name is found for the coordinates.
    """
    # Null-in / null-out: a single row with missing values must not abort
    # the whole computation when this runs as a UDF.
    if epoch_utc is None or lat is None or lon is None:
        return None

    lat = float(lat)
    lon = float(lon)
    epoch_utc = int(epoch_utc)

    # check if provided in ms or s: a 13-digit value is taken as milliseconds
    if len(str(epoch_utc)) == 13:
        epoch_utc = epoch_utc / 1000

    timezone_str = _get_tzwhere().tzNameAt(lat, lon)
    if timezone_str is None:
        # The lookup yielded no zone name for these coordinates.
        return None

    tz = pytz.timezone(timezone_str)

    # Build a timezone-AWARE UTC datetime before converting. Calling
    # astimezone() on a naive datetime interprets it as system-local time,
    # which yields a wrong result on any host not running in UTC.
    local_dt = datetime.fromtimestamp(epoch_utc, tz=pytz.utc).astimezone(tz)

    # Explicit tuple keeps the field order fixed on every Python version.
    fields = (
        local_dt.month,
        local_dt.day,
        local_dt.weekday(),
        local_dt.hour,
        local_dt.minute,
    )
    return "_".join(str(value) for value in fields)

# Wrap the converter as a Spark SQL UDF whose result column is a string.
udf_utc_epoch_to_local_by_coords = udf(
    f=utc_epoch_to_local_by_coords,
    returnType=StringType(),
)

In [None]:
# Add a "local" column computed from the event time and the venue
# coordinates, then pull every resulting row back to the driver.
(
    df.withColumn(
        "local",
        udf_utc_epoch_to_local_by_coords(
            df["event"]["time"],
            df["venue"]["lat"],
            df["venue"]["lon"],
        ),
    )
    .collect()
)

### Analyze

#### Calculate & visualize the most distinguishable distributions of meetings on a particular day_of_week_local by tag (Jensen–Shannon divergence)

#### Calculate the 'New Year's Resolutions Effect' to establish which tags gained the most interest between December and January