### Load Data

#### Connect to Spark & Elasticsearch, gather raw data

In [None]:
import sys
!{sys.executable} -m pip install findspark plotly colorlover scipy numpy

In [None]:
#import findspark
import os

from pyspark import SparkContext
from pyspark.sql import SparkSession, SQLContext

# os.environ['PYSPARK_SUBMIT_ARGS'] = '--jars jars/elasticsearch-spark-20_2.11-6.5.1.jar pyspark-shell'

#findspark.init()

sc = SparkContext(appName="esAnalytics")
sqlContext = SQLContext(sc)

# spark = SparkSession.builder \
#     .master("local[*]") \
#     .appName("meetup") \
#     .config("spark.some.config.option", "some-value") \
#     .config("spark.sql.crossJoin.enabled", "true") \
#     .getOrCreate()

spark = SparkSession.builder \
    .master("yarn") \
    .appName("meetup") \
    .config("spark.some.config.option", "some-value") \
    .config("spark.sql.crossJoin.enabled", "true") \
    .getOrCreate()

In [None]:
spark

In [None]:
# from json import loads, dumps

# es_read_conf = {
# "es.nodes" : 'elastic',
# "es.port" : '9200',
# "es.resource" : 'meetup-rawdata-*/default'
# }

# raw_data = sc.newAPIHadoopRDD(
# inputFormatClass="org.elasticsearch.hadoop.mr.EsInputFormat",
# keyClass="org.apache.hadoop.io.NullWritable",
# valueClass="org.elasticsearch.hadoop.mr.LinkedMapWritable",
# conf=es_read_conf)

# raw_data = raw_data.map(lambda v: loads(dumps(v[1])))

# df = sqlContext.createDataFrame(raw_data)

In [None]:
#df = spark.read.json("./data/meetup-rawdata/*json")
df = spark.read.json("gs://pw-bd-project-meetup-rawdata/*json")
# add - select only columns used later on
df.cache()
df.count()

#### Select only last response for each rsvp_id

In [None]:
from pyspark.sql.window import Window
from pyspark.sql.functions import row_number, col

rsvpWindowSpec = Window.partitionBy(df["rsvp_id"]).orderBy(df["mtime"].desc())

df = df \
    .withColumn("rowId", row_number().over(rsvpWindowSpec)) \
    .where("rowId = 1") \
    .sort("rsvp_id")

#### Establish timezone, day_of_week_local, hour_local, minute_local of event.event_time based on venue.venue_geo

In [None]:
# import pytz

# from datetime import datetime
# from tzwhere import tzwhere

# from pyspark.sql.functions import udf
# from pyspark.sql.types import StringType, ArrayType

# @udf(StringType())
# def udf_timezone_by_geo(lat, lon):
#     t = tzwhere.tzwhere()
    
#     return t.tzNameAt(float(lat), float(lon))

# @udf(ArrayType(StringType()))
# def udf_localize_with_timezone(utc_time, timezone_str):
    
#     epoch_utc = int(utc_time)
    
#     timezone_str = timezone_str.strip()
    
#     # check if provided in ms or s:
#     if len(str(epoch_utc)) == 13:
#         epoch_utc = epoch_utc / 1000

#     # get time in UTC
#     utc_dt = datetime.utcfromtimestamp(epoch_utc)

#     # convert it to tz
#     tz = pytz.timezone(timezone_str)
#     dt = utc_dt.astimezone(tz)

#     offset = dt.utcoffset().total_seconds()

#     local_dt = datetime.utcfromtimestamp(epoch_utc + offset)
    
#     parts = dict(year_local=local_dt.year,
#                  month_local=local_dt.month, 
#                  day_local=local_dt.day, 
#                  weekday_local=local_dt.isoweekday(),
#                  hour_local=local_dt.hour, 
#                  minute_local=local_dt.minute)
    
#     return list(parts.values())

In [None]:
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType, ArrayType

from datetime import datetime

@udf(ArrayType(StringType()))
def udf_extract_date_parts(utc_time):
    try:
        epoch_utc = int(utc_time)
        
        epoch_utc_len = len(str(epoch_utc))
        
        # check if provided in ms or s:
        if epoch_utc_len > 10:
            epoch_utc = epoch_utc / (10**(epoch_utc_len-10))

        # get time in UTC
        utc_dt = datetime.utcfromtimestamp(epoch_utc)
    
        parts = dict(year_local=utc_dt.year,
                     month_local=utc_dt.month, 
                     day_local=utc_dt.day, 
                     weekday_local=utc_dt.isoweekday(),
                     hour_local=utc_dt.hour, 
                     minute_local=utc_dt.minute)
        
    except Exception:
        parts = {i:-1 for i in range(6)}
    finally:
        return list(parts.values())

In [None]:
# # to optimize matching event.time with venue.lat/venue.lon create dict with distinct venues

# from pyspark.sql.functions import concat, lit

# venueGeoDict = df \
#     .select(col("venue.lat"), col("venue.lon")) \
#     .distinct() \
#     .withColumn("key", concat(col("lat"), lit("_"), col("lon"))) \
#     .withColumn("event_timezone", udf_timezone_by_geo(col("lat"), col("lon"))) \
#     .select(col("key"), col("event_timezone"))

# venueGeoDict.show(5)

In [None]:
# dfWithEventTimezone = df \
#     .join(venueGeoDict, concat(col("venue.lat"), lit("_"), col("venue.lon")) == venueGeoDict.key, 'cross') 

dfWithEventTimeParts = df \
    .where(col("event.time").isNotNull()) \
    .withColumn("event_time_parts", udf_extract_date_parts(df.event.time))

dfWithEventTimeParts = dfWithEventTimeParts.where(dfWithEventTimeParts.event_time_parts[1] > 0)

dfWithEventTimeParts.cache()

In [None]:
dfWithEventTimeParts \
    .select(col("event.time"), col("event_time_parts")) \
    .show(5)

### Analyze

#### Calculate & visualize most distinguishable distributions of meetings in particular day_of_week_local by tag (Jensen–Shannon divergence)

In [None]:
# total distribution

from pyspark.sql.functions import lit, count, udf, collect_list
from pyspark.sql.types import StringType, DoubleType, MapType

countByAllWindowSpec = Window.partitionBy(lit(1))

totalWeekdayDistribution = dfWithEventTimeParts \
    .withColumn("event_isoweekday", dfWithEventTimeParts.event_time_parts[3]) \
    .select(col("rsvp_id"), col("event_isoweekday"), count(col("rsvp_id")).over(countByAllWindowSpec).alias("count_all")) \
    .groupBy(col("event_isoweekday"), col("count_all")) \
    .count() \
    .sort("event_isoweekday") \
    .withColumn("weekday_total_dist", col("count")/col("count_all")) \
    .groupBy() \
    .agg(collect_list(col("weekday_total_dist")).alias("weekday_total_dist"))

totalWeekdayDistribution.show()

In [None]:
totalWeekdayDistribution.collect()

In [None]:
# distribution by group topic
from pyspark.sql.functions import explode, lower, coalesce, abs, max, rank
from pyspark.sql.types import Row

countByTopicWindowSpec = Window.partitionBy("group_topic")

topicWeekdayDistributionTmp = dfWithEventTimeParts \
    .withColumn("event_isoweekday", dfWithEventTimeParts.event_time_parts[3]) \
    .select(col("rsvp_id"), col("event_isoweekday"), explode(col("group.group_topics")).alias("group_topic_map")) \
    .withColumn("group_topic", col("group_topic_map")) \
    .withColumn("total_topic_count", count("rsvp_id").over(countByTopicWindowSpec)) \
    .drop("group_topic_map") \
    .groupBy(col("event_isoweekday"), col("group_topic"), col("total_topic_count")) \
    .count() \
    .sort(col("group_topic"), col("event_isoweekday"))

# topicWeekdayDistributionTmp.show(30)

topics = topicWeekdayDistributionTmp.select(col("group_topic").alias("group_topic_tmp")).distinct()
weekdays = sc.parallelize(list(range(7))).map(lambda x: Row(event_isoweekday_tmp=str(1 + int(x)))).toDF()
cross = weekdays.crossJoin(topics).withColumn("count_tmp", lit(0))

# ensure that every topic has entry for every weekday (even if no meetings took place on that weekday)
topicWeekdayDistribution = cross \
    .join(topicWeekdayDistributionTmp, (topicWeekdayDistributionTmp.event_isoweekday == cross.event_isoweekday_tmp) & (topicWeekdayDistributionTmp.group_topic == cross.group_topic_tmp), how='outer') \
    .withColumn("event_isoweekday", col("event_isoweekday_tmp")) \
    .withColumn("group_topic", col("group_topic_tmp")) \
    .withColumn("count", coalesce("count", "count_tmp")) \
    .withColumn("total_topic_count", coalesce("total_topic_count", lit(-1))) \
    .drop("event_isoweekday_tmp", "group_topic_tmp", "count_tmp") \
    .sort("group_topic", "event_isoweekday") \
    .withColumn("weekday_topic_dist", abs(col("count")/col("total_topic_count"))) \
    .groupBy("group_topic") \
    .agg(collect_list(col("weekday_topic_dist")).alias("weekday_topic_dist"), max(col("total_topic_count")).alias("topic_count")) \
    .withColumn("topic_count_rank", rank().over(Window.partitionBy(lit(1)).orderBy(col("topic_count"))))

topicWeekdayDistribution.show(20)
topicWeekdayDistribution.cache()

In [None]:
# JS Divergence UDF

from numpy import asarray, e
from scipy import stats

from pyspark.sql.functions import udf
from pyspark.sql.types import DoubleType

@udf(DoubleType())
def udf_jsd(p, q, base=e):
    '''
        Implementation of pairwise `jsd` based on  
        https://en.wikipedia.org/wiki/Jensen%E2%80%93Shannon_divergence
    '''
    try:
        ## convert to np.array
        p, q = asarray(p), asarray(q)
        ## normalize p, q to probabilities
        p, q = p/p.sum(), q/q.sum()

        m = 1./2*(p + q)

        return float(stats.entropy(p,m, base=base)/2. +  stats.entropy(q, m, base=base)/2.)
    except:
        return -1

In [None]:
h = topicWeekdayDistribution.groupBy("topic_count").count().orderBy("topic_count").repartition(1)
#h.write.format("com.databricks.spark.csv").csv("gs://pw-bd-project-meetup-rawdata/hist")
#h.show()

In [None]:
# calculate Jensen-Shannon Divergence per topic & select 10 highest
jsDivergence = topicWeekdayDistribution \
    .crossJoin(totalWeekdayDistribution) \
    .withColumn("jsd", udf_jsd(col("weekday_topic_dist"), col("weekday_total_dist"))) \
    .sort(col("jsd").desc()) \
    .where(topicWeekdayDistribution.topic_count > 1000)

jsDivergence.cache()
jsDivergence.count()

In [None]:
jsDivergence.sort(col("jsd").desc()).show(100)

In [62]:
def plot_hist(data_list):
    import plotly.graph_objs as go
    from plotly.offline import init_notebook_mode, iplot
    from math import ceil
    import plotly
    
    init_notebook_mode(connected=True)
    
    from json import loads
    # import colorlover as cl

    no = len(data_list)
    
    colsNo = 5
    rowsNo = ceil(no/colsNo)
    
#     color = cl.scales['11']['div']['RdBu']
    
#     select = [1,2,-1,-2,3]
#     color = [color[i] for i in select]

    color = ['rgb(178,24,43)', 'rgb(214,96,77)', 'rgb(5,48,97)', 'rgb(33,102,172)', 'rgb(244,165,130)']
    
    fig = plotly.tools.make_subplots(rows=rowsNo, 
                              cols=colsNo, 
                              subplot_titles=['<b>{}</b><br>(JSD: {:.2f} Count: {})'.format(loads(x).get('group_topic', ''),
                                                                    loads(x).get('jsd', ''), loads(x).get('topic_count', ''))
                                                                     for x in data_list],
                              
                              shared_yaxes=True
                             )

    fig['layout'].update(height=rowsNo*300, 
                         width=colsNo*300, 
                         title='Most characteristic weekday dist', 
                         showlegend=False)

    i = 0
    
    rows = [x+1 for x in range(rowsNo)]
    cols = [x+1 for x in range(colsNo)]
    
    combos = [(i,j) for i in rows for j in cols]
    
    ci = 0
    
    for data in data_list:
        ci += 1

        c = color[(ci % colsNo)]
        combo = combos[i]
        
        cur_row = combo[0]
        cur_col = combo[1]
        
        data = loads(data)

        x = [x+1 for x in range(7)]
        y1 = data.get('weekday_topic_dist', [0 for x in range(6)])
        y2 = data.get('weekday_total_dist', [0 for x in range(6)])

        title = data.get('group_topic', 'na')

        fig.append_trace(go.Bar(x=x,y=y1,marker=dict(color=[c for i in range(7)])), cur_row, cur_col)
        fig.append_trace(go.Scatter(x=x,y=y2,mode='lines',line=dict(color='gray',width=3,shape='spline')), cur_row, cur_col)
        
        i += 1
        
    plotly.offline.plot(fig, filename='plotly-distribution.html')
    
entries = jsDivergence.toJSON().take(200)

plot_hist(entries)

This is the format of your plot grid:
[ (1,1) x1,y1 ]      [ (1,2) x2,y1 ]      [ (1,3) x3,y1 ]      [ (1,4) x4,y1 ]      [ (1,5) x5,y1 ]    
[ (2,1) x6,y2 ]      [ (2,2) x7,y2 ]      [ (2,3) x8,y2 ]      [ (2,4) x9,y2 ]      [ (2,5) x10,y2 ]   
[ (3,1) x11,y3 ]     [ (3,2) x12,y3 ]     [ (3,3) x13,y3 ]     [ (3,4) x14,y3 ]     [ (3,5) x15,y3 ]   
[ (4,1) x16,y4 ]     [ (4,2) x17,y4 ]     [ (4,3) x18,y4 ]     [ (4,4) x19,y4 ]     [ (4,5) x20,y4 ]   
[ (5,1) x21,y5 ]     [ (5,2) x22,y5 ]     [ (5,3) x23,y5 ]     [ (5,4) x24,y5 ]     [ (5,5) x25,y5 ]   
[ (6,1) x26,y6 ]     [ (6,2) x27,y6 ]     [ (6,3) x28,y6 ]     [ (6,4) x29,y6 ]     [ (6,5) x30,y6 ]   
[ (7,1) x31,y7 ]     [ (7,2) x32,y7 ]     [ (7,3) x33,y7 ]     [ (7,4) x34,y7 ]     [ (7,5) x35,y7 ]   
[ (8,1) x36,y8 ]     [ (8,2) x37,y8 ]     [ (8,3) x38,y8 ]     [ (8,4) x39,y8 ]     [ (8,5) x40,y8 ]   
[ (9,1) x41,y9 ]     [ (9,2) x42,y9 ]     [ (9,3) x43,y9 ]     [ (9,4) x44,y9 ]     [ (9,5) x45,y9 ]   
[ (10,1) x46,y10 ]   [ (10

In [65]:
! gsutil cp plotly-distribution.html gs://pw-bd-project-meetup-rawdata/

Copying file://plotly-distribution.html [Content-Type=text/html]...
Copying file://plotly-distribution.html [Content-Type=text/html]...             
/ [1 files][  3.0 MiB/  3.0 MiB]                                                
Operation completed over 1 objects/3.0 MiB.                                      
/ [1 files][  3.0 MiB/  3.0 MiB]                                                
Operation completed over 1 objects/3.0 MiB.                                      


#### Calculate 'New Years Resolutions Effect' to establish which tags gained most interest inbetween december/january

In [83]:
# all available topics (if occured in one month and not other)
t = df \
    .select(explode(col("group.group_topics")).alias("topic")) \
    .select(col("topic").alias("topic")) \
    .distinct()

# select topics from january, 2019
m1 = dfWithEventTimeParts \
    .where((col("event_time_parts")[0] == '2019') & (col("event_time_parts")[1] == '1')) \
    .select(explode("group.group_topics").alias("topic")) \
    .withColumn("topic_m1", col("topic")) \
    .groupBy("topic_m1") \
    .count() \
    .alias("m1")

# select topics from december, 2018
m2 = dfWithEventTimeParts \
    .where((col("event_time_parts")[0] == '2018') & (col("event_time_parts")[1] == '12')) \
    .select(explode("group.group_topics").alias("topic")) \
    .withColumn("topic_m2", col("topic")) \
    .groupBy("topic_m2") \
    .count() \
    .alias("m2")

# calculate increase in interest per topic & select 10 highest
increase = t \
    .join(m2, t.topic == m2.topic_m2, how='full') \
    .join(m1, t.topic == m1.topic_m1, how='full') \
    .withColumn("m1", coalesce(col("m1.count"), lit("0"))) \
    .withColumn("m2", coalesce(col("m2.count"), lit("0"))) \
    .select(col("topic"), col("m1"), col("m2")) \
    .withColumn("change", col("m1")/col("m2")) \
    .withColumn("change_importance", ((col("m1")-col("m2"))/1)*(col("m1")/col("m2"))) \
    .sort(col("change_importance").desc()) \
    .repartition(1)

increase.cache()

increase.limit(20).show()

# # show topics that were absent in december, 2018 but appeared in january, 2019
# new_topics = increase \
#     .where((col("m1") > 0) & (col("m2") == 0))

increase.show()

increase.count()

! gsutil rm -fR gs://pw-bd-project-meetup-rawdata/increase-topics
increase.write.format("com.databricks.spark.csv").csv("gs://pw-bd-project-meetup-rawdata/increase-topics")

+--------------------+------+------+------------------+------------------+
|               topic|    m1|    m2|            change| change_importance|
+--------------------+------+------+------------------+------------------+
|    Entrepreneurship|255353|127155|2.0082025873933387|257447.55529865122|
|    Self-Improvement|317050|179649|1.7648303079894683|242489.44914806093|
|   Social Networking|449856|302186| 1.488672539429358| 219832.2738975333|
|Software Development|254395|137346|1.8522199408792392|216800.49185997408|
|            RabbitMQ|   442|     1|             442.0|          194922.0|
|Professional Netw...|219587|118552| 1.852242054119711|187141.27593798502|
|              Social|453261|321181|1.4112322958082826|186395.56163035796|
|         New In Town|439804|310670|1.4156629220716517|182810.21577880066|
|           Fun Times|398135|275139|1.4470322273469047| 177979.1758347599|
|Computer programming|194896|104110|1.8720199788685046|169953.20580155606|
|   Business Strategy|151

Removing gs://pw-bd-project-meetup-rawdata/increase-topics/part-00004-50d57b9a-c883-4337-9779-495933bf92e5-c000.csv#1548420562419724...
Removing gs://pw-bd-project-meetup-rawdata/increase-topics/#1548420561239512...
Removing gs://pw-bd-project-meetup-rawdata/increase-topics/_SUCCESS#1548420586939630...
Removing gs://pw-bd-project-meetup-rawdata/increase-topics/part-00000-50d57b9a-c883-4337-9779-495933bf92e5-c000.csv#1548420562379019...
Removing gs://pw-bd-project-meetup-rawdata/increase-topics/part-00001-50d57b9a-c883-4337-9779-495933bf92e5-c000.csv#1548420562413969...
/ [4 objects]                                                                   
==> NOTE: You are performing a sequence of gsutil operations that may
run significantly faster if you instead use gsutil -m rm ... Please
see the -m section under "gsutil help options" for further information
about when gsutil -m can be advantageous.

Removing gs://pw-bd-project-meetup-rawdata/increase-topics/part-00002-50d57b9a-c883-4337-97

Removing gs://pw-bd-project-meetup-rawdata/increase-topics/part-00030-50d57b9a-c883-4337-9779-495933bf92e5-c000.csv#1548420566004270...
Removing gs://pw-bd-project-meetup-rawdata/increase-topics/part-00031-50d57b9a-c883-4337-9779-495933bf92e5-c000.csv#1548420565801694...
Removing gs://pw-bd-project-meetup-rawdata/increase-topics/part-00032-50d57b9a-c883-4337-9779-495933bf92e5-c000.csv#1548420566933073...
Removing gs://pw-bd-project-meetup-rawdata/increase-topics/part-00033-50d57b9a-c883-4337-9779-495933bf92e5-c000.csv#1548420566771929...
Removing gs://pw-bd-project-meetup-rawdata/increase-topics/part-00031-50d57b9a-c883-4337-9779-495933bf92e5-c000.csv#1548420565801694...
Removing gs://pw-bd-project-meetup-rawdata/increase-topics/part-00032-50d57b9a-c883-4337-9779-495933bf92e5-c000.csv#1548420566933073...
Removing gs://pw-bd-project-meetup-rawdata/increase-topics/part-00033-50d57b9a-c883-4337-9779-495933bf92e5-c000.csv#1548420566771929...
Removing gs://pw-bd-project-meetup-rawdata/incre

Removing gs://pw-bd-project-meetup-rawdata/increase-topics/part-00061-50d57b9a-c883-4337-9779-495933bf92e5-c000.csv#1548420570232458...
Removing gs://pw-bd-project-meetup-rawdata/increase-topics/part-00061-50d57b9a-c883-4337-9779-495933bf92e5-c000.csv#1548420570232458...
Removing gs://pw-bd-project-meetup-rawdata/increase-topics/part-00062-50d57b9a-c883-4337-9779-495933bf92e5-c000.csv#1548420570395254...
Removing gs://pw-bd-project-meetup-rawdata/increase-topics/part-00062-50d57b9a-c883-4337-9779-495933bf92e5-c000.csv#1548420570395254...
Removing gs://pw-bd-project-meetup-rawdata/increase-topics/part-00063-50d57b9a-c883-4337-9779-495933bf92e5-c000.csv#1548420570564309...
Removing gs://pw-bd-project-meetup-rawdata/increase-topics/part-00064-50d57b9a-c883-4337-9779-495933bf92e5-c000.csv#1548420571536832...
Removing gs://pw-bd-project-meetup-rawdata/increase-topics/part-00063-50d57b9a-c883-4337-9779-495933bf92e5-c000.csv#1548420570564309...
Removing gs://pw-bd-project-meetup-rawdata/incre

Removing gs://pw-bd-project-meetup-rawdata/increase-topics/part-00091-50d57b9a-c883-4337-9779-495933bf92e5-c000.csv#1548420574992247...
Removing gs://pw-bd-project-meetup-rawdata/increase-topics/part-00092-50d57b9a-c883-4337-9779-495933bf92e5-c000.csv#1548420574802034...
Removing gs://pw-bd-project-meetup-rawdata/increase-topics/part-00092-50d57b9a-c883-4337-9779-495933bf92e5-c000.csv#1548420574802034...
Removing gs://pw-bd-project-meetup-rawdata/increase-topics/part-00093-50d57b9a-c883-4337-9779-495933bf92e5-c000.csv#1548420575110074...
Removing gs://pw-bd-project-meetup-rawdata/increase-topics/part-00094-50d57b9a-c883-4337-9779-495933bf92e5-c000.csv#1548420574780076...
Removing gs://pw-bd-project-meetup-rawdata/increase-topics/part-00095-50d57b9a-c883-4337-9779-495933bf92e5-c000.csv#1548420574871100...
Removing gs://pw-bd-project-meetup-rawdata/increase-topics/part-00093-50d57b9a-c883-4337-9779-495933bf92e5-c000.csv#1548420575110074...
Removing gs://pw-bd-project-meetup-rawdata/incre

Removing gs://pw-bd-project-meetup-rawdata/increase-topics/part-00122-50d57b9a-c883-4337-9779-495933bf92e5-c000.csv#1548420579115114...
Removing gs://pw-bd-project-meetup-rawdata/increase-topics/part-00122-50d57b9a-c883-4337-9779-495933bf92e5-c000.csv#1548420579115114...
Removing gs://pw-bd-project-meetup-rawdata/increase-topics/part-00123-50d57b9a-c883-4337-9779-495933bf92e5-c000.csv#1548420579439182...
Removing gs://pw-bd-project-meetup-rawdata/increase-topics/part-00123-50d57b9a-c883-4337-9779-495933bf92e5-c000.csv#1548420579439182...
Removing gs://pw-bd-project-meetup-rawdata/increase-topics/part-00124-50d57b9a-c883-4337-9779-495933bf92e5-c000.csv#1548420579507487...
Removing gs://pw-bd-project-meetup-rawdata/increase-topics/part-00125-50d57b9a-c883-4337-9779-495933bf92e5-c000.csv#1548420579817859...
Removing gs://pw-bd-project-meetup-rawdata/increase-topics/part-00124-50d57b9a-c883-4337-9779-495933bf92e5-c000.csv#1548420579507487...
Removing gs://pw-bd-project-meetup-rawdata/incre

Removing gs://pw-bd-project-meetup-rawdata/increase-topics/part-00152-50d57b9a-c883-4337-9779-495933bf92e5-c000.csv#1548420582982674...
Removing gs://pw-bd-project-meetup-rawdata/increase-topics/part-00153-50d57b9a-c883-4337-9779-495933bf92e5-c000.csv#1548420582719249...
Removing gs://pw-bd-project-meetup-rawdata/increase-topics/part-00154-50d57b9a-c883-4337-9779-495933bf92e5-c000.csv#1548420584065426...
Removing gs://pw-bd-project-meetup-rawdata/increase-topics/part-00154-50d57b9a-c883-4337-9779-495933bf92e5-c000.csv#1548420584065426...
Removing gs://pw-bd-project-meetup-rawdata/increase-topics/part-00155-50d57b9a-c883-4337-9779-495933bf92e5-c000.csv#1548420583384044...
Removing gs://pw-bd-project-meetup-rawdata/increase-topics/part-00155-50d57b9a-c883-4337-9779-495933bf92e5-c000.csv#1548420583384044...
Removing gs://pw-bd-project-meetup-rawdata/increase-topics/part-00156-50d57b9a-c883-4337-9779-495933bf92e5-c000.csv#1548420583781906...
Removing gs://pw-bd-project-meetup-rawdata/incre