In [47]:
from os import environ
import findspark
import pandas
from pyspark import SparkContext
from pyspark.sql import SQLContext
from pyspark.sql.functions import col, create_map, lit, sum, split, explode
from pyspark.sql.functions import to_timestamp, from_unixtime


environ['SPARK_HOME'] = '/usr/local/spark'
findspark.init()

sc = SparkContext("local", "first app")
sqlContext = SQLContext(sc)

In [49]:
import json
text_file = sc.textFile('AFINN-111.txt')
with open('dictionary.json', 'a') as file:
    for row in text_file.take(100):
        xrow = row.split('\t')
        dct = {'word':xrow[0], 'value': xrow[1]}
        data = json.dump(dct, file)
        file.write('\n')

In [52]:
raw_tweet_df = sqlContext.read.json('three_minutes_tweets.json')
affin_dict_df = sqlContext.read.json('dictionary.json')

In [65]:
raw_data_df = raw_tweet_df.select(
    col('id').alias('id'),
    col('user.screen_name').alias('username'),
    'timestamp_ms',
    col('lang').alias('lang_code'),
    col('user.time_zone').alias('timezone'),
    col('place.full_name').alias('place_fullname'),
    col('user.location').alias('user_location'),
    col('entities.user_mentions.screen_name').alias('user_mention'),
    col('entities.urls.display_url').alias('display_url'),
    'text',
).filter(raw_tweet_df.text.isNotNull())
raw_data_df.limit(3).toPandas()

Unnamed: 0,id,username,timestamp_ms,lang_code,timezone,place_fullname,user_location,user_mention,display_url,text
0,633030779619012608,snoow3333,1439761273661,ar,Nairobi,,,[],[],ايه الأكل 😜
1,633030779610664960,balwinderstyles,1439761273659,und,,,,"[nitishuna, LasVegasChicas, I_luv_reds, Dimond...",[],RT @nitishuna: @LasVegasChicas @I_luv_reds @Di...
2,633030779631566848,eqtybas,1439761273664,ar,Pacific Time (US & Canada),,,[],[],إنّ العرب إذا تغلبوا على أوطان أسرع إليها الخر...


In [71]:
splitted_text_df = raw_data_df.select(
    'id',
    'username',
    'lang_code',
    'timezone',
    'place_fullname',
    'user_location',
    'user_mention',
    'display_url',
    'text',
    'timestamp_ms',
).withColumn('create_dt', from_unixtime((raw_data_df.timestamp_ms / 1000).cast('bigint'), 'dd.MM.yyyy HH:mm:ss'))\
.withColumn('splitted_text', explode(split(col('text'), ' ')))
splitted_text_df.limit(3).toPandas()

Unnamed: 0,id,username,lang_code,timezone,place_fullname,user_location,user_mention,display_url,text,timestamp_ms,create_dt,splitted_text
0,633030779619012608,snoow3333,ar,Nairobi,,,[],[],ايه الأكل 😜,1439761273661,17.08.2015 00:41:13,ايه
1,633030779619012608,snoow3333,ar,Nairobi,,,[],[],ايه الأكل 😜,1439761273661,17.08.2015 00:41:13,الأكل
2,633030779619012608,snoow3333,ar,Nairobi,,,[],[],ايه الأكل 😜,1439761273661,17.08.2015 00:41:13,😜


In [98]:
joined_df = splitted_text_df.alias('SPLT')\
.join(affin_dict_df.alias('DICT'), col('SPLT.splitted_text') == col('DICT.word'), 'left')\
.select('SPLT.id',
        'SPLT.username',
        'SPLT.timezone',
        'SPLT.lang_code',
        'SPLT.place_fullname',
        'SPLT.user_location',
        'SPLT.user_mention',
        'SPLT.display_url',
        'SPLT.text',
        'SPLT.splitted_text',
        'SPLT.create_dt',
        'DICT.word',
        'DICT.value')
# join_df.show()
joined_df.limit(3).toPandas()
# inner (without any lang) or left join(only 'EN')

Unnamed: 0,id,username,timezone,lang_code,place_fullname,user_location,user_mention,display_url,text,splitted_text,create_dt,word,value
0,633030779619012608,snoow3333,Nairobi,ar,,,[],[],ايه الأكل 😜,ايه,17.08.2015 00:41:13,,
1,633030779619012608,snoow3333,Nairobi,ar,,,[],[],ايه الأكل 😜,الأكل,17.08.2015 00:41:13,,
2,633030779619012608,snoow3333,Nairobi,ar,,,[],[],ايه الأكل 😜,😜,17.08.2015 00:41:13,,


In [123]:
sqlContext.sql('show tables').show()

+--------+-----------------+-----------+
|database|        tableName|isTemporary|
+--------+-----------------+-----------+
|        |dt_most_happy_usr|       true|
|        |        dt_report|       true|
+--------+-----------------+-----------+



In [99]:
joined_df.createOrReplaceTempView('dt_report')

In [102]:
# timezone_report desc= 
sqlContext.sql('''
select 
  timezone
, sum(value) as happy_value 
    from dt_report 
    group by timezone 
    order by 2 desc''').limit(5).toPandas()

Unnamed: 0,timezone,happy_value
0,Pacific Time (US & Canada),12.0
1,Atlantic Time (Canada),12.0
2,Singapore,9.0
3,,9.0
4,London,6.0


In [104]:
# timezone_report asc= 
sqlContext.sql('''
select
  timezone
, sum(value) as happy_value
    from dt_report 
        where value is not null 
    group by timezone 
    order by 2 asc''').limit(5).toPandas()

Unnamed: 0,timezone,happy_value
0,International Date Line West,-6.0
1,Casablanca,3.0
2,Amsterdam,3.0
3,London,6.0
4,Singapore,9.0


In [109]:
# location_report desc= 
sqlContext.sql('''
select 
  user_location
, sum(value) as happy_value 
    from dt_report 
        where user_location <> "" 
    group by user_location 
    order by 2 desc''').limit(5).toPandas()

Unnamed: 0,user_location,happy_value
0,KJNWG. 5sos. avril. zac. ansel,9.0
1,my bed,9.0
2,California,9.0
3,Westside,6.0
4,East Riding of Yorkshire,6.0


In [113]:
# location_report asc= 
sqlContext.sql('''
select 
  user_location
, sum(value) as happy_value
    from dt_report 
        where user_location <> "" 
        and value is not null 
    group by user_location 
    order by 2 asc''').limit(5).toPandas()

Unnamed: 0,user_location,happy_value
0,Neverland,-6.0
1,was hsiaohsiaofishy,-6.0
2,In Arumins dick,-6.0
3,"Wheaton, Illinois",3.0
4,"Bruree, Limerick, Ireland.",3.0


In [116]:
# user_report desc= 
sqlContext.sql('''
select 
  username
, sum(value) as happy_value
    from dt_report 
        where value is not null 
    group by username 
    order by 2 desc''').limit(5).toPandas()

Unnamed: 0,username,happy_value
0,alohaamina,9.0
1,ammaarahf1,9.0
2,fringejokes,9.0
3,ilovehuanhuan,9.0
4,91TOMLINSLUT,9.0


In [118]:
# user_report asc= 
sqlContext.sql('''
select 
  username
, sum(value) as happy_value
    from dt_report 
        where value is not null 
    group by username 
    order by 2 asc''').limit(5).toPandas()

Unnamed: 0,username,happy_value
0,mikaylkymbre,-6.0
1,signaturelawley,-6.0
2,hunterstevens22,-6.0
3,CieeeeeI,-6.0
4,cln_marshmallo,-6.0


In [124]:
splitted_usermention_df = joined_df.withColumn('mentions', explode(col('user_mention')))
splitted_usermention_df.limit(3).toPandas()
splitted_usermention_df.createOrReplaceTempView('dt_most_happy_usr')

In [128]:
# sqlContext.sql('select * from dt_most_happy_usr').limit(5).toPandas()
sqlContext.sql('''
select
  username
, sum(value) as happy_value
    from dt_report
        where username in (select mentions from dt_most_happy_usr)
    group by username
    order by 2 desc
''').limit(3).toPandas()

Unnamed: 0,username,happy_value
0,1DftMarie,
1,1Dtomoonandback,
2,1Wampy,
