<h3>Define dataset path</h3>

In [5]:
import os

# Root folder holding the dataset files. Set the BIGDATA20_DIR environment
# variable to point at another location; the original local path is the
# fallback, so existing setups keep working unchanged.
big_data_dir = os.environ.get('BIGDATA20_DIR', 'D://Datasets/bigdata20/bigdata20')

# Full paths of the individual dataset files inside big_data_dir.
followers = os.path.join(big_data_dir, 'followers.parquet')
followers_posts = os.path.join(big_data_dir, 'followers_posts_api_final.json')
followers_posts_likes = os.path.join(big_data_dir, 'followers_posts_likes.parquet')
posts = os.path.join(big_data_dir, 'posts_api.json')
posts_likes = os.path.join(big_data_dir, 'posts_likes.parquet')

<h3>Spark initialization</h3>

In [6]:
import pyspark
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession, SQLContext

# Local-mode Spark ('local' master). getOrCreate() reuses a context/session
# that is already running, so re-executing this cell in a live kernel does not
# fail with "Cannot run multiple SparkContexts at once".
conf = SparkConf().setAppName('appName').setMaster('local')
sc = SparkContext.getOrCreate(conf=conf)
spark = SparkSession.builder.config(conf=conf).getOrCreate()
# sqlContext is kept because the dataset-loading cell reads through it.
sqlContext = SQLContext(sc)

In [13]:
from pyspark.sql.functions import col, isnan, when, trim, sort_array, explode

<h3>Define emojis</h3>

This uses the `emoji` library and a CSV file of sentiment-classified emojis
(source: https://www.kaggle.com/thomasseleck/emoji-sentiment-data#Emoji_Sentiment_Data_v1.0.csv)

In [7]:
import emoji
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType, ArrayType
import pandas as pd

# Load the sentiment table and label every emoji with whichever of the
# Negative / Neutral / Positive columns holds the largest value.
emojis_df = pd.read_csv('emoji_sentiment.csv')
emojis_df['Sentiment'] = emojis_df[['Negative', 'Neutral', 'Positive']].idxmax(axis=1)
# The 'Unicode codepoint' column is parsed as a base-16 integer and turned into
# the corresponding character. The loop variable is deliberately not called
# `emoji`, so it cannot be confused with the imported `emoji` module.
emojis_df['unicode_emoji'] = [chr(int(codepoint, 16)) for codepoint in emojis_df['Unicode codepoint'].tolist()]
positive_emojis = emojis_df[emojis_df['Sentiment'] == 'Positive']['unicode_emoji'].values.tolist()
negative_emojis = emojis_df[emojis_df['Sentiment'] == 'Negative']['unicode_emoji'].values.tolist()

# Set copies for constant-time membership checks in the loop below; the list
# versions above are kept under their original names.
_positive_lookup = set(positive_emojis)
_negative_lookup = set(negative_emojis)

# Split the emoji library's table into three dicts. Anything the CSV does not
# label Positive or Negative (including emojis missing from the CSV) ends up
# in the neutral dict.
# NOTE(review): the layout of emoji.UNICODE_EMOJI differs between releases of
# the emoji package - confirm the installed version maps emoji -> name at the
# top level.
unicode_positive_emojis = {}
unicode_negative_emojis = {}
unicode_neutral_emojis = {}
for key, value in emoji.UNICODE_EMOJI.items():
    if key in _positive_lookup:
        unicode_positive_emojis[key] = value
    elif key in _negative_lookup:
        unicode_negative_emojis[key] = value
    else:
        unicode_neutral_emojis[key] = value

<h3>Udf functions for extracting emojis (without broadcasting)</h3>

In [8]:
def extract_positive_emojis(_str):
    """Return the characters of ``_str`` that are keys of ``unicode_positive_emojis``.

    The string is scanned one character at a time and the matching characters
    are concatenated in their original order.

    Args:
        _str: Text to scan. ``None`` or an empty string yields ``''`` instead
            of raising, so null values can be passed through safely.

    Returns:
        A string made only of the matching characters (possibly empty).
    """
    if not _str:
        return ''
    return ''.join(ch for ch in _str if ch in unicode_positive_emojis)

def extract_negative_emojis(_str):
    """Return the characters of ``_str`` that are keys of ``unicode_negative_emojis``.

    The string is scanned one character at a time and the matching characters
    are concatenated in their original order.

    Args:
        _str: Text to scan. ``None`` or an empty string yields ``''`` instead
            of raising, so null values can be passed through safely.

    Returns:
        A string made only of the matching characters (possibly empty).
    """
    if not _str:
        return ''
    return ''.join(ch for ch in _str if ch in unicode_negative_emojis)

def extract_neutral_emojis(_str):
    """Return the characters of ``_str`` that are keys of ``unicode_neutral_emojis``.

    The string is scanned one character at a time and the matching characters
    are concatenated in their original order.

    Args:
        _str: Text to scan. ``None`` or an empty string yields ``''`` instead
            of raising, so null values can be passed through safely.

    Returns:
        A string made only of the matching characters (possibly empty).
    """
    if not _str:
        return ''
    return ''.join(ch for ch in _str if ch in unicode_neutral_emojis)

# Spark UDF wrappers (string return type) around the three plain extractors.
# The functions are passed directly: the former `lambda _str: f(_str)` wrappers
# only forwarded their single argument and added nothing.
extract_positive_emojis_udf = udf(extract_positive_emojis, StringType())
extract_negative_emojis_udf = udf(extract_negative_emojis, StringType())
extract_neutral_emojis_udf = udf(extract_neutral_emojis, StringType())

In [9]:
# Quick sanity check of the plain (non-UDF) extractor on a sample string that
# mixes ordinary letters with emojis.
print(extract_positive_emojis('t😍est😂test😍'))

😍😂😍


<h3>Udf functions for extracting emojis (with broadcasting)</h3>

In [10]:
def extract_emojis(_str, emojis_list):
    """Return the characters of ``_str`` that are contained in ``emojis_list``.

    The string is scanned one character at a time and the matching characters
    are concatenated in their original order.

    Args:
        _str: Text to scan. ``None`` or an empty string yields ``''`` instead
            of raising, so null values can be passed through safely.
        emojis_list: Any container supporting ``in`` that holds the characters
            to keep.

    Returns:
        A string made only of the matching characters (possibly empty).
    """
    if not _str:
        return ''
    return ''.join(ch for ch in _str if ch in emojis_list)

def udf_extract_emojis(emojis_list):
    """Build a string-returning Spark UDF that keeps only the given emojis.

    Args:
        emojis_list: Either a plain container of characters to keep, or a
            Spark broadcast handle (any object exposing ``.value``) wrapping
            such a container. When a handle is given, ``.value`` is read inside
            the UDF at call time, so the UDF's closure holds the handle rather
            than the full list. A plain container is used as is, exactly as
            before.

    Returns:
        A UDF taking one string column and returning the characters of each
        value that ``extract_emojis`` keeps.
    """
    def _extract(text):
        # Resolve lazily: a list has no `.value`, so it falls through unchanged.
        allowed = getattr(emojis_list, 'value', emojis_list)
        return extract_emojis(text, allowed)

    # StringType() is what udf() uses when no return type is given; spelled out
    # here to match the other UDF definitions in this notebook.
    return udf(_extract, StringType())


# Broadcast the emoji characters (the dict keys) to the executors.
# Iterating a dict yields its keys, so list(d) is the same key list that
# sc.parallelize(d).collect() returned, without running a Spark job just to
# copy driver-side data back to the driver.
b_positive_emojis = sc.broadcast(list(unicode_positive_emojis))
b_negative_emojis = sc.broadcast(list(unicode_negative_emojis))
b_neutral_emojis = sc.broadcast(list(unicode_neutral_emojis))
b_emojis = sc.broadcast(list(emoji.UNICODE_EMOJI))

<h3>Load dataset</h3>

In [14]:
# Read the posts JSON file located at `posts` into a Spark DataFrame.
posts_df = sqlContext.read.json(posts)

In [15]:
# Preview the `comments` column; show() prints only the top 20 rows.
posts_df.select('comments').show()

+-------------+
|     comments|
+-------------+
| [1, 0, true]|
| [1, 0, true]|
| [1, 0, true]|
| [1, 0, true]|
| [1, 2, true]|
| [1, 3, true]|
|[1, 27, true]|
| [1, 0, true]|
|[1, 23, true]|
| [1, 0, true]|
| [1, 0, true]|
| [1, 0, true]|
| [1, 0, true]|
| [1, 0, true]|
| [1, 0, true]|
| [1, 0, true]|
| [1, 0, true]|
| [1, 1, true]|
| [1, 0, true]|
| [1, 1, true]|
+-------------+
only showing top 20 rows



<h3>Add columns with emojis</h3>

In [16]:
# Keep posts whose text is not the empty string, add one column per emoji
# category (each holding the matching characters pulled out of `text`), and
# project down to the columns needed afterwards.
# NOTE(review): `.value` is read here on the driver, so each UDF receives the
# plain list rather than the broadcast handle.
processed_df = (
    posts_df
    .filter(posts_df.text != '')
    .withColumn('positive_emojis', udf_extract_emojis(b_positive_emojis.value)(col('text')))
    .withColumn('negative_emojis', udf_extract_emojis(b_negative_emojis.value)(col('text')))
    .withColumn('neutral_emojis', udf_extract_emojis(b_neutral_emojis.value)(col('text')))
    .select('id', 'text', 'positive_emojis', 'negative_emojis', 'neutral_emojis')
)

<h3>Get only posts with emojis</h3>

In [17]:
# A post qualifies when at least one of its three emoji columns is non-empty.
has_any_emoji = (
    (col('positive_emojis') != '')
    | (col('negative_emojis') != '')
    | (col('neutral_emojis') != '')
)
with_emojis_df = processed_df.filter(has_any_emoji)
# Last expression of the cell: displays how many posts contain emojis.
with_emojis_df.count()

739

In [18]:
# Print a preview of the posts that contain at least one emoji.
with_emojis_df.show()

+-----+--------------------+---------------+---------------+--------------+
|   id|                text|positive_emojis|negative_emojis|neutral_emojis|
+-----+--------------------+---------------+---------------+--------------+
|17381|🎄Университет ИТМ...|           🎄🎄|               |              |
|42270|Фото на фоне Глав...|               |               |            📸|
|42273|⚡️Университет ИТМ...|               |               |             ⚡|
|42281|Акселератор Униве...|             🚀|               |              |
|42292|По-настоящему вол...|               |               |            🔮|
|42298|Первокурсники бак...|             🚀|               |              |
|42302|🏕 Без лишних сло...|               |               |            🏕|
|42313|20 сентября в Уни...|             💫|               |              |
|42315|Сегодня 256-й по ...|              ♥|               |              |
|42316|🔥 Держи актуальн...|               |               |            🔥|
|42321|Экватор сентября 

<h3>Save answer</h3>

In [19]:
# Collect the filtered posts to the driver as a pandas DataFrame and serialise
# them to a JSON string using pandas' default to_json() settings.
answer_json = with_emojis_df.toPandas().to_json()
# The encoding is given explicitly so the written bytes do not depend on the
# platform's default locale encoding.
with open('task4_posts_emojis.json', 'w', encoding='utf-8') as f:
    f.write(answer_json)