In [2]:
from pyspark import SparkContext, SparkConf
from pyspark.sql import SQLContext, Row
from pyspark.ml.feature import Tokenizer, StopWordsRemover
from pyspark.sql.functions import udf
from pyspark.sql.types import ArrayType,StringType, DoubleType,NumericType,IntegerType
from pyspark.sql.functions import explode
from pyspark.sql.functions import col
from pyspark.sql.functions import udf, array
from pyspark.sql.functions import countDistinct
from pyspark.sql.functions import lit
import re
import os

In [3]:
# combine all with only cols needed
def _select_tweet_columns(df):
    """Project one raw tweet DataFrame onto the flat column set used downstream.

    Nested fields (user, entities, extended_tweet, place) are pulled up to
    top-level columns; the ``et_`` prefix marks values taken from
    ``extended_tweet``. The full ``place`` struct is kept as the last column.
    """
    return df.select(
        "created_at", "id", "truncated", "lang",
        col("user.id").alias("user_id"), "user.followers_count", "user.friends_count", "user.listed_count",
        "text", col("entities.hashtags.text").alias('hashtags'),
        "entities.urls.display_url", "entities.urls.expanded_url", "entities.urls.url",
        col("entities.user_mentions.screen_name").alias("mentions_screen_name"),
        col("extended_tweet.full_text").alias("et_full_text"),
        col("extended_tweet.entities.hashtags.text").alias("et_hashtags"),
        col("extended_tweet.entities.urls.display_url").alias("et_display_url"),
        col("extended_tweet.entities.urls.expanded_url").alias("et_expanded_url"),
        col("extended_tweet.entities.urls.url").alias("et_url"),
        col("extended_tweet.entities.user_mentions.screen_name").alias("et_mentions_screen_name"),
        "place.country", "place.country_code", "place.name", "place.place_type",
        "favorite_count", "reply_count", "retweet_count", "quote_count", 'place')


def union_df(df_1,df_2):
    """Stack two raw tweet DataFrames after reducing both to the same columns.

    Both inputs go through the same projection, so the positional ``union``
    always lines up identical columns.

    Args:
        df_1: first tweet DataFrame (must expose the nested fields selected above).
        df_2: second tweet DataFrame with the same source schema.

    Returns:
        The rows of ``df_1`` followed by the rows of ``df_2``, projected columns only.
    """
    return _select_tweet_columns(df_1).union(_select_tweet_columns(df_2))

In [4]:
# filter songs and artists

song_name = {
    'Lemonade','ROCKSTAR','WHATS POPPIN','Come & Go','POPSTAR','Smile','Hit Different','Savage Love',
    'Wolves',"my ex's best friend","Go Crazy","GREECE","Roses","Ice Cream","Hawái","you broke me first",
    "Lithuania","Be Like That","Lucid Dreams","Blastoff","DOLLAZ ON MY HEAD","Robbery","All Girls Are The Same",
    "Sunflower - Spider-Man: Into the Spider-Verse","Sunflower","Hate The Other Side","We Paid",
    "Lets Link","death bed","Electric Love","Breaking Me","Rain On Me","Still Don't Know My Name",
    "Girls in the Hood","Someone You Loved","GO","Rags2Riches 2","cardigan","Flex","Over Now","UN DIA",
    "goosebumps","Sunday Best","hot girl bummer","Whiskey Glasses","THE SCOTTS","Savage Remix","Dior",
    "Mad at Disney","ZTFO","Prospect","my future","HIGHEST IN THE ROOM","exile","Bandit","Del Mar",
    "ROXANNE","Falling","Are You Bored Yet?","the 1","Stunnin'","Ballin'","Walk Em Down","Deep Reverence",
    "Money Trees","Chicago Freestyle","My Window","Conversations","OK Not To Be OK","Righteous",
    "SICKO MODE","What You Know Bout Love","Toosie Slide","Head & Heart","Put Your Records On",
    "You Got It","Jocelyn Flores","Holy","Diamonds","The Box","Daisy","September","If the World Was Ending",
    "SLOW DANCING IN THE DARK","FRANCHISE","forget me too","Ew","Your Man","Epidemic","Gimme Love",
    "bloody valentine","Daylight","MODUS","drunk face","Tick Tock","Better","Sanctuary","Run",
    "Pretty Boy","Got It On Me","Afterthought","all I know","Like You Do","kiss kiss","title track",
    "NITROUS","Runnin","Mr. Right Now","Glock In My Lap","Wonder","Rich Nigga Shit","Many Men",
    "Slidin","Bet You Wanna","Lovesick Girls","Outta Time","Brand New Draco","My Dawg","Snitches & Rats",
    "No Opp Left Behind","Steppin On Niggas","RIP Luv","Don't Stop","Sofia","Said N Done","Intro",
    "After Party","Pretty Savage","Heart Of Glass","How You Like That","Intentions","Crazy Over You",
    "Sleepy Hollow","Sweater Weather","Levitating","E-GIRLS ARE RUINING MY LIFE!","Baby, I'm Jealous",
    "Lonely","Blueberry Faygo","Hate The Way","Moonwalking in Calabasas","Whole Lotta Choppas",
    "Train Wreck","positions","Forever After All","Tyler Herro","Spicy","Beautiful Crazy",
    "Beer Never Broke My Heart","Life Is Good","Back to the Streets","Daddy Issues","Falling",
    "Chasin' You","Golden","Monster Mash","Excitement","Ghostbusters","Took Her To The O","Hawái - Remix",
    "ALWAYS DO","Took Her To The O","TRAGIC","Young Wheezy","All I Want for Christmas Is You","The Code",
    "What That Speed Bout!?","WITHOUT YOU","F*CK YOU, GOODBYE","Whoopty","Rockin' Around The Christmas Tree",
    "Therefore I Am","Drankin N Smokin","HOLIDAY","Stripes Like Burberry","That's It","Jingle Bell Rock",
    "Real Baby Pluto","Last Christmas","Santa Tell Me","It's the Most Wonderful Time of the Year",
    "It's Beginning to Look a Lot like Christmas","Marni On Me","Million Dollar Play","Sleeping On The Floor",
    "Plastic","Underneath the Tree","lovely","Monster","Somebody's Problem","Life Goes On","Prisoner",
    "Body","Still Goin Down","Circles","Fly To My Room","Cry Baby","Circles","Blue & Grey","Telepathy",
    "Let It Snow! Let It Snow! Let It Snow!","Dis-ease","Sleigh Ride","Let It Snow! Let It Snow! Let It Snow!",
    "Mistletoe","Feliz Navidad","Holly Jolly Christmas","Stay","Livin' The Dream","Hello",
    "A Holly Jolly Christmas - Single Version","A Holly Jolly Christmas","Movie","Line Without a Hook",
    "The Christmas Song","LA NOCHE DE ANOCHE","TE MUDASTE","YO VISTO ASÍ","HACIENDO QUE ME AMAS",
    "Wonderful Christmastime [Edited Version] - Remastered 2011 / Edited Version","Wonderful Christmastime",
    "TE DESEO LO MEJOR","BOOKER T","Do They Know It's Christmas? - 1984 Version","Do They Know It's Christmas?",
    "MALDITA POBREZA","HOY cobré","EL MUNDO ES MÍO","White Christmas","Angels Like You","LA DROGA",
    "It's Beginning to Look a Lot Like Christmas","Christmas","SORRY PAPI","Blue Christmas",
    "ANTES QUE SE ACABE","120","Run Rudolph Run - Single Version","Run Rudolph Run",
    "Santa Claus Is Coming To Town","TRELLAS","Baby, It's Cold Outside","Happy Xmas"
    }

artists = { "Internet Money","DaBaby","Jack Harlow","Juice WRLD","BTS","DJ Khaled","SZA","Jawsh 685","Big Sean",
           "Machine Gun Kelly","Chris Brown","DJ Khaled","SAINt JHN","BLACKPINK","Maluma","Tate McRae",
           "Big Sean","Kane Brown","Internet Money","Gunna","Post Malone","Lil Baby","WhoHeem","Powfu",
           "BØRNS","Topic","Lady Gaga","Labrinth","Megan Thee Stallion","The Kid LAROI","Rod Wave",
           "Taylor Swift","Polo G","Calvin Harris","J Balvin","Travis Scott","Surfaces","blackbear",
           "Morgan Wallen","THE SCOTTS","Pop Smoke","salem ilese","iann dior","Billie Eilish","Ozuna",
           "Arizona Zervas","Trevor Daniel","Wallows","Curtis Waters","Mustard","NLE Choppa","Kendrick Lamar",
           "Drake","YoungBoy Never Broke Again","Marshmello","Joel Corry","Ritt Momney","Vedo",
           "XXXTENTACION","Justin Bieber","Sam Smith","Roddy Ricch","Ashnikko","Joji","Fleetwood Mac",
           "ZAYN","21 Savage","Shawn Mendes","Bryson Tiller","Clairo","Don Toliver","Miley Cyrus",
           "Trippie Redd","The Neighbourhood","Dua Lipa","CORPSE","Bebe Rexha","Lil Mosey","G-Eazy",
           "DDG","Sada Baby","James Arthur","Ariana Grande","Luke Combs","Jack Harlow","Ty Dolla $ign",
           "The Kid LAROI","DaBaby","Future","Saweetie","Harry Styles","Bobby Pickett",
            "Trippie Redd","Ray Parker Jr.","King Von","The Kid LAROI","NAV","Mariah Carey","CJ",
           "Brenda Lee","Billie Eilish","Lil Nas X","Bobby Helms","Wham!","Andy Williams","Michael Buble",
           "Kelly Clarkson","Billie Eilish","Dean Martin","The Ronettes","Frank Sinatra","José Feliciano",
           "Burl Ives","Ricky Montgomery","Nat King Cole","Paul McCartney","Bing Crosby","Perry Como",
           "Darlene Love","Elvis Presley","Chuck Berry","The Jackson 5","Brett Eldredge","John Lennon"
}

def find_songs(line):
    """Return the tracked song title mentioned in a tweet, or None.

    A title only counts when the tweet also contains the word 'song' or
    'music'. Matching is a case-insensitive substring test against every
    entry of ``song_name``.

    When several titles match, the longest one wins (ties broken
    alphabetically), so the result does not depend on set iteration order
    and a specific title beats a shorter one it contains.

    NOTE(review): substring matching means short titles such as "GO" or
    "Run" also match inside unrelated words; confirm that is acceptable.

    Args:
        line: tweet text; None or empty text yields None instead of raising.

    Returns:
        The matching title exactly as spelled in ``song_name``, or None.
    """
    if not line:
        return None
    lowered = line.lower()
    # The keyword requirement is the same for every title, so test it once.
    if 'song' not in lowered and 'music' not in lowered:
        return None
    matches = [name for name in song_name if name.lower() in lowered]
    if not matches:
        return None
    return max(matches, key=lambda name: (len(name), name))

def find_artist(line):
    """Return the tracked artist mentioned in a tweet, or None.

    Matching is a case-insensitive substring test against every entry of
    ``artists``. When several artists match, the longest name wins (ties
    broken alphabetically), so the result does not depend on set iteration
    order.

    NOTE(review): substring matching means short names such as "CJ" or
    "NAV" also match inside unrelated words; confirm that is acceptable.

    Args:
        line: tweet text; None or empty text yields None instead of raising.

    Returns:
        The matching artist exactly as spelled in ``artists``, or None.
    """
    if not line:
        return None
    lowered = line.lower()
    matches = [artist for artist in artists if artist.lower() in lowered]
    if not matches:
        return None
    return max(matches, key=lambda artist: (len(artist), artist))

In [5]:
## filter song related
# # song
# song_process = udf(find_songs, StringType())
# DF = DF.withColumn('song_name',song_process('text'))

# ## artist
# artist_process = udf(find_artist, StringType())
# DF = DF.withColumn('artist_name',artist_process('text'))

# ## SQL
# DF.createOrReplaceTempView("df")
# result = spark.sql("SELECT * FROM df WHERE ( song_name IS NOT NULL) OR (artist_name IS NOT NULL)")

## all language

In [6]:
# folder = "all_wk0"
# df0 = sqlContext.read.parquet(folder).select('created_at','id','lang')

In [7]:
# folder = "all_wk1"
# df1 = sqlContext.read.parquet(folder).select('created_at','id','lang')
# folder = "all_wk2"
# df2 = sqlContext.read.parquet(folder).select('created_at','id','lang')
# folder = "all_wk3"
# df3 = sqlContext.read.parquet(folder).select('created_at','id','lang')
# folder = "all_wk4"
# df4 = sqlContext.read.parquet(folder).select('created_at','id','lang')
# folder = "all_wk5"
# df5 = sqlContext.read.parquet(folder).select('created_at','id','lang')
# folder = "all_wk6"
# df6 = sqlContext.read.parquet(folder).select('created_at','id','lang')
# folder = "all_wk7"
# df7 = sqlContext.read.parquet(folder).select('created_at','id','lang')
# folder = "all_wk8"
# df8 = sqlContext.read.parquet(folder).select('created_at','id','lang')
# folder = "all_wk9"
# df9 = sqlContext.read.parquet(folder).select('created_at','id','lang')
# folder = "all_wk10"
# df10 = sqlContext.read.parquet(folder).select('created_at','id','lang')
# folder = "all_wk11"
# df11 = sqlContext.read.parquet(folder).select('created_at','id','lang')
# folder = "all_wk12"
# df12 = sqlContext.read.parquet(folder).select('created_at','id','lang')

In [8]:
# spotify, billboard in text?!

In [9]:
# Read the Spotify-related tweets from the parquet folder into DF.
# NOTE(review): `sqlContext` is never created in the visible cells; presumably the
# kernel/pyspark shell provides it. Confirm, or create it explicitly near the imports.
folder = 'all_spotify'
DF = sqlContext.read.parquet(folder)

In [10]:
# adding weeks for bins
## 
from pyspark.sql.functions import udf, array
from pyspark.sql.functions import col
from pyspark.sql.types import DoubleType,StringType,NumericType,IntegerType
from pyspark.sql.functions import countDistinct

# Three-letter month abbreviation (second token of `created_at`) -> month number.
_MONTH_NUMBERS = {
    'Jan': 1, 'Feb': 2, 'Mar': 3, 'Apr': 4, 'May': 5, 'Jun': 6,
    'Jul': 7, 'Aug': 8, 'Sep': 9, 'Oct': 10, 'Nov': 11, 'Dec': 12,
}


def find_month(date: str):
    """Extract the month number from a whitespace-separated timestamp string.

    The month is read from the second token as a three-letter English
    abbreviation (e.g. 'Sep'). All twelve months are recognised.

    Args:
        date: timestamp text such as 'Wed Sep 02 10:00:00 +0000 2020'.

    Returns:
        The month as an int in 1..12, or None when ``date`` is missing, has
        fewer than two tokens, or carries an unknown abbreviation.
    """
    if not date:
        return None
    parts = date.split()
    if len(parts) < 2:
        return None
    return _MONTH_NUMBERS.get(parts[1])

def find_day(date: str):
    """Extract the day of month from a whitespace-separated timestamp string.

    Args:
        date: timestamp text such as 'Wed Sep 02 10:00:00 +0000 2020'; the
            day is read from the third token.

    Returns:
        The day as an int, or None when ``date`` is missing, has fewer than
        three tokens, or the third token is not an integer.
    """
    if not date:
        return None
    parts = date.split()
    if len(parts) < 3:
        return None
    try:
        return int(parts[2])
    except ValueError:
        return None

# Spark UDF wrappers around the two parsers above; both yield integer columns.
split_month = udf(find_month, IntegerType())
split_day = udf(find_day, IntegerType())

In [11]:
# Derive calendar fields from `created_at`. `month_day` packs the date into one
# integer as month * 100 + day (e.g. month 9, day 4 -> 904).
D = (
    DF
    .withColumn('month', split_month('created_at'))
    .withColumn('day', split_day('created_at'))
    .withColumn('month_day', col('month') * 100 + col('day'))
)

In [7]:
# verify
# D.createOrReplaceTempView("df")
# result = spark.sql("SELECT month,day,month_day,COUNT(1) FROM df GROUP BY month,day,month_day HAVING month == 9  ")
# result.show(31)

## tweets mentioned only song_name vs only artist_name

## only en

In [12]:
df_en = D.filter(D['lang']=='en')
# df_en.count()

In [13]:
# adding weeks
from pyspark.ml.feature import Bucketizer

# Week boundaries expressed in the same month*100+day encoding as `month_day`
# (904 = Sep 4, 1002 = Oct 2, ..., 1203 = Dec 3). The leading 0 makes the first
# bucket catch everything before Sep 4.
# NOTE(review): with Bucketizer's default settings a month_day above the last
# split is presumably rejected rather than bucketed - confirm no tweet is dated
# after Dec 3, or widen the last split.
week_split = [0,904,911,918,925,1002,1009,1016,1023,1030,1106,1113,1120,1127,1203]
buck = Bucketizer(inputCol = 'month_day'   , splits = week_split, outputCol='weeks')
# All languages are bucketed here; the English filter happens in the SQL below.
dfbins = buck.transform(D)
# dfbins.select('created_at','weeks').show()
# dfbins.show(3,truncate= True)

# Tweet count per week, restricted to English via HAVING on the grouping column `lang`.
# NOTE(review): `spark` is not created in the visible cells; presumably the kernel provides it.
dfbins.createOrReplaceTempView("df")
result = spark.sql("SELECT weeks,COUNT(1) FROM df GROUP BY weeks,lang HAVING lang='en' ORDER BY weeks ")
result.show()

+-----+--------+
|weeks|count(1)|
+-----+--------+
|  0.0| 5718824|
|  1.0| 5112478|
|  2.0| 4612646|
|  3.0| 4336484|
|  4.0| 4742509|
|  5.0| 5686053|
|  6.0| 5512342|
|  7.0| 4237039|
|  8.0| 3962829|
|  9.0| 4749645|
| 10.0| 3988297|
| 11.0| 4555603|
| 12.0| 5584133|
| 13.0|  568291|
+-----+--------+



In [14]:
from pyspark.ml.feature import Bucketizer

# Same week boundaries and Bucketizer as the previous cell, redefined here so this
# cell can run on its own (values use the month*100+day encoding of `month_day`).
week_split = [0,904,911,918,925,1002,1009,1016,1023,1030,1106,1113,1120,1127,1203]
buck = Bucketizer(inputCol = 'month_day'   , splits = week_split, outputCol='weeks')

# Keep only English tweets, then attach the `weeks` bucket index to each row.
df_en = D.filter(D['lang']=='en')
dfbins_en = buck.transform(df_en)

In [18]:
dfbins_en.columns
ds = dfbins_en

In [30]:
dfbins_en.columns


['created_at',
 'id',
 'truncated',
 'lang',
 'user_id',
 'followers_count',
 'friends_count',
 'listed_count',
 'text',
 'hashtags',
 'display_url',
 'expanded_url',
 'url',
 'mentions_screen_name',
 'et_full_text',
 'et_hashtags',
 'et_display_url',
 'et_expanded_url',
 'et_url',
 'et_mentions_screen_name',
 'country',
 'country_code',
 'name',
 'place_type',
 'favorite_count',
 'reply_count',
 'retweet_count',
 'quote_count',
 'place',
 'song_name',
 'artist_name',
 'month',
 'day',
 'month_day',
 'weeks']

# avg length: 
> ### url, et_url, display_url, et_expanded_url, hashtags, et_hashtags
> ### avg followers_count, friends_count, listed_count
> ### truncated
> 

In [19]:
## calculate emoji count 
import re
from pyspark.sql.functions import udf, array
from pyspark.sql.functions import col
from pyspark.sql.types import DoubleType,StringType, ArrayType, FloatType

# Word pattern: runs of word characters and apostrophes. Compiled once, not per row.
_WORD_RE = re.compile(r"[\w']+")


def remove_stop(line):
    """Tokenise text, lowercase the tokens and drop stop words.

    NOTE(review): ``STOPWORDS`` is not defined in the visible cells; it must
    exist in the kernel (and be shipped to executors) before this runs.

    Args:
        line: text to tokenise; None or empty text yields an empty list.

    Returns:
        Lowercased tokens, in order of appearance, that are not in ``STOPWORDS``.
    """
    if not line:
        return []
    lowered_tokens = (token.lower() for token in _WORD_RE.findall(line))
    return [token for token in lowered_tokens if token not in STOPWORDS]

# Characters stripped before looking for emoji. One character class replaces the
# four sequential substitutions; removing a union of character sets in one pass
# leaves exactly the same text as removing each set in turn.
_NON_EMOJI_TEXT_RE = re.compile(
    u'[a-zA-Z0-9'               # ASCII letters and digits
    u'\u3131-\u3163'            # Hangul compatibility jamo
    u'\uac00-\ud7a3'            # Hangul syllables
    u'\u3040-\u309F\u30FC'      # Hiragana plus the prolonged sound mark
    u'\u30A0-\u30FF'            # Katakana
    u'\u4E00-\u9FFF'            # CJK unified ideographs (Kanji)
    u']+'
)

# NOTE(review): this range runs from U+263A to U+1F645. It therefore skips emoji
# above U+1F645 (e.g. U+1F680) and accepts non-emoji code points inside the range
# that are not stripped above. Kept as-is so existing counts stay comparable.
_EMOJI_RE = re.compile(r'([\u263a-\U0001f645])')


def find_emoji(line):
    """Return the emoji-range characters found in a piece of text.

    ASCII alphanumerics and Korean/Japanese/CJK script characters are removed
    first, then every remaining single character in U+263A..U+1F645 is collected.

    Args:
        line: text to scan; None or empty text yields an empty list.

    Returns:
        A list of one-character strings in order of appearance (repeats kept).
    """
    if not line:
        return []
    remaining = _NON_EMOJI_TEXT_RE.sub("", line)
    return _EMOJI_RE.findall(remaining)

# UDF
emoji = udf(find_emoji, ArrayType(StringType()))


In [27]:
from pyspark.sql.functions import sum as _sum
from pyspark.sql import functions as F
# tokenize
from pyspark.ml.feature import Tokenizer, RegexTokenizer
from pyspark.ml.feature import StopWordsRemover
from pyspark.sql.functions import col, udf
from pyspark.sql.types import IntegerType


def length(line):
    """Return ``len(line)``, or 0 when the value has no length.

    Null columns arrive as None, for which ``len`` raises TypeError. Only that
    error is treated as "length 0"; any other exception propagates instead of
    being silently swallowed.
    """
    try:
        return len(line)
    except TypeError:
        return 0

def find_truncated(line):
    """Map a tweet's `truncated` value to 1 when it equals True, otherwise 0."""
    # Equality with True (not general truthiness) is the test: None and
    # non-boolean values such as strings map to 0.
    return 1 if line == True else 0
    
def find_http_in_tweets(text:str):
    """Flag whether a tweet's text contains the substring 'http'.

    Args:
        text: tweet text; None or empty text counts as "no link".

    Returns:
        1 if 'http' occurs anywhere in ``text`` (so 'https' matches too), else 0.
    """
    if not text:
        return 0
    return 1 if 'http' in text else 0


    
# UDF wrappers around the plain-Python helpers defined in this cell.
find_counts = udf(length, IntegerType())
# The UDF gets its own name. Rebinding `find_truncated` itself would make a
# re-run of this cell wrap a UDF inside another UDF.
find_truncated_udf = udf(find_truncated, IntegerType())
find_http = udf(find_http_in_tweets, IntegerType())
# Word and character counts; a null input stays null so the later averages skip it.
countTokens = udf(lambda words: None if words is None else len(words), IntegerType())
countChars = udf(lambda text: None if text is None else len(text), IntegerType())


# Columns needed for the per-group statistics ('followers_count' listed once).
T = ds.select(['weeks','song_name','artist_name','url','et_url','display_url','et_expanded_url',
               'hashtags','et_hashtags','followers_count','friends_count','listed_count',
               'truncated','text'])

## Tokenize the text on non-word characters, then add word and character counts.
regexTokenizer = RegexTokenizer(inputCol="text", outputCol="words", pattern="\\W")
regexTokenized = regexTokenizer.transform(T)
T = (
    regexTokenized
    .withColumn("tokens", countTokens(col("words")))
    .withColumn("char_count", countChars(col("text")))
)

# Per-tweet features. The `with_*` / `*_count` columns built with `find_counts`
# hold the number of entries in the source column (0 when it is null).
# NOTE(review): `sentiment_analysis_udf` is not defined in the visible cells; it
# must already exist in the kernel for this cell to run.
T = (
    T
    .withColumn('with_url', find_counts(col('url')))
    .withColumn('with_et_url', find_counts(col('et_url')))
    .withColumn('with_display_url', find_counts(col('display_url')))
    .withColumn('with_expanded_url', find_counts(col('et_expanded_url')))
    .withColumn('hashtag_count', find_counts(col('hashtags')))
    .withColumn('et_#_count', find_counts(col('et_hashtags')))
    .withColumn("with_truncate", find_truncated_udf(col('truncated')))
    .withColumn("with_http_tw", find_http(col('text')))
    .withColumn('senti_score', sentiment_analysis_udf('text'))
    .withColumn('emojis_token', find_counts(emoji('text')))
)




In [29]:
# aggregate
# One output row per (week, song, artist): each triple below is
# (aggregate function, source column in T, name of the output column).
df = T.groupby(['weeks','song_name','artist_name']).agg(*[
    aggregate(source).alias(output)
    for aggregate, source, output in [
        (F.mean, 'with_url', 'url_avg'),
        (F.mean, 'with_et_url', 'et_url_avg'),
        (F.mean, 'with_display_url', 'display_url_avg'),
        (F.mean, 'hashtag_count', 'hashtag_avg'),
        (F.mean, 'et_#_count', 'et_hashtag_avg'),
        (F.mean, 'followers_count', 'avg_followers'),
        (F.mean, 'friends_count', 'avg_friends'),
        (F.mean, 'listed_count', 'avg_listed'),
        (_sum, 'with_truncate', 'sum_trunc'),
        (_sum, 'with_http_tw', 'sum_http'),
        (F.mean, 'tokens', 'avg_tokens'),
        (F.mean, 'char_count', 'avg_char'),
        (F.mean, 'senti_score', 'avg_senti'),
        (F.mean, 'emojis_token', 'avg_emoji_count'),
        (F.count, 'text', 'tweets_count'),
    ]
])
df.show()

+-----+--------------------+--------------+--------------------+--------------------+--------------------+--------------------+--------------------+------------------+------------------+------------------+---------+--------+------------------+------------------+--------------------+-------------------+------------+
|weeks|           song_name|   artist_name|             url_avg|          et_url_avg|     display_url_avg|         hashtag_avg|      et_hashtag_avg|     avg_followers|       avg_friends|        avg_listed|sum_trunc|sum_http|        avg_tokens|          avg_char|           avg_senti|    avg_emoji_count|tweets_count|
+-----+--------------------+--------------+--------------------+--------------------+--------------------+--------------------+--------------------+------------------+------------------+------------------+---------+--------+------------------+------------------+--------------------+-------------------+------------+
|  5.0|                  Ew| Bryson Tiller|0.0370

In [31]:
# df.select('avg_emoji_count').sort('avg_emoji_count',ascending=False).show()
df.count() #15284

15284

In [32]:
import pandas as pd

df.toPandas().to_csv('/home/wusean/spotify_0410.csv')

# Count unique user_id

In [20]:
# Tweets per user within each (week, song, artist) group, most active first.
user = ds.groupby(['weeks','song_name','artist_name','user_id']).count().sort('count',ascending=False)
# Show the DataFrame just built instead of constructing the identical query a second time.
user.show(10)

+-----+---------+-------------+-------------------+-----+
|weeks|song_name|  artist_name|            user_id|count|
+-----+---------+-------------+-------------------+-----+
| 12.0|     null| Taylor Swift| 985916593065476096| 1548|
| 12.0|       GO|         null|           24566116| 1358|
|  0.0|       GO|         null|           74580436| 1336|
| 11.0|       GO|         null|           24566116| 1296|
|  2.0|     null|          BTS|1069692236982415361| 1142|
| 12.0|     null|          NAV|            7532872|  924|
|  6.0|     null|       Future|1123949703232724993|  894|
| 10.0|     null|Justin Bieber|         3167508591|  869|
|  2.0|     null|          NAV|1238467680773894146|  813|
| 10.0|     null|Justin Bieber|1321709362583392257|  806|
+-----+---------+-------------+-------------------+-----+
only showing top 10 rows



In [None]:
# save to csv
user.toPandas().to_csv('/home/wusean/spotify_unique_user_0410.csv')


# tf-idf word counts (including stopwords)

In [11]:
# tokenize
from pyspark.ml.feature import Tokenizer, RegexTokenizer
from pyspark.ml.feature import StopWordsRemover
from pyspark.sql.functions import col, udf
from pyspark.sql.types import IntegerType


# Tokenizer: reads "text", writes "words", using the regex "\\W" (non-word characters).
regexTokenizer = RegexTokenizer(inputCol="text", outputCol="words", pattern="\\W")
# alternatively, pattern="\\w+", gaps(False)

# Per-row sizes: number of tokens, and number of characters in the text (spaces included).
countTokens = udf(lambda words: len(words), IntegerType())
countChars = udf(lambda text: len(text), IntegerType())

regexTokenized = regexTokenizer.transform(dfbins_en)

# Stop-word removal reads "words" and writes "filtered". It is applied to the
# frame that already carries the token and character counts.
remover = StopWordsRemover(inputCol="words", outputCol="filtered")
df_words = remover.transform(
    regexTokenized
    .select("weeks","song_name","artist_name","text", "words")
    .withColumn("tokens", countTokens(col("words")))
    .withColumn("char_count", countChars(col("text")))
)

df_words.select('weeks',"song_name","artist_name", "words",'tokens','char_count','filtered').show(truncate=True)

+-----+---------+-----------+--------------------+------+----------+--------------------+
|weeks|song_name|artist_name|               words|tokens|char_count|            filtered|
+-----+---------+-----------+--------------------+------+----------+--------------------+
|  0.0|     null|        BTS|[alright, bts, le...|     7|        29|[alright, bts, le...|
|  0.0|     null|        BTS|[best, pop, i, m,...|     8|        44|[best, pop, m, re...|
|  0.0|     null|        BTS|[rt, mtv, another...|    21|       118|[rt, mtv, another...|
|  0.0|     null|        BTS|[bts, doesn, t, k...|    18|        90|[bts, doesn, know...|
|  0.0|     null|        BTS|[rt, ebonieseok, ...|    23|       119|[rt, ebonieseok, ...|
|  0.0|     null|        BTS|[rt, mtv, another...|    21|       118|[rt, mtv, another...|
|  0.0|     null|        BTS|    [best, pop, bts]|     3|        12|    [best, pop, bts]|
|  0.0|     null|        SZA|[i, have, joined,...|    10|        54|[joined, black, h...|
|  0.0|   

In [15]:
# average tweet length
from pyspark.sql import functions as F


# Mean word count and mean character count per (week, song, artist),
# ordered from the longest average tweet (in tokens) downwards.
avg_words = (
    df_words
    .groupBy(['weeks','song_name','artist_name'])
    .agg(F.mean('tokens'), F.mean('char_count'))
    .sort('avg(tokens)', ascending=False)
)
avg_words.show()

+-----+-------------+-------------+-----------------+------------------+
|weeks|    song_name|  artist_name|      avg(tokens)|   avg(char_count)|
+-----+-------------+-------------+-----------------+------------------+
|  1.0|          Run|  Miley Cyrus|             33.0|             136.0|
|  8.0|         Stay| Mariah Carey|             33.0|             144.0|
|  3.0|           GO|   THE SCOTTS|             33.0|             158.0|
|  2.0|           GO|   THE SCOTTS|32.16230366492147|155.38743455497382|
|  2.0|         Flex|       Clairo|             32.0|             138.0|
| 11.0| Be Like That|       Future|             32.0|             142.0|
| 12.0|         Holy|    SAINt JHN|             32.0|             126.0|
|  8.0|       Wonder|    Sam Smith|             31.0|             140.0|
|  4.0|       lovely|Morgan Wallen|             31.0|             140.0|
|  6.0|    That's It|  Miley Cyrus|             31.0|             140.0|
|  8.0|    death bed|         null|             31.

# group weeks, song, artist and combine words

In [16]:
from pyspark.sql.functions import collect_list

# grouped_df = df_words.groupby(['weeks','song_name','artist_name']
#                           ).agg(collect_list('filtered').alias("grouped_filtered"))

In [17]:
# grouped_df.select('weeks',"song_name","artist_name", 'grouped_filtered').show(truncate=True)


In [18]:
# from pyspark.sql.functions import concat_ws

# temp = df_words.select('weeks',"song_name","artist_name", "words",'tokens','char_count','filtered')#.show(truncate=True)
# temp.withColumn("grouped_filtered", concat_ws(", ", "filtered")).show()

In [19]:
from pyspark.sql.functions import split, explode

# T = temp.withColumn("explode_word",explode('filtered'))
# T.select('weeks',"song_name","artist_name", 'filtered','explode_word').show()

In [20]:
from pyspark.sql.functions import collect_list

# group_df = T.groupby(['weeks','song_name','artist_name']
#                           ).agg(collect_list('explode_word').alias("group_wd"))

# group = group_df.select('weeks',"song_name","artist_name","group_wd")
# group.filter(col('weeks')==9).show()

In [21]:
count_wd = udf(lambda line: len(line), IntegerType())

# group.select("weeks","song_name","artist_name", count_wd(col("group_wd"))).show()


# TF-IDF

In [24]:
## TF-IDF
from pyspark.ml.feature import HashingTF, IDF, Tokenizer


# Term frequencies per tweet: hash each stop-word-filtered token into a fixed-size vector.
# NOTE(review): numFeatures=20 gives only 20 hash buckets, so many different words
# will presumably share an index; confirm this size is intended for real tweet text.
hashingTF = HashingTF(inputCol="filtered", outputCol="rawFeatures", numFeatures=20)
featurizedData = hashingTF.transform(df_words)
# alternatively, CountVectorizer can also be used to get term frequency vectors

# Fit IDF weights on the whole frame, then rescale the raw term frequencies with them.
idf = IDF(inputCol="rawFeatures", outputCol="features")
idfModel = idf.fit(featurizedData)
rescaledData = idfModel.transform(featurizedData)

In [25]:
rescaledData.select('weeks','song_name','artist_name','features').show()

+-----+---------+-----------+--------------------+
|weeks|song_name|artist_name|            features|
+-----+---------+-----------+--------------------+
|  0.0|     null|        BTS|(20,[14,15,17,19]...|
|  0.0|     null|        BTS|(20,[2,3,4,5,14,1...|
|  0.0|     null|        BTS|(20,[3,5,7,8,13,1...|
|  0.0|     null|        BTS|(20,[0,6,13,14,15...|
|  0.0|     null|        BTS|(20,[0,4,10,13,14...|
|  0.0|     null|        BTS|(20,[3,5,7,8,13,1...|
|  0.0|     null|        BTS|(20,[3,14,19],[0....|
|  0.0|     null|        SZA|(20,[4,9,10,11,18...|
|  0.0|     null|        BTS|(20,[2,6,9,11,12,...|
|  0.0|     null|  Lady Gaga|(20,[1,4,5,6,11,1...|
|  0.0|     null|        BTS|(20,[3,17,18,19],...|
|  0.0|     null|        BTS|(20,[2,3,19],[0.9...|
|  0.0|     null|        BTS|(20,[4,5,7,11,13]...|
|  0.0|     null|        BTS|(20,[1,2,3,5,6,9,...|
|  0.0|     null|        BTS|(20,[2,3,4,6,8,10...|
|  0.0|     null|        BTS|(20,[0,1,6,9,10,1...|
|  0.0|     null|        BTS|(2

# group

In [16]:
## TF-IDF


from pyspark.ml.feature import HashingTF, IDF, Tokenizer

# Rebuild the per-group word lists here. The cells that originally produced
# `group_df` are commented out, so on a fresh kernel the name would be undefined.
# Steps mirror that commented-out code: one row per filtered word, then all words
# of a (week, song, artist) group collected into a single list.
group_df = (
    df_words
    .withColumn("explode_word", explode("filtered"))
    .groupby(['weeks','song_name','artist_name'])
    .agg(collect_list("explode_word").alias("group_wd"))
)
group = group_df.select('weeks',"song_name","artist_name","group_wd")
# group = group.filter(col('weeks')==9)   # optional: restrict to a single week

# NOTE(review): numFeatures=20 gives only 20 hash buckets for the whole vocabulary;
# confirm this size is intended.
hashingTF = HashingTF(inputCol="group_wd", outputCol="rawFeatures", numFeatures=20)
featurizedData = hashingTF.transform(group)

# IDF is fitted with each (week, song, artist) group acting as one document.
idf = IDF(inputCol="rawFeatures", outputCol="features")
idfModel = idf.fit(featurizedData)
rescaledData = idfModel.transform(featurizedData)

In [19]:
rescaledData.show()

+-----+-----------------+-------------------+--------------------+--------------------+--------------------+
|weeks|        song_name|        artist_name|            group_wd|         rawFeatures|            features|
+-----+-----------------+-------------------+--------------------+--------------------+--------------------+
|  9.0|             null|         Marshmello|[liked, spotify, ...|(20,[0,1,2,3,4,5,...|(20,[0,1,2,3,4,5,...|
|  9.0|           Slidin|               null|[rt, colesoulpodc...|(20,[0,1,2,3,4,5,...|(20,[0,1,2,3,4,5,...|
|  9.0|        Sunflower|       Harry Styles|[rt, thehscharts,...|(20,[2,5,10,11,13...|(20,[2,5,10,11,13...|
|  9.0|             null|          Burl Ives|[rt, southern_liv...|(20,[0,1,2,3,4,5,...|(20,[0,1,2,3,4,5,...|
|  9.0|              Run|         Marshmello|[marshmellomusic,...|(20,[1,2,3,4,5,7,...|(20,[1,2,3,4,5,7,...|
|  9.0|           Better|                DDG|[baddgrl_, harkso...|(20,[1,2,4,10,11,...|(20,[1,2,4,10,11,...|
|  9.0|      WITHOU

In [23]:
import pandas as pd


# S = rescaledData.toPandas()
# S.to_csv('/home/wusean/spotify_tfidf.csv')

In [None]:
# df.write.csv('mycsv.csv')


## Vectorize

In [26]:
from pyspark.ml.feature import CountVectorizer

# Learn a vocabulary from the stop-word-filtered tokens of every tweet, then
# turn each tweet's tokens into a count vector stored in "vec_features".
cv = CountVectorizer(inputCol="filtered", outputCol="vec_features")
model = cv.fit(df_words)
countVectorizer_feateures = model.transform(df_words)

In [23]:
countVectorizer_feateures.select('weeks','song_name','artist_name','vec_features').show()

+-----+---------+-----------+--------------------+
|weeks|song_name|artist_name|            features|
+-----+---------+-----------+--------------------+
|  0.0|     null|        BTS|(262144,[4,42,156...|
|  0.0|     null|        BTS|(262144,[3,4,9,32...|
|  0.0|     null|        BTS|(262144,[0,1,2,3,...|
|  0.0|     null|        BTS|(262144,[1,2,4,55...|
|  0.0|     null|        BTS|(262144,[0,4,101,...|
|  0.0|     null|        BTS|(262144,[0,1,2,3,...|
|  0.0|     null|        BTS|(262144,[4,32,37]...|
|  0.0|     null|        SZA|(262144,[1,2,272,...|
|  0.0|     null|        BTS|(262144,[0,1,2,18...|
|  0.0|     null|  Lady Gaga|(262144,[110,342,...|
|  0.0|     null|        BTS|(262144,[32,37,72...|
|  0.0|     null|        BTS|(262144,[32,37,54...|
|  0.0|     null|        BTS|(262144,[0,3,177,...|
|  0.0|     null|        BTS|(262144,[1,2,3,4,...|
|  0.0|     null|        BTS|(262144,[0,128,29...|
|  0.0|     null|        BTS|(262144,[1,2,23,1...|
|  0.0|     null|        BTS|  

In [47]:
from pyspark.ml.feature import CountVectorizer

# Same vectorization as above, but on the "group_wd" column of `group`.
# NOTE(review): the recorded run of this cell failed inside cv.fit() with
# "Executor heartbeat timed out" (see the traceback below), so `model` and
# `countVectorizer_feateures` were not reassigned by that run.
cv = CountVectorizer(inputCol="group_wd", outputCol="vec_features")
model = cv.fit(group)
countVectorizer_feateures = model.transform(group)

Py4JJavaError: An error occurred while calling o702.fit.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 143 in stage 40.0 failed 4 times, most recent failure: Lost task 143.3 in stage 40.0 (TID 20045, cavium-dn0021.arc-ts.umich.edu, executor 186): ExecutorLostFailure (executor 186 exited caused by one of the running tasks) Reason: Executor heartbeat timed out after 121438 ms
Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1517)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1505)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1504)
	at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1504)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:814)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:814)
	at scala.Option.foreach(Option.scala:257)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:814)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1732)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1687)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1676)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:630)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2029)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2050)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2069)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2094)
	at org.apache.spark.rdd.RDD.count(RDD.scala:1158)
	at org.apache.spark.ml.feature.CountVectorizer.fit(CountVectorizer.scala:176)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:280)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:214)
	at java.lang.Thread.run(Thread.java:748)


In [None]:
# Preview the vectorized rows (this cell has no recorded execution).
countVectorizer_feateures.select('weeks','song_name','artist_name','vec_features').show()

In [None]:
# TF-idf
# https://spark.apache.org/docs/2.2.0/ml-features.html#tf-idf


# Using VADER for sentiment analysis

In [21]:
## Vader sentiment calculation
## ref: https://github.com/cjhutto/vaderSentiment

# coding: utf-8
# Author: C.J. Hutto
# Thanks to George Berry for reducing the time complexity from something like O(N^4) to O(N).
# Thanks to Ewan Klein and Pierpaolo Pantone for bringing VADER into NLTK. Those modifications were awesome.
# For license information, see LICENSE.TXT

"""
If you use the VADER sentiment analysis tools, please cite:
Hutto, C.J. & Gilbert, E.E. (2014). VADER: A Parsimonious Rule-based Model for
Sentiment Analysis of Social Media Text. Eighth International Conference on
Weblogs and Social Media (ICWSM-14). Ann Arbor, MI, June 2014.
"""
import os
import re
import math
import string
import codecs
import json
from itertools import product
from inspect import getsourcefile
from io import open

# ##Constants##

# (empirically derived mean sentiment intensity rating increase for booster words)
B_INCR = 0.293
B_DECR = -0.293

# (empirically derived mean sentiment intensity rating increase for using ALLCAPs to emphasize a word)
C_INCR = 0.733
# factor a valence is multiplied by when the analyzer detects a negation
# (used by _negation_check, _least_check and the "no" handling in sentiment_valence)
N_SCALAR = -0.74

# tokens that negated() treats as negation words (with and without apostrophes)
NEGATE = \
    ["aint", "arent", "cannot", "cant", "couldnt", "darent", "didnt", "doesnt",
     "ain't", "aren't", "can't", "couldn't", "daren't", "didn't", "doesn't",
     "dont", "hadnt", "hasnt", "havent", "isnt", "mightnt", "mustnt", "neither",
     "don't", "hadn't", "hasn't", "haven't", "isn't", "mightn't", "mustn't",
     "neednt", "needn't", "never", "none", "nope", "nor", "not", "nothing", "nowhere",
     "oughtnt", "shant", "shouldnt", "uhuh", "wasnt", "werent",
     "oughtn't", "shan't", "shouldn't", "uh-uh", "wasn't", "weren't",
     "without", "wont", "wouldnt", "won't", "wouldn't", "rarely", "seldom", "despite"]

# booster/dampener 'intensifiers' or 'degree adverbs'
# http://en.wiktionary.org/wiki/Category:English_degree_adverbs
# Maps a lowercase word (or multi-word phrase) to B_INCR (booster) or B_DECR (dampener).

BOOSTER_DICT = \
    {"absolutely": B_INCR, "amazingly": B_INCR, "awfully": B_INCR, 
     "completely": B_INCR, "considerable": B_INCR, "considerably": B_INCR,
     "decidedly": B_INCR, "deeply": B_INCR, "effing": B_INCR, "enormous": B_INCR, "enormously": B_INCR,
     "entirely": B_INCR, "especially": B_INCR, "exceptional": B_INCR, "exceptionally": B_INCR, 
     "extreme": B_INCR, "extremely": B_INCR,
     "fabulously": B_INCR, "flipping": B_INCR, "flippin": B_INCR, "frackin": B_INCR, "fracking": B_INCR,
     "fricking": B_INCR, "frickin": B_INCR, "frigging": B_INCR, "friggin": B_INCR, "fully": B_INCR, 
     "fuckin": B_INCR, "fucking": B_INCR, "fuggin": B_INCR, "fugging": B_INCR,
     "greatly": B_INCR, "hella": B_INCR, "highly": B_INCR, "hugely": B_INCR, 
     "incredible": B_INCR, "incredibly": B_INCR, "intensely": B_INCR, 
     "major": B_INCR, "majorly": B_INCR, "more": B_INCR, "most": B_INCR, "particularly": B_INCR,
     "purely": B_INCR, "quite": B_INCR, "really": B_INCR, "remarkably": B_INCR,
     "so": B_INCR, "substantially": B_INCR,
     "thoroughly": B_INCR, "total": B_INCR, "totally": B_INCR, "tremendous": B_INCR, "tremendously": B_INCR,
     "uber": B_INCR, "unbelievably": B_INCR, "unusually": B_INCR, "utter": B_INCR, "utterly": B_INCR,
     "very": B_INCR,
     "almost": B_DECR, "barely": B_DECR, "hardly": B_DECR, "just enough": B_DECR,
     "kind of": B_DECR, "kinda": B_DECR, "kindof": B_DECR, "kind-of": B_DECR,
     "less": B_DECR, "little": B_DECR, "marginal": B_DECR, "marginally": B_DECR,
     "occasional": B_DECR, "occasionally": B_DECR, "partly": B_DECR,
     "scarce": B_DECR, "scarcely": B_DECR, "slight": B_DECR, "slightly": B_DECR, "somewhat": B_DECR,
     "sort of": B_DECR, "sorta": B_DECR, "sortof": B_DECR, "sort-of": B_DECR}

# check for sentiment laden idioms that do not contain lexicon words (future work, not yet implemented)
# Maps an idiom to its valence; only read by _sentiment_laden_idioms_check.
SENTIMENT_LADEN_IDIOMS = {"cut the mustard": 2, "hand to mouth": -2,
                          "back handed": -2, "blow smoke": -2, "blowing smoke": -2,
                          "upper hand": 1, "break a leg": 2,
                          "cooking with gas": 2, "in the black": 2, "in the red": -2,
                          "on the ball": 2, "under the weather": -2}

# check for special case idioms and phrases containing lexicon words
# Maps a lowercase phrase to the valence that replaces the word-level valence
# when _special_idioms_check finds the phrase around the current token.
SPECIAL_CASES = {"the shit": 3, "the bomb": 3, "bad ass": 1.5, "badass": 1.5, "bus stop": 0.0,
                 "yeah right": -2, "kiss of death": -1.5, "to die for": 3, 
                 "beating heart": 3.1, "broken heart": -2.9 }


# #Static methods# #

def negated(input_words, include_nt=True):
    """
    Tell whether any of the given words is a negation.

    :param input_words: iterable of tokens; each one is compared as
        ``str(token).lower()``
    :param include_nt: when truthy, a token containing "n't" also counts
    :returns: `True` if a NEGATE word (or an "n't" token) is present, else `False`
    """
    lowered = [str(token).lower() for token in input_words]
    if any(neg_word in lowered for neg_word in NEGATE):
        return True
    if include_nt and any("n't" in token for token in lowered):
        return True
    # A check on "least" (negating unless preceded by "at") used to be sketched
    # here but was disabled; that case is handled by the analyzer's _least_check.
    return False


def normalize(score, alpha=15):
    """
    Squash a raw score into the range [-1, 1].

    :param score: the summed valence to normalize
    :param alpha: approximates the maximum expected value of `score`
    :returns: ``score / sqrt(score**2 + alpha)``, clamped to [-1.0, 1.0]
    """
    ratio = score / math.sqrt(alpha + (score * score))
    if ratio > 1.0:
        return 1.0
    if ratio < -1.0:
        return -1.0
    return ratio


def allcap_differential(words):
    """
    Check whether just some words in the input are ALL CAPS
    :param list words: The words to inspect
    :returns: `True` if some but not all items in `words` are ALL CAPS
    """
    upper_total = sum(1 for token in words if token.isupper())
    # "some but not all": at least one ALL CAPS word and at least one that is not
    return 0 < upper_total < len(words)


def scalar_inc_dec(word, valence, is_cap_diff):
    """
    Compute how much a preceding booster/dampener word shifts a valence.

    :param str word: the preceding word to look up in BOOSTER_DICT
    :param valence: valence of the word being modified (only its sign is used)
    :param is_cap_diff: whether the text mixes ALL CAPS and non-caps words
    :returns: the adjustment to add to `valence`; 0.0 when `word` is not a booster
    """
    key = word.lower()
    if key not in BOOSTER_DICT:
        return 0.0

    adjustment = BOOSTER_DICT[key]
    # a booster pushes a negative valence further down, so flip its sign
    if valence < 0:
        adjustment *= -1
    # extra emphasis when the booster itself is in ALLCAPS (while others aren't)
    if word.isupper() and is_cap_diff:
        adjustment += C_INCR if valence > 0 else -C_INCR
    return adjustment


class SentiText(object):
    """
    Identify sentiment-relevant string-level properties of input text.

    Attributes set by the constructor:
      text                 the input as a string
      words_and_emoticons  whitespace-separated tokens with surrounding
                           punctuation stripped (see `_strip_punc_if_word`)
      is_cap_diff          result of `allcap_differential` on those tokens
    """

    def __init__(self, text):
        """
        :param text: the text to analyse; ``bytes`` are decoded as UTF-8 and
            any other non-string value is converted with ``str()``
        """
        # Non-string input must end up as str, never bytes: bytes tokens would
        # make `_strip_punc_if_word` fail, because bytes.strip() rejects the
        # str argument `string.punctuation`.
        if isinstance(text, bytes):
            text = text.decode('utf-8')
        elif not isinstance(text, str):
            text = str(text)
        self.text = text
        self.words_and_emoticons = self._words_and_emoticons()
        # doesn't separate words from
        # adjacent punctuation (keeps emoticons & contractions)
        self.is_cap_diff = allcap_differential(self.words_and_emoticons)

    @staticmethod
    def _strip_punc_if_word(token):
        """
        Removes all trailing and leading punctuation
        If the resulting string has two or fewer characters,
        then it was likely an emoticon, so return original string
        (ie ":)" stripped would be "", so just return ":)"
        """
        stripped = token.strip(string.punctuation)
        if len(stripped) <= 2:
            return token
        return stripped

    def _words_and_emoticons(self):
        """
        Removes leading and trailing punctuation
        Leaves contractions and most emoticons
            Does not preserve punc-plus-letter emoticons (e.g. :D)

        :returns: list of tokens obtained by splitting `self.text` on whitespace
        """
        wes = self.text.split()
        stripped = list(map(self._strip_punc_if_word, wes))
        return stripped

class SentimentIntensityAnalyzer_1(object):
    """
    Give a sentiment intensity score to sentences.

    The lexicon and emoji dictionaries are passed in already parsed, so
    constructing an analyzer does no file I/O.
    """

    def __init__(self, lexicon_dict, emoji_dict):
        """
        :param dict lexicon_dict: maps a lowercase token to its valence
        :param dict emoji_dict: maps an emoji character to its textual description
        """
        self.lexicon = lexicon_dict
        self.emojis = emoji_dict

    def polarity_scores(self, text):
        """
        Score the sentiment of `text`.

        :param str text: the sentence to score
        :returns: dict with the keys "neg", "neu", "pos" and "compound"
            (see `score_valence`); positive "compound" values are positive
            valence, negative values are negative valence
        """
        # convert emojis to their textual descriptions
        pieces = []
        prev_space = True
        for char in text:
            if char in self.emojis:
                # separate the description from the preceding text unless a
                # space (or the start of the text) already does
                if not prev_space:
                    pieces.append(' ')
                pieces.append(self.emojis[char])
                prev_space = False
            else:
                pieces.append(char)
                prev_space = char == ' '
        text = ''.join(pieces).strip()

        sentitext = SentiText(text)

        sentiments = []
        words_and_emoticons = sentitext.words_and_emoticons
        for i, item in enumerate(words_and_emoticons):
            valence = 0
            # check for vader_lexicon words that may be used as modifiers or negations
            if item.lower() in BOOSTER_DICT:
                sentiments.append(valence)
                continue
            if (i < len(words_and_emoticons) - 1 and item.lower() == "kind" and
                    words_and_emoticons[i + 1].lower() == "of"):
                sentiments.append(valence)
                continue

            sentiments = self.sentiment_valence(valence, sentitext, item, i, sentiments)

        sentiments = self._but_check(words_and_emoticons, sentiments)

        valence_dict = self.score_valence(sentiments, text)

        return valence_dict

    def sentiment_valence(self, valence, sentitext, item, i, sentiments):
        """
        Compute the valence of the token `item` at position `i`, taking the
        up-to-three preceding tokens into account, and append it to `sentiments`.

        :param valence: starting valence, kept as-is when `item` is not in the lexicon
        :param SentiText sentitext: tokenized text that `item` belongs to
        :param str item: the token being scored
        :param int i: index of `item` in `sentitext.words_and_emoticons`
        :param list sentiments: valences collected so far (appended to in place)
        :returns: `sentiments`, with one more entry
        """
        is_cap_diff = sentitext.is_cap_diff
        words_and_emoticons = sentitext.words_and_emoticons
        item_lowercase = item.lower()
        if item_lowercase in self.lexicon:
            # get the sentiment valence
            valence = self.lexicon[item_lowercase]

            # check for "no" as negation for an adjacent lexicon item vs "no" as its own stand-alone lexicon item
            if item_lowercase == "no" and i != len(words_and_emoticons)-1 and words_and_emoticons[i + 1].lower() in self.lexicon:
                # don't use valence of "no" as a lexicon item. Instead set it's valence to 0.0 and negate the next item
                valence = 0.0
            if (i > 0 and words_and_emoticons[i - 1].lower() == "no") \
               or (i > 1 and words_and_emoticons[i - 2].lower() == "no") \
               or (i > 2 and words_and_emoticons[i - 3].lower() == "no" and words_and_emoticons[i - 1].lower() in ["or", "nor"] ):
                valence = self.lexicon[item_lowercase] * N_SCALAR

            # check if sentiment laden word is in ALL CAPS (while others aren't)
            if item.isupper() and is_cap_diff:
                if valence > 0:
                    valence += C_INCR
                else:
                    valence -= C_INCR

            for start_i in range(0, 3):
                # dampen the scalar modifier of preceding words and emoticons
                # (excluding the ones that immediately preceed the item) based
                # on their distance from the current item.
                if i > start_i and words_and_emoticons[i - (start_i + 1)].lower() not in self.lexicon:
                    s = scalar_inc_dec(words_and_emoticons[i - (start_i + 1)], valence, is_cap_diff)
                    if start_i == 1 and s != 0:
                        s = s * 0.95
                    if start_i == 2 and s != 0:
                        s = s * 0.9
                    valence = valence + s
                    valence = self._negation_check(valence, words_and_emoticons, start_i, i)
                    if start_i == 2:
                        valence = self._special_idioms_check(valence, words_and_emoticons, i)

            valence = self._least_check(valence, words_and_emoticons, i)
        sentiments.append(valence)
        return sentiments

    def _least_check(self, valence, words_and_emoticons, i):
        """
        Negate `valence` when the token before position `i` is "least", unless
        that "least" is itself preceded by "at" or "very".
        """
        # check for negation case using "least"
        if i > 1 and words_and_emoticons[i - 1].lower() not in self.lexicon \
                and words_and_emoticons[i - 1].lower() == "least":
            if words_and_emoticons[i - 2].lower() != "at" and words_and_emoticons[i - 2].lower() != "very":
                valence = valence * N_SCALAR
        elif i > 0 and words_and_emoticons[i - 1].lower() not in self.lexicon \
                and words_and_emoticons[i - 1].lower() == "least":
            valence = valence * N_SCALAR
        return valence

    @staticmethod
    def _but_check(words_and_emoticons, sentiments):
        """
        Shift emphasis around the first contrastive conjunction 'but':
        valences before it are halved, valences after it are multiplied by 1.5.

        :param list words_and_emoticons: the tokens of the text
        :param list sentiments: one valence per token (modified in place)
        :returns: `sentiments`
        """
        # check for modification in sentiment due to contrastive conjunction 'but'
        words_and_emoticons_lower = [str(w).lower() for w in words_and_emoticons]
        if 'but' in words_and_emoticons_lower:
            bi = words_and_emoticons_lower.index('but')
            # Scale by position. Looking the position up with
            # sentiments.index(value) would hit the first equal value, so with
            # repeated valences one entry would be scaled twice and another
            # not at all.
            for si, sentiment in enumerate(sentiments):
                if si < bi:
                    sentiments[si] = sentiment * 0.5
                elif si > bi:
                    sentiments[si] = sentiment * 1.5
        return sentiments

    @staticmethod
    def _special_idioms_check(valence, words_and_emoticons, i):
        """
        Replace `valence` when the tokens around position `i` form a
        SPECIAL_CASES phrase, then add the BOOSTER_DICT value of any
        multi-word booster found in the three preceding tokens.

        Only called from `sentiment_valence` with `i > 2`, so the `i - 3`
        look-behind stays inside the token list.
        """
        words_and_emoticons_lower = [str(w).lower() for w in words_and_emoticons]
        onezero = "{0} {1}".format(words_and_emoticons_lower[i - 1], words_and_emoticons_lower[i])

        twoonezero = "{0} {1} {2}".format(words_and_emoticons_lower[i - 2],
                                          words_and_emoticons_lower[i - 1], words_and_emoticons_lower[i])

        twoone = "{0} {1}".format(words_and_emoticons_lower[i - 2], words_and_emoticons_lower[i - 1])

        threetwoone = "{0} {1} {2}".format(words_and_emoticons_lower[i - 3],
                                           words_and_emoticons_lower[i - 2], words_and_emoticons_lower[i - 1])

        threetwo = "{0} {1}".format(words_and_emoticons_lower[i - 3], words_and_emoticons_lower[i - 2])

        sequences = [onezero, twoonezero, twoone, threetwoone, threetwo]

        # the first matching look-behind phrase wins
        for seq in sequences:
            if seq in SPECIAL_CASES:
                valence = SPECIAL_CASES[seq]
                break

        # look-ahead phrases starting at the current token override the above
        if len(words_and_emoticons_lower) - 1 > i:
            zeroone = "{0} {1}".format(words_and_emoticons_lower[i], words_and_emoticons_lower[i + 1])
            if zeroone in SPECIAL_CASES:
                valence = SPECIAL_CASES[zeroone]
        if len(words_and_emoticons_lower) - 1 > i + 1:
            zeroonetwo = "{0} {1} {2}".format(words_and_emoticons_lower[i], words_and_emoticons_lower[i + 1],
                                              words_and_emoticons_lower[i + 2])
            if zeroonetwo in SPECIAL_CASES:
                valence = SPECIAL_CASES[zeroonetwo]

        # check for booster/dampener bi-grams such as 'sort of' or 'kind of'
        n_grams = [threetwoone, threetwo, twoone]
        for n_gram in n_grams:
            if n_gram in BOOSTER_DICT:
                valence = valence + BOOSTER_DICT[n_gram]
        return valence

    @staticmethod
    def _sentiment_laden_idioms_check(valence, senti_text_lower):
        """
        Return the mean valence of the SENTIMENT_LADEN_IDIOMS found in
        `senti_text_lower`, or `valence` unchanged when none is found.

        Not called by any other method of this class; it also prints every
        match to stdout.
        """
        # Future Work
        # check for sentiment laden idioms that don't contain a lexicon word
        idioms_valences = []
        for idiom in SENTIMENT_LADEN_IDIOMS:
            if idiom in senti_text_lower:
                print(idiom, senti_text_lower)
                valence = SENTIMENT_LADEN_IDIOMS[idiom]
                idioms_valences.append(valence)
        if len(idioms_valences) > 0:
            valence = sum(idioms_valences) / float(len(idioms_valences))
        return valence

    @staticmethod
    def _negation_check(valence, words_and_emoticons, start_i, i):
        """
        Adjust `valence` for a negation found `start_i + 1` tokens before
        position `i`: multiply by N_SCALAR for a plain negation, by 1.25 for
        "never so"/"never this", and leave it alone for "without doubt".
        """
        words_and_emoticons_lower = [str(w).lower() for w in words_and_emoticons]
        if start_i == 0:
            if negated([words_and_emoticons_lower[i - (start_i + 1)]]):  # 1 word preceding lexicon word (w/o stopwords)
                valence = valence * N_SCALAR
        if start_i == 1:
            if words_and_emoticons_lower[i - 2] == "never" and \
                    (words_and_emoticons_lower[i - 1] == "so" or
                     words_and_emoticons_lower[i - 1] == "this"):
                valence = valence * 1.25
            elif words_and_emoticons_lower[i - 2] == "without" and \
                    words_and_emoticons_lower[i - 1] == "doubt":
                valence = valence
            elif negated([words_and_emoticons_lower[i - (start_i + 1)]]):  # 2 words preceding the lexicon word position
                valence = valence * N_SCALAR
        if start_i == 2:
            # NOTE(review): `and` binds tighter than `or`, so this branch also
            # fires whenever the token right before is "so"/"this", even
            # without a "never" three tokens back. Left as is because changing
            # it would change the scores; confirm whether that is intended.
            if words_and_emoticons_lower[i - 3] == "never" and \
                    (words_and_emoticons_lower[i - 2] == "so" or words_and_emoticons_lower[i - 2] == "this") or \
                    (words_and_emoticons_lower[i - 1] == "so" or words_and_emoticons_lower[i - 1] == "this"):
                valence = valence * 1.25
            elif words_and_emoticons_lower[i - 3] == "without" and \
                    (words_and_emoticons_lower[i - 2] == "doubt" or words_and_emoticons_lower[i - 1] == "doubt"):
                valence = valence
            elif negated([words_and_emoticons_lower[i - (start_i + 1)]]):  # 3 words preceding the lexicon word position
                valence = valence * N_SCALAR
        return valence

    def _punctuation_emphasis(self, text):
        """Return the combined emphasis from exclamation points and question marks in `text`."""
        # add emphasis from exclamation points and question marks
        ep_amplifier = self._amplify_ep(text)
        qm_amplifier = self._amplify_qm(text)
        punct_emph_amplifier = ep_amplifier + qm_amplifier
        return punct_emph_amplifier

    @staticmethod
    def _amplify_ep(text):
        """Return 0.292 per exclamation point in `text`, counting at most 4 of them."""
        # check for added emphasis resulting from exclamation points (up to 4 of them)
        ep_count = text.count("!")
        if ep_count > 4:
            ep_count = 4
        # (empirically derived mean sentiment intensity rating increase for
        # exclamation points)
        ep_amplifier = ep_count * 0.292
        return ep_amplifier

    @staticmethod
    def _amplify_qm(text):
        """Return 0 for fewer than 2 question marks, 0.18 each for 2 or 3, and 0.96 for more."""
        # check for added emphasis resulting from question marks (2 or 3+)
        qm_count = text.count("?")
        qm_amplifier = 0
        if qm_count > 1:
            if qm_count <= 3:
                # (empirically derived mean sentiment intensity rating increase for
                # question marks)
                qm_amplifier = qm_count * 0.18
            else:
                qm_amplifier = 0.96
        return qm_amplifier

    @staticmethod
    def _sift_sentiment_scores(sentiments):
        """
        Split the valences into a positive sum, a negative sum and a count of
        neutral (zero) entries.

        :returns: tuple ``(pos_sum, neg_sum, neu_count)``
        """
        # want separate positive versus negative sentiment scores
        pos_sum = 0.0
        neg_sum = 0.0
        neu_count = 0
        for sentiment_score in sentiments:
            if sentiment_score > 0:
                pos_sum += (float(sentiment_score) + 1)  # compensates for neutral words that are counted as 1
            if sentiment_score < 0:
                neg_sum += (float(sentiment_score) - 1)  # when used with math.fabs(), compensates for neutrals
            if sentiment_score == 0:
                neu_count += 1
        return pos_sum, neg_sum, neu_count

    def score_valence(self, sentiments, text):
        """
        Turn the per-token valences into the final scores.

        :param list sentiments: one valence per token
        :param str text: the scored text, used for punctuation emphasis
        :returns: dict with "neg", "neu" and "pos" (proportions rounded to 3
            places) and "compound" (normalized sum rounded to 4 places); all
            are 0.0 when `sentiments` is empty
        """
        if sentiments:
            sum_s = float(sum(sentiments))
            # compute and add emphasis from punctuation in text
            punct_emph_amplifier = self._punctuation_emphasis(text)
            if sum_s > 0:
                sum_s += punct_emph_amplifier
            elif sum_s < 0:
                sum_s -= punct_emph_amplifier

            compound = normalize(sum_s)
            # discriminate between positive, negative and neutral sentiment scores
            pos_sum, neg_sum, neu_count = self._sift_sentiment_scores(sentiments)

            if pos_sum > math.fabs(neg_sum):
                pos_sum += punct_emph_amplifier
            elif pos_sum < math.fabs(neg_sum):
                neg_sum -= punct_emph_amplifier

            total = pos_sum + math.fabs(neg_sum) + neu_count
            pos = math.fabs(pos_sum / total)
            neg = math.fabs(neg_sum / total)
            neu = math.fabs(neu_count / total)

        else:
            compound = 0.0
            pos = 0.0
            neg = 0.0
            neu = 0.0

        sentiment_dict = \
            {"neg": round(neg, 3),
             "neu": round(neu, 3),
             "pos": round(pos, 3),
             "compound": round(compound, 4)}

        return sentiment_dict



In [22]:
## dictionary for sentiment 

# File names of the VADER word lexicon and emoji lexicon; they carry no
# directory, so they are resolved against the current working directory.
lexicon_file="vader_lexicon.txt"
emoji_lexicon="emoji_utf8_lexicon.txt"


def make_lex_dict(lexicon_full_filepath):
    """
    Parse the text of a lexicon file into a dictionary.

    :param str lexicon_full_filepath: the file's *contents* (despite the name),
        one tab-separated ``word<TAB>measure[...]`` entry per line
    :returns: dict mapping each word to its measure as a float
    """
    entries = {}
    for raw_line in lexicon_full_filepath.rstrip('\n').split('\n'):
        if raw_line:
            # only the first two tab-separated fields are used
            token, score = raw_line.strip().split('\t')[:2]
            entries[token] = float(score)
    return entries

def make_emoji_dict(emoji_full_filepath):
    """
    Convert the text of an emoji lexicon file to a dictionary.

    :param str emoji_full_filepath: the file's *contents* (despite the name),
        one tab-separated ``emoji<TAB>description[...]`` entry per line
    :returns: dict mapping each emoji to its textual description
    """
    emoji_dict = {}
    for line in emoji_full_filepath.rstrip('\n').split('\n'):
        line = line.strip()
        # Skip blank lines (and empty input) like make_lex_dict does; without
        # this the two-field unpacking below raises ValueError on them.
        if not line:
            continue
        (emoji, description) = line.split('\t')[0:2]
        emoji_dict[emoji] = description
    return emoji_dict

# Read the whole word lexicon as UTF-8 text. Despite its name,
# `lexicon_full_filepath` holds the file's contents, not a path.
with codecs.open(lexicon_file, encoding='utf-8') as f:
    lexicon_full_filepath = f.read()
    
# word -> valence, used by sentiment_analysis
lexicon_dict = make_lex_dict(lexicon_full_filepath)

# Same for the emoji lexicon: `emoji_full_filepath` is the file's contents.
with codecs.open(emoji_lexicon, encoding='utf-8') as f:
    emoji_full_filepath = f.read()
# emoji -> textual description, used by sentiment_analysis
emojis_dict = make_emoji_dict(emoji_full_filepath)


In [23]:
from pyspark.sql.types import FloatType
# from textblob import TextBlob
# from vaderSentiment_1 import SentimentIntensityAnalyzer


def sentiment_analysis(text):
    """
    Return the VADER 'compound' sentiment score of `text`.

    :param text: the text to score, or None
    :returns: the compound score, or None when `text` is None
    """
    if text is None:
        # A null column value reaches a Python UDF as None; iterating over it
        # in polarity_scores would raise TypeError and fail the whole job, so
        # pass the null through as a null score instead.
        return None
    analyzer = SentimentIntensityAnalyzer_1(lexicon_dict,emojis_dict)
    return analyzer.polarity_scores(text)['compound']  # only the 'compound' score is kept

# Wrap the scorer as a Spark UDF returning FloatType so it can be applied to a DataFrame column.
sentiment_analysis_udf = udf(sentiment_analysis , FloatType())



In [24]:
# Keep only the grouping columns and the text to be scored.
temp = dfbins_en.select('weeks','song_name','artist_name','text')

In [25]:
# Add the per-row compound sentiment score, then preview the highest-scoring rows.
temp = temp.withColumn('senti_score',sentiment_analysis_udf('text'))
temp.select('weeks','song_name','artist_name','senti_score').sort('senti_score',ascending=False).show()

+-----+---------+------------+-----------+
|weeks|song_name| artist_name|senti_score|
+-----+---------+------------+-----------+
|  0.0|     null|         BTS|        1.0|
|  2.0|     null|         BTS|        1.0|
|  2.0|     null|         BTS|        1.0|
|  2.0|     null|         BTS|        1.0|
|  2.0|     null|         BTS|        1.0|
|  2.0|     null|         BTS|        1.0|
|  2.0|     null|         BTS|        1.0|
|  7.0|     null|         BTS|        1.0|
| 12.0|     null|         BTS|        1.0|
|  5.0|     null|Harry Styles|        1.0|
| 11.0|     null|      CORPSE|        1.0|
|  2.0|     null|         BTS|        1.0|
|  2.0|     null|         BTS|        1.0|
|  8.0|     null|Harry Styles|        1.0|
|  2.0|     null|         BTS|        1.0|
|  9.0|     null|         BTS|        1.0|
|  9.0|     null|Harry Styles|        1.0|
|  5.0|     null|         BTS|        1.0|
|  2.0|     null|         BTS|        1.0|
|  2.0|     null|         BTS|        1.0|
+-----+----

In [26]:
from pyspark.sql import functions as F

# df_emoji.groupBy(['song_name','weeks']).agg(F.mean('emoji_score'), F.count('emoji_score')
#                                            ).sort('avg(emoji_score)',ascending=False).show()

# Mean sentiment score per (weeks, song_name, artist_name), highest first.
# The aggregate column is referred to by its generated name 'avg(senti_score)'.
spotify_senti = temp.groupby('weeks','song_name','artist_name').agg(F.mean('senti_score')
                                                   ).sort('avg(senti_score)',ascending=False)
spotify_senti.show()

+-----+--------------------+--------------+------------------+
|weeks|           song_name|   artist_name|  avg(senti_score)|
+-----+--------------------+--------------+------------------+
| 11.0|              Wonder|      Lil Baby|0.9975000023841858|
|  0.0|                 Run|Brett Eldredge|0.9937000274658203|
|  0.0|Beer Never Broke ...|    Luke Combs| 0.991599977016449|
|  6.0|              Wonder|          Vedo|0.9889000058174133|
|  2.0|               Roses|  Taylor Swift|0.9878000020980835|
|  2.0|               Hello|  Taylor Swift|0.9871000051498413|
|  8.0|               Intro|    Luke Combs|0.9850999712944031|
|  8.0|             HOLIDAY|         Drake|0.9848999977111816|
|  6.0|              lovely|           SZA|0.9847000241279602|
|  7.0|              Slidin|         Topic|0.9814000129699707|
|  9.0|   How You Like That|        Future|0.9812999963760376|
|  0.0|               Hello|     Lady Gaga|0.9796000123023987|
|  0.0|                Body| Ariana Grande|0.9787999987

In [24]:
import pandas as pd

# Convert the aggregated scores to a pandas DataFrame and save them as CSV.
# NOTE(review): the output path is a hard-coded absolute home-directory path;
# consider a configurable output directory.
S = spotify_senti.toPandas()
S.to_csv('/home/wusean/spotify_senti.csv')