# Popularity comparison vs YouTube's

## Loading data

In [0]:
import pyspark.sql.functions as F

In [0]:
# Load the play log from S3 and derive calendar columns from its timestamp.
import datetime  # not used in this cell; kept in case later cells rely on it
import os

# unix_timestamp and dayofweek are not used in this cell either; they are kept
# because other cells may depend on these names being in the notebook namespace.
from pyspark.sql.functions import (
    dayofmonth,
    dayofweek,
    dayofyear,
    from_unixtime,
    month,
    unix_timestamp,
    weekofyear,
    year,
)

# Student-account S3 credentials. Prefer the environment so real keys never
# end up in the notebook; the placeholders are only used when nothing is set.
ACCESS_KEY_ID = os.environ.get("AWS_ACCESS_KEY_ID", "KEY_ID")
SECRET_ACCESS_KEY = os.environ.get("AWS_SECRET_ACCESS_KEY", "KEY")

# Point Hadoop's s3a filesystem at those credentials.
hadoop_conf = spark._jsc.hadoopConfiguration()
hadoop_conf.set("fs.s3a.access.key", ACCESS_KEY_ID)
hadoop_conf.set("fs.s3a.secret.key", SECRET_ACCESS_KEY)
hadoop_conf.set("fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")

# Build the frame in one chain so the cell is idempotent: read the CSV, replace
# the raw `timestamp` column with a formatted `datetime` string, sort by it,
# then add the calendar breakdown columns.
playlog = (
    spark.read.format("csv")
    .option("header", "true")
    .option("inferSchema", "true")
    .load("s3")
    .withColumn('datetime', from_unixtime('timestamp'))
    .drop('timestamp')
    .orderBy('datetime')
    .withColumn('year', year('datetime'))
    .withColumn('month', month('datetime'))
    .withColumn('dayofmonth', dayofmonth('datetime'))
    .withColumn('dayofyear', dayofyear('datetime'))
    .withColumn('weekofyear', weekofyear('datetime'))
)

playlog.printSchema()
# Only the last expression of a cell is rendered, so the (rows, columns) shape
# has to be printed explicitly; as a bare expression the count ran for nothing.
print((playlog.count(), len(playlog.columns)))
playlog.limit(5).toPandas()

root
 |-- user: integer (nullable = true)
 |-- song: string (nullable = true)
 |-- datetime: string (nullable = true)
 |-- year: integer (nullable = true)
 |-- month: integer (nullable = true)
 |-- dayofmonth: integer (nullable = true)
 |-- dayofyear: integer (nullable = true)
 |-- weekofyear: integer (nullable = true)



Unnamed: 0,user,song,datetime,year,month,dayofmonth,dayofyear,weekofyear
0,4,nRa-eGzpT6o,1965-07-26 03:21:43,1965,7,26,207,30
1,0,t1l8Z6gLPzo,2014-02-14 14:18:53,2014,2,14,45,7
2,22,Q24VZL8wpOM,2014-02-14 14:18:57,2014,2,14,45,7
3,70,VJ6ofd0pB_c,2014-02-14 14:18:57,2014,2,14,45,7
4,1,t1l8Z6gLPzo,2014-02-14 14:18:58,2014,2,14,45,7


In [0]:
# Load the YouTube song metadata (stored as Parquet), print its schema,
# and end the cell with its (row count, column count).
songs = spark.read.format("parquet").load("s3")
songs.printSchema()
(songs.count(), len(songs.columns))

root
 |-- contentDetails_duration: string (nullable = true)
 |-- id: string (nullable = true)
 |-- snippet_channelId: string (nullable = true)
 |-- snippet_channelTitle: string (nullable = true)
 |-- snippet_publishedAt: string (nullable = true)
 |-- snippet_title: string (nullable = true)
 |-- statistics_commentCount: long (nullable = true)
 |-- statistics_dislikeCount: long (nullable = true)
 |-- statistics_viewCount: long (nullable = true)

Out[2]: (3907, 9)

In [0]:
# Preview five songs; limit(5) already caps the rows brought to the driver.
(
    songs
    .limit(5)
    .toPandas()
)

Unnamed: 0,contentDetails_duration,id,snippet_channelId,snippet_channelTitle,snippet_publishedAt,snippet_title,statistics_commentCount,statistics_dislikeCount,statistics_viewCount
0,PT3M33S,t1l8Z6gLPzo,UCUERSOitwgUq_37kGslN96w,VOLO,2013-07-22T12:09:11Z,"VOLO. ""L'air d'un con""",38,26,223172
1,PT7M46S,we5gzZq5Avg,UCson549gpvRhPnJ3Whs5onA,LongWayToDream,2012-03-17T08:34:30Z,Julian Jeweil - Air Conditionné,2,3,13409
2,PT3M7S,49esza4eiK4,UCcHYZ8Ez4gG_2bHEuBL8IfQ,Downtown Records,2007-09-08T02:02:07Z,Justice - D.A.N.C.E,3168,780,10106655
3,PT3M43S,BoO6LfR7ca0,UCQ0wLCF7u23gZKJkHFs1Tpg,Music Is Our Drug,2014-01-24T12:52:38Z,Gramatik - Torture (feat. Eric Krasno),6,0,29153
4,PT5M,DaH4W1rY9us,UCJsTMPZxYD-Q3kEmL4Qijpg,Harvey Pearson,2012-12-02T12:41:13Z,Ben Howard - Oats In The Water,5303,1784,16488714


## Computing a variable: `statistics_playCount`
We already have a few statistics from the data we collected from YouTube, e.g. `statistics_viewCount`, `statistics_commentCount` and `statistics_dislikeCount`.  
We will compute a new statistic, `statistics_playCount`, based on our own user history: the number of times each song has been played on our service.

We will save this as a new DataFrame `playcounts` that we will later merge with others.

1. Compute `playcounts`, the play count for each song in the playlog, and alias the new column to `statistics_playCount`

In [0]:
# Preview the first five play-log rows. Only five are displayed, so only five
# are fetched (previously ten were collected and half of them dropped).
playlog.limit(5).toPandas()

Unnamed: 0,user,song,datetime,year,month,dayofmonth,dayofyear,weekofyear
0,4,nRa-eGzpT6o,1965-07-26 03:21:43,1965,7,26,207,30
1,0,t1l8Z6gLPzo,2014-02-14 14:18:53,2014,2,14,45,7
2,70,VJ6ofd0pB_c,2014-02-14 14:18:57,2014,2,14,45,7
3,22,Q24VZL8wpOM,2014-02-14 14:18:57,2014,2,14,45,7
4,1,t1l8Z6gLPzo,2014-02-14 14:18:58,2014,2,14,45,7


In [0]:
# Number of plays per song on our service, most-played first.
# The column is named `statistics_playCount` (camelCase) to match the other
# `statistics_*` columns and the name the later cells select; the previous
# lowercase spelling only resolved because Spark's column lookup is
# case-insensitive by default.
playcounts = (
    playlog
    .groupBy('song')
    .agg(F.count('*').alias('statistics_playCount'))
    .orderBy(F.desc('statistics_playCount'))
)

In [0]:
# Preview the first five rows of the play-count table.
(
    playcounts
    .limit(5)
    .toPandas()
)

Unnamed: 0,song,statistics_playcount
0,SYM-RJwSGQ8,25874
1,UfR3nAz8z3Q,19537
2,MYSVMgRr6pw,18178
3,6ktYpaGVUe0,16032
4,bpOSxM0rNPM,14521


Now we will join this with our `songs` DataFrame. Call the resulting DataFrame `songs_with_playcount`.

2. Join `songs` with `playcounts` to build `songs_with_playcount`. Then:
- print out the schema of the new DataFrame
- print out the first 5 rows

In [0]:
# Attach each song's play count to its YouTube metadata. This is an inner join
# on the video id, so only songs present in both tables are kept; the
# duplicate key column `song` is dropped afterwards.
songs_with_playcount = (
    songs
    .join(playcounts, on=(songs.id == playcounts.song), how="inner")
    .drop('song')
)
songs_with_playcount.limit(5).toPandas()

Unnamed: 0,contentDetails_duration,id,snippet_channelId,snippet_channelTitle,snippet_publishedAt,snippet_title,statistics_commentCount,statistics_dislikeCount,statistics_viewCount,statistics_playcount
0,PT3M59S,3vFKqs32lyA,UCPUVB4vRHi8Yg20IFI89PKQ,sunaker,2012-12-22T07:39:22Z,Kaveret - Yo Ya,27,7,39035,17
1,PT3M48S,jFS8mVhP0xg,UCN4BoFTD9YncU-NpJr8YyBw,HHVIBE,2013-10-09T16:31:44Z,Restless Leg Syndrome - Sharitt Casette,14,7,53405,124
2,PT3M12S,hicCHaC_z5I,UCCbpTuRINyfjtwFkjHuII1w,mau5trap,2012-10-11T18:37:14Z,Feed Me & Crystal Fighters - Love Is All I Got...,2422,1072,7768503,462
3,PT2M56S,yc7TpfKOgwA,UCmIpgKCKzgzOy_Ju_GDClNw,Walrus81,2012-03-20T13:02:08Z,The Smashing Pumpkins - Stumbleine,1,0,1454,2
4,PT4M27S,_iujjGCoF4g,UCnUbD-Jek5_8IfDbMJ3rn7Q,92JuggaLotus503,2009-01-02T23:22:40Z,Aesop Rock- Daylight,193,82,484936,86


In [0]:
# Print the schema of the joined frame to confirm the play-count column was added.
songs_with_playcount.printSchema()

root
 |-- contentDetails_duration: string (nullable = true)
 |-- id: string (nullable = true)
 |-- snippet_channelId: string (nullable = true)
 |-- snippet_channelTitle: string (nullable = true)
 |-- snippet_publishedAt: string (nullable = true)
 |-- snippet_title: string (nullable = true)
 |-- statistics_commentCount: long (nullable = true)
 |-- statistics_dislikeCount: long (nullable = true)
 |-- statistics_viewCount: long (nullable = true)
 |-- statistics_playcount: long (nullable = false)



## Performing analysis
We will plot a few graphs.

3. Plot a histogram of the log of `statistics_playCount`

In [0]:
# Natural log of the play count, passed to display() for the histogram.
display(
    songs_with_playcount.select(
        F.log(F.col('statistics_playCount'))
    )
)

ln(statistics_playCount)
2.833213344056216
4.820281565605037
6.135564891081739
0.6931471805599453
4.454347296253507
3.1354942159291497
2.19722457733622
1.6094379124341005
3.9318256327243257
3.784189633918261


4. Plot the histogram of the play count divided by the view count

In [0]:
# Ratio of plays on our service to views on YouTube, one value per song.
display(
    songs_with_playcount.select(
        F.col('statistics_playCount') / F.col('statistics_viewCount')
    )
)

(statistics_playCount / statistics_viewCount)
0.0004355065966440374
0.0023218799737852
5.947091736979441e-05
0.0013755158184319
0.00017734298959037894
0.0005175983436853002
0.0012751487673561
7.698026734015163e-07
0.00012829543167639364
3.480679460273186e-05


The distribution is long-tailed. We will take the log of this ratio and plot it.

5. Plot the log of what you just plotted

In [0]:
# Natural log of the play/view ratio, which spreads out the long-tailed values.
display(
    songs_with_playcount.select(
        F.log(
            F.col('statistics_playCount') / F.col('statistics_viewCount')
        )
    )
)

ln((statistics_playCount / statistics_viewCount))
-7.739000614498175
-6.065378087918759
-9.730023148607316
-6.588926477533519
-8.637424906195902
-7.566311014772463
-6.664692426898996
-14.077131623250736
-8.961174893551876
-10.265697943037162


6. Plot a scatter plot of `statistics_playCount` vs `statistics_viewCount`. Tick "Show LOESS"

In [0]:
# The two raw counts side by side, as input for the scatter plot.
display(
    songs_with_playcount.select(
        'statistics_playCount',
        'statistics_viewCount',
    )
)

statistics_playCount,statistics_viewCount
17,39035
124,53405
462,7768503
2,1454
86,484936
23,44436
9,7058
5,6495171
51,397520
44,1264121


7. Plot a scatter plot of log `statistics_playCount` vs log `statistics_viewCount`. Tick "Show LOESS"

ln(statistics_viewCount),ln(statistics_playCount)
10.572213958554393,2.833213344056216
10.885659653523795,4.820281565605037
15.865588039689056,6.135564891081739
7.282073658093465,0.6931471805599453
13.09177220244941,4.454347296253507
10.701805230701613,3.1354942159291497
8.861917004235215,2.19722457733622
15.686569535684836,1.6094379124341005
12.893000526276202,3.9318256327243257
14.049887576955422,3.784189633918261


8. Plot multiple scatter plots (grid plot?) of all `statistics_` columns

In [0]:
# Names of every `statistics_*` column in the joined frame.
# This is a list rather than a generator: a generator bound to a notebook
# global is exhausted after one pass, so any later re-use would silently
# select no columns.
statistics_cols = [
    c for c in songs_with_playcount.columns if c.startswith('statistics_')
]

# Natural log of each statistic, for the grid of scatter plots.
# NOTE(review): counts equal to 0 appear as empty cells in the rendered output
# (presumably null from ln(0)) - confirm this is acceptable for the plots.
display(songs_with_playcount.select(*[F.log(c) for c in statistics_cols]))

ln(statistics_commentCount),ln(statistics_dislikeCount),ln(statistics_viewCount),ln(statistics_playcount)
3.295836866004329,1.9459101490553128,10.572213958554393,2.833213344056216
2.6390573296152584,1.9459101490553128,10.885659653523795,4.820281565605037
7.792348924113037,6.977281341630747,15.865588039689056,6.135564891081739
0.0,,7.282073658093465,0.6931471805599453
5.262690188904886,4.406719247264253,13.09177220244941,4.454347296253507
2.5649493574615367,3.044522437723423,10.701805230701613,3.1354942159291497
3.828641396489095,0.0,8.861917004235215,2.19722457733622
8.494743062578646,7.221835825288449,15.686569535684836,1.6094379124341005
4.762173934797756,4.330733340286331,12.893000526276202,3.9318256327243257
6.270988431858299,5.030437921392435,14.049887576955422,3.784189633918261
