# Exercises list of Ted talks data

1 -Finding the most popular TED talks

2 - Finding the most popular TED talks Speaker (in terms of number of talks)

3 - Month-wise Analysis of TED talk frequency

4 - Year-wise Analysis of TED talk frequency

5 - Finding TED talks of your favorite Author

6 - Finding TED talks with the best view to like ration 

7 - Finding TED talks based on tags(like climate)

8 -Finding the most popular TED talks Speaker (in terms of number of views)


In [292]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructField,StructType,IntegerType,DateType,StringType
from pyspark.sql.functions import desc, asc, split, col
from pyspark.sql import functions as F


In [199]:
spark = SparkSession.builder.appName('ted_talks_data').getOrCreate()


In [203]:
df = spark.read.csv('data.csv',header=True, schema=dataTalksSchema)

In [204]:
dataTalksSchema = StructType([
    StructField('title',StringType(), True),
    StructField('author',StringType(), True),
    StructField('date',StringType(), True),
    StructField('views',IntegerType(), True),
    StructField('likes',IntegerType(), True),
    StructField('link',StringType(), True)
])

df = spark.read.csv('data.csv',header=True, schema=dataTalksSchema)

df.printSchema()

root
 |-- title: string (nullable = true)
 |-- author: string (nullable = true)
 |-- date: string (nullable = true)
 |-- views: integer (nullable = true)
 |-- likes: integer (nullable = true)
 |-- link: string (nullable = true)



In [205]:
#1 - Finding author most popular TED talks
most_popular_ted_talk = df.orderBy(desc('views')).first()
most_popular_speaker

Row(author='Alex Gendler', count(views)=45)

In [330]:
# 2 - Finding the most popular TED talks Speaker (in terms of number of talks)
most_popular_speaker = df.groupBy('author').agg(
    {'views':'count'}
    ).withColumnRenamed("count(views)","talks").orderBy(desc('talks'))

most_popular_speaker.show()

+----------------+-----+
|          author|talks|
+----------------+-----+
|    Alex Gendler|   45|
|Iseult Gillespie|   33|
|     Matt Walker|   18|
|  Alex Rosenthal|   15|
|   Elizabeth Cox|   13|
|      Emma Bryce|   12|
|   Juan Enriquez|   11|
|   Daniel Finkel|   11|
|    Mona Chalabi|    9|
|    Hans Rosling|    9|
|       Greg Gage|    9|
|      Dan Finkel|    9|
|Wendy De La Rosa|    9|
|      Jen Gunter|    9|
|      Bill Gates|    8|
|    Dan Kwartler|    7|
|          TED-Ed|    7|
|   Marco Tempest|    7|
|           Rives|    6|
|     A.J. Jacobs|    6|
+----------------+-----+
only showing top 20 rows



In [260]:
# 3 - Month-wise Analysis of TED talk frequency
frequence_by_month = df.groupBy('date').agg({'date':'count'}).orderBy(desc('count(date)'))
frequence_by_month.show()

+-------------+-----------+
|         date|count(date)|
+-------------+-----------+
|   April 2018|        126|
|   April 2019|        124|
|   April 2017|        123|
|November 2018|        114|
|November 2017|        109|
| October 2020|         97|
|   March 2014|         96|
|February 2016|         89|
|   March 2015|         88|
|    June 2012|         84|
|February 2009|         84|
| October 2014|         83|
|    June 2013|         80|
|February 2013|         77|
|   March 2011|         76|
| October 2021|         72|
|February 2010|         70|
|    July 2011|         70|
|November 2015|         68|
|    June 2020|         67|
+-------------+-----------+
only showing top 20 rows



In [261]:
# 4 - Year-wise Analysis of TED talk frequency
custom_df = df.withColumn('year',split(df['date'],' ').getItem(1))
custom_df = custom_df.drop('date')

frequence_by_year = custom_df.groupBy('year').agg({'year':'count'}).orderBy(desc('count(year)'))
frequence_by_year.show()

+----+-----------+
|year|count(year)|
+----+-----------+
|2019|        543|
|2020|        501|
|2017|        495|
|2018|        471|
|2016|        399|
|2021|        390|
|2013|        388|
|2015|        377|
|2014|        357|
|2012|        302|
|2011|        271|
|2010|        267|
|2009|        233|
|2007|        113|
|2008|         83|
|2005|         65|
|2006|         49|
|2003|         34|
|2004|         33|
|2002|         26|
+----+-----------+
only showing top 20 rows



In [262]:
df.show(15)

+--------------------+--------------------+--------------+------+-----+--------------------+
|               title|              author|          date| views|likes|                link|
+--------------------+--------------------+--------------+------+-----+--------------------+
|Climate action ne...|Ozawa Bineshi Albert| December 2021|404000|12000|https://ted.com/t...|
|The dark history ...|       Sydney Iaukea| February 2022|214000| 6400|https://ted.com/t...|
|How play can spar...|       Martin Reeves|September 2021|412000|12000|https://ted.com/t...|
|Why is China appo...|   James K. Thornton|  October 2021|427000|12000|https://ted.com/t...|
|Cement's carbon p...|     Mahendra Singhi|  October 2021|  2400|   72|https://ted.com/t...|
|The tragedy of ai...|Rosamund Adoo-Kis...|  October 2021|422000|12000|https://ted.com/t...|
|The myth of Narci...|    Iseult Gillespie| February 2022|412000|12000|https://ted.com/t...|
|You deserve the r...|    Gay Gordon-Byrne|   August 2021|455000|13000

In [271]:
# 5 - Finding TED talks of your favorite Author (Iseult Gillespie)
df_favorite_author = df.filter(df.author == 'Iseult Gillespie')
df_favorite_author.show()

+--------------------+----------------+--------------+-------+------+--------------------+
|               title|          author|          date|  views| likes|                link|
+--------------------+----------------+--------------+-------+------+--------------------+
|The myth of Narci...|Iseult Gillespie| February 2022| 412000| 12000|https://ted.com/t...|
|The tragic myth o...|Iseult Gillespie|  January 2022| 998000| 29000|https://ted.com/t...|
|The myth of Loki'...|Iseult Gillespie|  January 2022| 761000| 22000|https://ted.com/t...|
|"The Norse myth t...|Iseult Gillespie|  October 2021|1500000| 45000|https://ted.com/t...|
|How one person sa...|Iseult Gillespie|     June 2021|1200000| 36000|https://ted.com/t...|
|The Japanese myth...|Iseult Gillespie|  January 2021| 739000| 22000|https://ted.com/t...|
|Savitri and Satya...|Iseult Gillespie|  January 2021|3900000|117000|https://ted.com/t...|
|The myth of Loki ...|Iseult Gillespie| December 2020|1900000| 59000|https://ted.com/t...|

In [289]:
# 6 - Finding TED talks with the best view to like ration
df_best_view = df.groupBy('title').agg({'views':'max'}).orderBy(desc('max(views)')).first()
df_best_view

Row(title='Do schools kill creativity?', max(views)=72000000)

In [299]:
# 7 - Finding TED talks based on tags(like climate)
df.filter("title LIKE '%climate%'").toPandas()

Unnamed: 0,title,author,date,views,likes,link
0,Why is China appointing judges to combat clima...,James K. Thornton,October 2021,427000,12000,https://ted.com/talks/james_k_thornton_why_is_...
1,The ocean's ingenious climate solutions,Susan Ruffo,October 2021,522000,15000,https://ted.com/talks/susan_ruffo_the_ocean_s_...
2,How we're reducing the climate impact of elect...,Tim Dunn,February 2022,96000,2900,https://ted.com/talks/tim_dunn_how_we_re_reduc...
3,How to boost nature-based solutions to climate...,Zac Goldsmith,October 2021,30000,913,https://ted.com/talks/zac_goldsmith_how_to_boo...
4,"The unexpected, underwater plant fighting clim...",Carlos M. Duarte,August 2021,786000,23000,https://ted.com/talks/carlos_m_duarte_the_unex...
...,...,...,...,...,...,...
81,The science behind a climate headline,Rachel Pike,July 2009,1000000,31000,https://ted.com/talks/rachel_pike_the_science_...
82,New thinking on the climate crisis,Al Gore,March 2008,2200000,68000,https://ted.com/talks/al_gore_new_thinking_on_...
83,A critical look at geoengineering against clim...,David Keith,September 2007,1400000,44000,https://ted.com/talks/david_keith_a_critical_l...
84,Global priorities bigger than climate change,Bjorn Lomborg,February 2005,1700000,53000,https://ted.com/talks/bjorn_lomborg_global_pri...


In [338]:
#8 -Finding the most popular TED talks Speaker (in terms of number of views)
most_popular_speaker = df.groupBy('author').agg({'views':'max'}).orderBy(desc('max(views)')).first()
most_popular_speaker

Row(author='Sir Ken Robinson', max(views)=72000000)