In [1]:
!pip install pyspark



In [2]:
import pyspark
from pyspark.sql import SparkSession
from pyspark import SparkContext

In [3]:
# Obtain the notebook-wide Spark session (an already-active one is reused).
spark = (
    SparkSession.builder
    .getOrCreate()
)

In [4]:
# First look at the file with the reader's defaults: no header handling and
# no schema inference, so columns come back as _c0.._c7, all typed as string.
df = spark.read.csv(path='videos-stats.csv')

In [6]:
# Preview the first 8 rows; with the headerless read the CSV header line
# shows up as an ordinary data row.
df.show(n=8)

+----+--------------------+-----------+------------+-------+--------+--------+---------+
| _c0|                 _c1|        _c2|         _c3|    _c4|     _c5|     _c6|      _c7|
+----+--------------------+-----------+------------+-------+--------+--------+---------+
|NULL|               Title|   Video ID|Published At|Keyword|   Likes|Comments|    Views|
|   0|Apple Pay Is Kill...|wAZZ-UWGVHI|  2022-08-23|   tech|  3407.0|   672.0| 135612.0|
|   1|The most EXPENSIV...|b3x28s61q3c|  2022-08-24|   tech| 76779.0|  4306.0|1758063.0|
|   2|My New House Gami...|4mgePWWCAmA|  2022-08-23|   tech| 63825.0|  3338.0|1564007.0|
|   3|Petrol Vs Liquid ...|kXiYSI7H2b0|  2022-08-23|   tech| 71566.0|  1426.0| 922918.0|
|   4|Best Back to Scho...|ErMwWXQxHp0|  2022-08-08|   tech| 96513.0|  5155.0|1855644.0|
|   5|Brewmaster Answer...|18fwz9Itbvo|  2021-11-05|   tech| 33570.0|  1643.0| 943119.0|
|   6|Tech Monopolies: ...|jXf04bhcjbg|  2022-06-13|   tech|135047.0|  9367.0|5937790.0|
+----+--------------------+-----------+------------+-------+--------+--------+---------+

In [7]:
# Inspect the column types produced by the headerless read.
df.printSchema()

root
 |-- _c0: string (nullable = true)
 |-- _c1: string (nullable = true)
 |-- _c2: string (nullable = true)
 |-- _c3: string (nullable = true)
 |-- _c4: string (nullable = true)
 |-- _c5: string (nullable = true)
 |-- _c6: string (nullable = true)
 |-- _c7: string (nullable = true)



In [8]:
# Reload the CSV, taking column names from the first line and letting Spark
# infer each column's type from the data.
df = spark.read.csv(
    'videos-stats.csv',
    header=True,
    inferSchema=True,
)

In [9]:
# Confirm the header names and the inferred column types.
df.printSchema()

root
 |-- _c0: integer (nullable = true)
 |-- Title: string (nullable = true)
 |-- Video ID: string (nullable = true)
 |-- Published At: date (nullable = true)
 |-- Keyword: string (nullable = true)
 |-- Likes: double (nullable = true)
 |-- Comments: double (nullable = true)
 |-- Views: double (nullable = true)



In [11]:
# Persist the typed videos frame as Parquet.
# - The format is named explicitly instead of relying on the session's
#   default data source.
# - 'header' is a CSV option and has no meaning for Parquet, so it is dropped.
# - mode('overwrite') keeps the cell re-runnable: the default save mode raises
#   an error when the output path already exists.
df.write.mode('overwrite').parquet('output/videos_parquet')

In [14]:
# Read the Parquet copy back. Parquet files carry their own schema, so the
# CSV-only 'header' option is not needed here.
df = spark.read.parquet('output/videos_parquet')
df.show()

+---+--------------------+-----------+------------+-------+--------+--------+-----------+
|_c0|               Title|   Video ID|Published At|Keyword|   Likes|Comments|      Views|
+---+--------------------+-----------+------------+-------+--------+--------+-----------+
|  0|Apple Pay Is Kill...|wAZZ-UWGVHI|  2022-08-23|   tech|  3407.0|   672.0|   135612.0|
|  1|The most EXPENSIV...|b3x28s61q3c|  2022-08-24|   tech| 76779.0|  4306.0|  1758063.0|
|  2|My New House Gami...|4mgePWWCAmA|  2022-08-23|   tech| 63825.0|  3338.0|  1564007.0|
|  3|Petrol Vs Liquid ...|kXiYSI7H2b0|  2022-08-23|   tech| 71566.0|  1426.0|   922918.0|
|  4|Best Back to Scho...|ErMwWXQxHp0|  2022-08-08|   tech| 96513.0|  5155.0|  1855644.0|
|  5|Brewmaster Answer...|18fwz9Itbvo|  2021-11-05|   tech| 33570.0|  1643.0|   943119.0|
|  6|Tech Monopolies: ...|jXf04bhcjbg|  2022-06-13|   tech|135047.0|  9367.0|  5937790.0|
|  7|I bought the STRA...|2TqOmtTAMRY|  2022-08-07|   tech|216935.0| 12605.0|  4782514.0|
|  8|15 Em

In [16]:
# List the tables and views currently known to the session catalog
# (nothing has been registered yet at this point).
spark.catalog.listTables()

[]

In [17]:
# Store the videos frame as the managed table 'tb_videos'.
# - 'header' is a CSV option and does not apply to a table write, so it is
#   dropped.
# - mode('overwrite') replaces an existing table; with the default save mode
#   this cell raises an error as soon as 'tb_videos' already exists.
df.write.mode('overwrite').saveAsTable('tb_videos')

In [18]:
# Register the frame as a session-scoped temporary view.
# NOTE(review): the view reuses the name of the managed table 'tb_videos'
# saved just before; unqualified SQL references are expected to resolve to the
# temp view first -- confirm this shadowing is intended, or give the view a
# distinct name.
df.createOrReplaceTempView('tb_videos')

In [19]:
# List the catalog again: both the managed table and the temporary view named
# 'tb_videos' are now reported.
spark.catalog.listTables()

[Table(name='tb_videos', catalog='spark_catalog', namespace=['default'], description=None, tableType='MANAGED', isTemporary=False),
 Table(name='tb_videos', catalog=None, namespace=[], description=None, tableType='TEMPORARY', isTemporary=True)]

In [20]:
# Select 'default' as the current database for unqualified table names.
# The managed table already lives in the 'default' namespace, so the table
# listing is the same before and after this call.
spark.catalog.setCurrentDatabase('default')

In [21]:
# Re-check the catalog after selecting the database (same two entries).
spark.catalog.listTables()

[Table(name='tb_videos', catalog='spark_catalog', namespace=['default'], description=None, tableType='MANAGED', isTemporary=False),
 Table(name='tb_videos', catalog=None, namespace=[], description=None, tableType='TEMPORARY', isTemporary=True)]

In [22]:
# Query 'tb_videos' through Spark SQL and display the result.
# NOTE(review): a temporary view and a managed table both carry this name;
# the unqualified reference presumably resolves to the temporary view --
# confirm which object is meant to be queried.
tab_df = spark.sql('select * from tb_videos')

tab_df.show()

+---+--------------------+-----------+------------+-------+--------+--------+-----------+
|_c0|               Title|   Video ID|Published At|Keyword|   Likes|Comments|      Views|
+---+--------------------+-----------+------------+-------+--------+--------+-----------+
|  0|Apple Pay Is Kill...|wAZZ-UWGVHI|  2022-08-23|   tech|  3407.0|   672.0|   135612.0|
|  1|The most EXPENSIV...|b3x28s61q3c|  2022-08-24|   tech| 76779.0|  4306.0|  1758063.0|
|  2|My New House Gami...|4mgePWWCAmA|  2022-08-23|   tech| 63825.0|  3338.0|  1564007.0|
|  3|Petrol Vs Liquid ...|kXiYSI7H2b0|  2022-08-23|   tech| 71566.0|  1426.0|   922918.0|
|  4|Best Back to Scho...|ErMwWXQxHp0|  2022-08-08|   tech| 96513.0|  5155.0|  1855644.0|
|  5|Brewmaster Answer...|18fwz9Itbvo|  2021-11-05|   tech| 33570.0|  1643.0|   943119.0|
|  6|Tech Monopolies: ...|jXf04bhcjbg|  2022-06-13|   tech|135047.0|  9367.0|  5937790.0|
|  7|I bought the STRA...|2TqOmtTAMRY|  2022-08-07|   tech|216935.0| 12605.0|  4782514.0|
|  8|15 Em

In [26]:
# NOTE(review): a session is already bound to `spark` and has not been
# stopped, so getOrCreate() is expected to return that same session rather
# than a second one. `spark2` is not referenced by any later cell shown here;
# consider removing this cell.
spark2 = SparkSession.builder.getOrCreate()

In [27]:
# Headerless read of the comments file using reader defaults.
# NOTE(review): `df2` is reassigned by the very next cell (header + inferred
# schema) before this result is used, so this read is redundant.
df2 = spark.read.csv('comments.csv')

In [28]:
# Load the comments file with column names taken from the header line and
# column types inferred from the data, then preview it.
df2 = spark.read.csv(
    'comments.csv',
    header=True,
    inferSchema=True,
)
df2.show()

+--------------+-----------+--------------------+------+---------+
|           _c0|   Video ID|             Comment| Likes|Sentiment|
+--------------+-----------+--------------------+------+---------+
|             0|wAZZ-UWGVHI|Let's not forget ...|  95.0|      1.0|
|             1|wAZZ-UWGVHI|Here in NZ 50% of...|  19.0|      0.0|
|             2|wAZZ-UWGVHI|I will forever ac...| 161.0|      2.0|
|             3|wAZZ-UWGVHI|Whenever I go to ...|   8.0|      0.0|
|             4|wAZZ-UWGVHI|Apple Pay is so c...|  34.0|      2.0|
|             5|wAZZ-UWGVHI|Weâ€™ve been houndi...|   8.0|      1.0|
|             6|wAZZ-UWGVHI|We only got Apple...|  29.0|      2.0|
|             7|wAZZ-UWGVHI|For now, I need b...|   7.0|      1.0|
|             8|wAZZ-UWGVHI|In the United Sta...|   2.0|      2.0|
|             9|wAZZ-UWGVHI|In Cambodia, we h...|  28.0|      1.0|
|            10|b3x28s61q3c|Wow, you really w...|1344.0|      2.0|
|            11|b3x28s61q3c|The lab is the mo...| 198.0|    

In [29]:
# Persist the typed comments frame as Parquet.
# - The format is named explicitly instead of relying on the session's
#   default data source.
# - 'header' is a CSV option and has no meaning for Parquet, so it is dropped.
# - mode('overwrite') keeps the cell re-runnable: the default save mode raises
#   an error when the output path already exists.
df2.write.mode('overwrite').parquet('output/comments_parquet')

In [35]:
# Shut down the Spark session now that all reads and writes are finished.
spark.stop()