In [1]:
!pip install tabula-py

Collecting tabula-py
  Downloading tabula_py-2.10.0-py3-none-any.whl.metadata (7.6 kB)
Downloading tabula_py-2.10.0-py3-none-any.whl (12.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.0/12.0 MB[0m [31m70.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: tabula-py
Successfully installed tabula-py-2.10.0


In [2]:
from pyspark.sql import SparkSession
import tabula
from pyspark.sql.functions import collect_list, avg, col, max, min, length

In [3]:
spark = SparkSession.builder.master('local[*]').appName("Lab 08").getOrCreate()

In [4]:
tabula.convert_into("Movies.csv.pdf", "Movies.csv", output_format="csv", pages="all")



In [5]:
df = spark.read.csv("Movies.csv", inferSchema=True, header=True)

df.show()

+----+------+----------------+--------------------+--------------------+-----------------+--------------+-----------+------+--------------+
|Year|Length|           Title|               Genre|               Actor|          Actress|      Director| Popularity|Awards|         Image|
+----+------+----------------+--------------------+--------------------+-----------------+--------------+-----------+------+--------------+
|1990|   111|  Tie Me Up! Tie|       Ceo mDeodwyn!|        BanderasAnto|  iAobrilVictoria|  AlmodóvarPed|        o68|    No| NicholasCage.|
|1991|   113|      High Heels|              Comedy|          BoséMiguel|    AbrilVictoria|  AlmodóvarPed|        o68|    No| NicholasCage.|
|1983|   104|    Dead ZoneThe|              Horror|       WalkenChristo|   hAedramsBrooke|  CronenbergDa|       id79|    No| NicholasCage.|
|1979|   122|            Cuba|              Action|         ConnerySean|      AdamsBrooke| LesterRichard|          6|    No| seanConnery.p|
|1978|    94|   Days

In [6]:
df.printSchema()

root
 |-- Year: string (nullable = true)
 |-- Length: string (nullable = true)
 |-- Title: string (nullable = true)
 |-- Genre: string (nullable = true)
 |-- Actor: string (nullable = true)
 |-- Actress: string (nullable = true)
 |-- Director: string (nullable = true)
 |-- Popularity: string (nullable = true)
 |-- Awards: string (nullable = true)
 |-- Image: string (nullable = true)



# **Q: 01**

In [7]:
rdd = df.select('Title', 'Year', 'Director').filter((df['Genre'] == 'Action') & (df['Awards'] == 'Yes'))

rdd.show()

+-----+----+--------+
|Title|Year|Director|
+-----+----+--------+
+-----+----+--------+



# **Q: 02**

In [8]:
rdd1 = df.select('Actor', 'Title', 'Director').filter(df['Awards'] == 'Yes').groupBy('Actor').agg(collect_list('Title').alias('Movies'), collect_list('Director').alias('Directors'))

rdd1.show(truncate=False)

+--------------+---------------------------------------+----------------------------------------------+
|Actor         |Movies                                 |Directors                                     |
+--------------+---------------------------------------+----------------------------------------------+
|LintDerek De  |[AssaultThe]                           |[RademakersFo]                                |
|LancasterBurt |[Airport, Come BackLittl]              |[eSeatonGeorge, MannDaniel]                   |
|TikhonovVyach |[War & Peace]                          |[BiolandarchukSe]                             |
|BridgesBeau   |[Norma Rae]                            |[RittMartin]                                  |
|LoneJohn      |[Last EmperorT]                        |[BertolucciBern]                              |
|IronsJeremy   |[Reversal of For]                      |[SchroederBarb]                               |
|RichardsonRal |[HeiressThe]                           |[vWiayle

# **Q: 03**

In [9]:
rdd2 = df.select('title').filter(df['Awards'] == 'No').orderBy(df['Popularity'].desc()).limit(10)

rdd2.show()

+--------------+
|         title|
+--------------+
|     Going Ape|
|LadykillersThe|
|Sweet Smell of|
|Angel & the Ba|
|    Bandolero!|
|        Chisum|
|       Ffolkes|
| On Wings of E|
|  Hellfighters|
|  Wild GeeseTh|
+--------------+



# **Q: 04**

In [10]:
rdd3 = df.select('Title').filter(df['Year'] < '1980').orderBy(df['Popularity'].asc()).limit(10)

rdd3.show()

+--------------+
|         Title|
+--------------+
|          NULL|
|          NULL|
|          NULL|
|          NULL|
|          NULL|
|          NULL|
|          NULL|
|White Lightnin|
|          NULL|
|          NULL|
+--------------+



# **Q: 05**

In [11]:
rdd4 = df.select('Genre', 'Length').groupBy('Genre').agg(avg('Length').alias('Average Length'))

rdd4.show()

+--------------------+--------------+
|               Genre|Average Length|
+--------------------+--------------+
|              lDrama|          95.0|
|          Muiytstery|          95.0|
|        eWrneesrtern|          60.0|
|      AScotriocnerer|          90.0|
|             Dutrama|          NULL|
|              Dgrama|         105.0|
| WDaelstotenr nGangT|         180.0|
|D arta tmhea Penalty|         101.0|
|   C &o mToemdyorrow|         118.0|
|      MgeysrsteTrhye|         102.0|
|Cn oinm tehdey Avoca|          90.0|
|           tCi omedy|         112.0|
|              fDrama|          88.0|
|            Meystery|         119.0|
| C Boamcekd iny Laug|          55.0|
|Wstesrnte GrnreatsRi|          NULL|
|              iWc ar|         111.0|
|           DKrisasma|         104.0|
|        Darrarmiaage|          NULL|
|           aAgcetion|         127.0|
+--------------------+--------------+
only showing top 20 rows



# **Q: 06**

In [12]:
rdd5 = df.select('Actor', 'Actress').filter(df['Genre'] == 'Comedy').groupBy('Actor', 'Actress').count()
rdd6 = rdd5.filter(col('count') > 3)

rdd6.show()

+----------+-----------+-----+
|     Actor|    Actress|count|
+----------+-----------+-----+
|AllenWoody|KeatonDiane|    5|
+----------+-----------+-----+



# **Q: 07**

In [13]:
comedy = df.select('Actor').filter(df['Genre'] == 'Comedy').distinct()
drama = df.select('Actor').filter(df['Genre'] == 'Drama').distinct()
rdd7 = comedy.intersect(drama)

rdd7.show()

+--------------+
|         Actor|
+--------------+
|   WillisBruce|
| EastwoodClint|
|   ConnerySean|
|  TracySpencer|
|    NelsonJudd|
|   MooreDudley|
|    FinchPeter|
|  BrandoMarlon|
| HopkinsHarold|
|    NewmanPaul|
|  BeattyWarren|
|LambertChristo|
|  BoyerCharles|
|  SheenCharlie|
|   CageNicolas|
| HowellC. Thom|
| RedfordRobert|
|     CruiseTom|
|DreyfussRichar|
|    FondaHenry|
+--------------+
only showing top 20 rows



# **Q: 08**

In [14]:
comedy = df.select('Actor').filter(df['Genre'] == 'Comedy').distinct()
drama = df.select('Actor').filter(df['Genre'] == 'Drama').distinct()
rdd8 = comedy.union(drama)

rdd8.show()

+-------------+
|        Actor|
+-------------+
|   BoséMiguel|
|KeatonMichael|
|  WillisBruce|
|  JaglomHenry|
|DallesandroJo|
|  DoranJohnny|
|     LeeKarla|
|  RobertsEric|
|     WaitsTom|
|     CryerJon|
|  MoranisRick|
|EastwoodClint|
|  ConnerySean|
|  BegleyEdJr.|
| CassidyDavid|
|   PiscopoJoe|
| FeldmanMarty|
| TracySpencer|
|BanderasAnton|
|   NelsonJudd|
+-------------+
only showing top 20 rows



# **Q: 09**

In [15]:
comedy = df.select('Actor').filter(df['Genre'] == 'Comedy').distinct()
actors = df.select('Actor').distinct()
rdd9 = actors.subtract(comedy)

rdd9.show()

+---------------+
|          Actor|
+---------------+
|SCtluefefseJohn|
|   CottenJoseph|
|       BrownTom|
|     DillonMatt|
|   LintDerek De|
|  LancasterBurt|
|    RomeroCesar|
|  StockwellDean|
|   JourdanLouis|
|    ManesseGasp|
|  TikhonovVyach|
|   ShimuraTakas|
|    UrichRobert|
|       DavisGuy|
|    BridgesBeau|
|    KattWilliam|
|    BakulaScott|
|      PriceMarc|
|   NaughtonDavi|
|  eDlanielsJeff|
+---------------+
only showing top 20 rows



# **Q: 10**

In [16]:
rdd10 = df.select('Actor', 'Popularity').groupBy('Actor').agg(avg('Popularity').alias('Average Popularity'), max('Popularity').alias('Maximum Popularity'), min('Popularity').alias('Minimum Popularity'))

rdd10.show()

+-------------------+------------------+------------------+------------------+
|              Actor|Average Popularity|Maximum Popularity|Minimum Popularity|
+-------------------+------------------+------------------+------------------+
|               NULL|58.166666666666664|                77|                32|
|'W SimldaertGeer ne|              42.0|                42|                42|
|         AbelAlfred|              49.0|                49|                49|
|       AbrahamF. Mu|               6.0|                 6|                 6|
|      AdolphsonEdvi|              49.0|                72|                26|
|        AherneBrian|              57.0|                57|                57|
|      AhlstedtBörje|              81.0|                81|                81|
|        AielloDanny|              12.5|                 5|                20|
|          AkanTarik|              53.0|                53|                53|
|     AlbaicínRafael|              55.0|            

# **Q: 11**

In [17]:
rdd11 = df.select('Year').filter(df['Year'] >= '1960').groupBy('Year').count()

rdd11.show()

+---------------+-----+
|           Year|count|
+---------------+-----+
|           1987|   99|
| JackNicholson.|    1|
|           1972|   23|
|              g|   62|
|           1988|   94|
|           1977|   28|
| burtLancaster.|    1|
|elizabethTaylor|    1|
|           1971|   23|
|           1984|   38|
|           1982|   35|
|           1965|   11|
|           1962|   14|
|          s.png|   12|
|         rn.png|    8|
|           1981|   22|
| merylStreep.pn|    1|
|           1978|   17|
|           1974|   23|
|           1964|   13|
+---------------+-----+
only showing top 20 rows



# **Q: 12**

In [18]:
rdd12 = df.select('Year').groupBy('Year').count()

rdd12.show()

+---------------+-----+
|           Year|count|
+---------------+-----+
|           1953|   16|
|           1957|   16|
|           1987|   99|
|           1956|   15|
|           1936|    6|
|           1958|   11|
| JackNicholson.|    1|
|           1943|    8|
|           1972|   23|
|              g|   62|
|           1931|    9|
|           1988|   94|
|           1926|    4|
|           1938|    8|
|           1932|    8|
|           1977|   28|
| burtLancaster.|    1|
|elizabethTaylor|    1|
|           1971|   23|
|           1984|   38|
+---------------+-----+
only showing top 20 rows



# **Q: 13**

In [19]:
rdd13 = df.select('Year', 'Genre').filter(df['Length'] > 100).groupBy('Year', 'Genre').count()

rdd13.show()

+----+------------------+-----+
|Year|             Genre|count|
+----+------------------+-----+
|1984|            Comedy|    2|
|1958|             Drama|    2|
|1968|             Music|    1|
|1967|             Drama|    1|
|1944|      oWvear Tokyo|    1|
|1987|      iCetonmamedy|    1|
|1990|           Mystery|    2|
|1975|          Ahection|    1|
|1997|  oSncience Fictio|    1|
|1988|nD r&a mHias Dream|    1|
|1988|        Canodmaedy|    1|
|1988|          Donrasma|    1|
|1976|         vAilction|    1|
|1965|          TMhuesic|    1|
|1988|            Action|    2|
|1948|        rWs estern|    1|
|1975|            Action|    2|
|1956| D Araumguast Moon|    1|
|1992|             Drama|   10|
|1969|           Western|    3|
+----+------------------+-----+
only showing top 20 rows



# **Q: 14**

In [20]:
rdd14 = df.select('Title').filter(df['Year'] < 1990).orderBy('Title')

rdd14.show()

+-----------------+
|            Title|
+-----------------+
|    2001: A Space|
|          48 Hrs.|
|   A Big Hand for|
| A Child Is Waiti|
|    A Chorus Line|
|    A Clockwork O|
|   A Coeur Joie(H|
|  A Cry in the Da|
|   A Dry White Se|
|    A Fine Madnes|
|    A Fish Called|
|A Fistful of Doll|
|      A Guy Named|
|   A Lesson in Lo|
| A Little Night M|
|     A Man & a Wo|
|     A Man & a Wo|
|  A Man for All S|
|  A Matter of Tim|
|   A Month in the|
+-----------------+
only showing top 20 rows



# **Q: 15**

In [21]:
rdd15 = df.select('Title').filter(length(df['Title']) > 50)

rdd15.show()

+-----+
|Title|
+-----+
+-----+



In [22]:
spark.stop()