<a href="https://colab.research.google.com/github/nataliakartasheva/AMD_MBA/blob/master/AMD_MBA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Connect to Kaggle (the API token is stored on Google Drive) and download the datasets to the Colab runtime's local disk

In [None]:
! pip install kaggle
! pip install pyspark

In [None]:
import os
import shutil

from google.colab import drive, files

# Folder on Google Drive that holds the Kaggle API token across sessions.
kaggle_config_dir = '/content/drive/MyDrive/Kaggle'

# Mount Google Drive so the token can be stored outside the ephemeral runtime.
drive.mount('/content/drive', force_remount = True)

# Prompt for the `kaggle.json` API token; the move below expects it at
# /content/kaggle.json.
uploaded = files.upload()

# `exist_ok=True` keeps this cell re-runnable: a plain `mkdir` fails with
# "File exists" once the folder has been created by an earlier run.
os.makedirs(kaggle_config_dir, exist_ok=True)

# Move the token next to the other Drive files and tell the Kaggle CLI where
# to look for it.
shutil.move('/content/kaggle.json', os.path.join(kaggle_config_dir, 'kaggle.json'))
os.environ['KAGGLE_CONFIG_DIR'] = kaggle_config_dir

In [None]:
!kaggle datasets download -d ashirwadsangwan/imdb-dataset 
!unzip imdb-dataset.zip

Initialize the Spark session and load the datasets into Spark DataFrames

In [1]:
from pyspark import SparkContext, SparkConf, SQLContext
from pyspark.sql import SparkSession, DataFrame
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DateType, ArrayType, DoubleType
from pyspark.sql.functions import udf, col, count, collect_set, lit, when, concat_ws, substring, length, col, expr

import seaborn as sns
import matplotlib.pyplot as plt

import itertools
import time
from datetime import time, date, datetime

import pandas as pd
import numpy as np
from wordcloud import WordCloud

In [2]:
# Spark configuration for a single-machine run that uses every local core.
# NOTE(review): the memory settings below are large for a hosted notebook
# runtime — confirm they match the machine this is meant to run on.
conf = (
    SparkConf()
    .setAppName("MarketBasketAnalysis")
    .setMaster('local[*]')
    .set('spark.executor.memory', '16G')
    .set('spark.driver.memory', '60G')
    .set('spark.driver.maxResultSize', '12G')
)

# `getOrCreate` reuses an already-running context, so re-executing this cell
# no longer fails with "Cannot run multiple SparkContexts at once".
sc = SparkContext.getOrCreate(conf=conf)

# The session attaches to the context created above; the app name is kept
# identical to the one set on the SparkConf.
spark = SparkSession.builder.appName("MarketBasketAnalysis").getOrCreate()

In [3]:
# Display the active SparkSession as the cell's output.
spark

In [4]:
# Locations of the gzipped, tab-separated IMDb files on the runtime's disk.
principals_path = '/content/title.principals.tsv.gz'
movies_path = '/content/title.basics.tsv.gz'
persons_path = '/content/name.basics.tsv.gz'


def _read_tsv(path):
    """Read a tab-separated file with a header row into a Spark DataFrame.

    Column types are inferred by Spark from the file contents.
    """
    return spark.read.csv(path, sep='\t', header=True, inferSchema=True)


# One DataFrame per source file; all three are parsed with the same options.
principals = _read_tsv(principals_path)
movies = _read_tsv(movies_path)
persons = _read_tsv(persons_path)

Exploratory data analysis

In [5]:
# Preview the first five rows of the principals table.
principals.show(n=5)

+---------+--------+---------+---------------+--------------------+-----------+
|   tconst|ordering|   nconst|       category|                 job| characters|
+---------+--------+---------+---------------+--------------------+-----------+
|tt0000001|       1|nm1588970|           self|                  \N|["Herself"]|
|tt0000001|       2|nm0005690|       director|                  \N|         \N|
|tt0000001|       3|nm0374658|cinematographer|director of photo...|         \N|
|tt0000002|       1|nm0721526|       director|                  \N|         \N|
|tt0000002|       2|nm1335271|       composer|                  \N|         \N|
+---------+--------+---------+---------------+--------------------+-----------+
only showing top 5 rows



In [7]:
# Preview the first five rows of the movies table.
movies.show(n=5)

+---------+---------+--------------------+--------------------+-------+---------+-------+--------------+--------------------+
|   tconst|titleType|        primaryTitle|       originalTitle|isAdult|startYear|endYear|runtimeMinutes|              genres|
+---------+---------+--------------------+--------------------+-------+---------+-------+--------------+--------------------+
|tt0000001|    short|          Carmencita|          Carmencita|      0|     1894|     \N|             1|   Documentary,Short|
|tt0000002|    short|Le clown et ses c...|Le clown et ses c...|      0|     1892|     \N|             5|     Animation,Short|
|tt0000003|    short|      Pauvre Pierrot|      Pauvre Pierrot|      0|     1892|     \N|             4|Animation,Comedy,...|
|tt0000004|    short|         Un bon bock|         Un bon bock|      0|     1892|     \N|            \N|     Animation,Short|
|tt0000005|    short|    Blacksmith Scene|    Blacksmith Scene|      0|     1893|     \N|             1|        Comedy

In [8]:
# Preview the first five rows of the persons table.
persons.show(n=5)

+---------+---------------+---------+---------+--------------------+--------------------+
|   nconst|    primaryName|birthYear|deathYear|   primaryProfession|      knownForTitles|
+---------+---------------+---------+---------+--------------------+--------------------+
|nm0000001|   Fred Astaire|     1899|     1987|soundtrack,actor,...|tt0050419,tt00531...|
|nm0000002|  Lauren Bacall|     1924|     2014|  actress,soundtrack|tt0117057,tt00373...|
|nm0000003|Brigitte Bardot|     1934|       \N|actress,soundtrac...|tt0049189,tt00599...|
|nm0000004|   John Belushi|     1949|     1982|actor,writer,soun...|tt0078723,tt00804...|
|nm0000005| Ingmar Bergman|     1918|     2007|writer,director,a...|tt0050986,tt00839...|
+---------+---------------+---------+---------+--------------------+--------------------+
only showing top 5 rows

