In [1]:
from inspect import signature
import pandas as pd

## Part 2. Various data formats

### There are three approaches to acquiring data
1. **api** - a protocol developed by the data supplier, allows to retrieve data incrementally and lazily
2. **dump** - a block of data (typically from an enthusiast) often with compatibility issues
3. **scraping** - an algorithm to retrieve (sometimes protected) data

before scraping you should always try to find 1 or 2, it often works

## 2.1 Consider the imdb api

In [2]:
#!pip install IMDbPY
import imdb

Collecting IMDbPY
  Downloading IMDbPY-2021.4.18-py3-none-any.whl (298 kB)
[K     |████████████████████████████████| 298 kB 1.6 MB/s eta 0:00:01
[?25hCollecting SQLAlchemy
  Downloading SQLAlchemy-1.4.23-cp38-cp38-macosx_10_14_x86_64.whl (1.5 MB)
[K     |████████████████████████████████| 1.5 MB 6.3 MB/s eta 0:00:01
Collecting greenlet!=0.4.17
  Downloading greenlet-1.1.1-cp38-cp38-macosx_10_14_x86_64.whl (87 kB)
[K     |████████████████████████████████| 87 kB 5.3 MB/s eta 0:00:011
[?25hInstalling collected packages: greenlet, SQLAlchemy, IMDbPY
Successfully installed IMDbPY-2021.4.18 SQLAlchemy-1.4.23 greenlet-1.1.1
You should consider upgrading via the '/Library/Frameworks/Python.framework/Versions/3.8/bin/python3.8 -m pip install --upgrade pip' command.[0m


In [3]:
print(imdb.__doc__)


This package can be used to retrieve information about a movie or a person
from the IMDb database. It can fetch data through different media such as
the IMDb web pages, or a SQL database.



### Somewhere I found this example

In [4]:
movie_name = "matrix"
ia = imdb.IMDb()
movies = ia.search_movie(movie_name)

In [5]:
%whos

Variable     Type                    Data/Info
----------------------------------------------
ia           IMDbHTTPAccessSystem    <imdb.parser.http.IMDbHTT<...>object at 0x7f8a58fcd760>
imdb         module                  <module 'imdb' from '/Lib<...>ckages/imdb/__init__.py'>
movie_name   str                     matrix
movies       list                    n=20
pd           module                  <module 'pandas' from '/L<...>ages/pandas/__init__.py'>
signature    function                <function signature at 0x7f8a5585dca0>


In [6]:
signature(ia.search_movie)

<Signature (title, results=None, _episodes=False)>

In [7]:
movie = movies[0]
movie

<Movie id:0133093[http] title:_The Matrix (1999)_>

In [8]:
%whos

Variable     Type                    Data/Info
----------------------------------------------
ia           IMDbHTTPAccessSystem    <imdb.parser.http.IMDbHTT<...>object at 0x7f8a58fcd760>
imdb         module                  <module 'imdb' from '/Lib<...>ckages/imdb/__init__.py'>
movie        Movie                   The Matrix
movie_name   str                     matrix
movies       list                    n=20
pd           module                  <module 'pandas' from '/L<...>ages/pandas/__init__.py'>
signature    function                <function signature at 0x7f8a5585dca0>


In [9]:
print([x for x in dir(movie) if '_' not in x])

['accessSystem', 'asXML', 'charactersRefs', 'clear', 'cmpFunct', 'copy', 'currentRole', 'data', 'get', 'getAsXML', 'getID', 'guessLanguage', 'infoset2keys', 'isSame', 'isSameMovie', 'isSameTitle', 'items', 'iteritems', 'iterkeys', 'itervalues', 'key2infoset', 'keys', 'modFunct', 'movieID', 'myID', 'myTitle', 'namesRefs', 'notes', 'pop', 'popitem', 'reset', 'roleID', 'setdefault', 'smartCanonicalTitle', 'summary', 'titlesRefs', 'update', 'values']


In [10]:
print(movie.summary())

Movie
=====
Title: Matrix, The (1999)



In [11]:
movie.keys()

['title',
 'kind',
 'year',
 'cover url',
 'canonical title',
 'long imdb title',
 'long imdb canonical title',
 'smart canonical title',
 'smart long imdb canonical title',
 'full-size cover url']

In [12]:
movie.values()

['The Matrix',
 'movie',
 1999,
 'https://m.media-amazon.com/images/M/MV5BNzQzOTk3OTAtNDQ0Zi00ZTVkLWI0MTEtMDllZjNkYzNjNTc4L2ltYWdlXkEyXkFqcGdeQXVyNjU0OTQ0OTY@._V1_UX32_CR0,0,32,44_AL_.jpg',
 'Matrix, The',
 'The Matrix (1999)',
 'Matrix, The (1999)',
 'Matrix, The',
 'Matrix, The (1999)',
 'https://m.media-amazon.com/images/M/MV5BNzQzOTk3OTAtNDQ0Zi00ZTVkLWI0MTEtMDllZjNkYzNjNTc4L2ltYWdlXkEyXkFqcGdeQXVyNjU0OTQ0OTY@.jpg']

In [13]:
movie.data

{'title': 'The Matrix',
 'kind': 'movie',
 'year': 1999,
 'cover url': 'https://m.media-amazon.com/images/M/MV5BNzQzOTk3OTAtNDQ0Zi00ZTVkLWI0MTEtMDllZjNkYzNjNTc4L2ltYWdlXkEyXkFqcGdeQXVyNjU0OTQ0OTY@._V1_UX32_CR0,0,32,44_AL_.jpg'}

### Here is another example

In [14]:
movie = ia.get_movie('0133093')

In [15]:
signature(ia.get_movie)

<Signature (movieID, info=('main', 'plot'), modFunct=None)>

In [16]:
print([x for x in dir(movie) if '_' not in x])

['accessSystem', 'asXML', 'charactersRefs', 'clear', 'cmpFunct', 'copy', 'currentRole', 'data', 'get', 'getAsXML', 'getID', 'guessLanguage', 'infoset2keys', 'isSame', 'isSameMovie', 'isSameTitle', 'items', 'iteritems', 'iterkeys', 'itervalues', 'key2infoset', 'keys', 'modFunct', 'movieID', 'myID', 'myTitle', 'namesRefs', 'notes', 'pop', 'popitem', 'reset', 'roleID', 'setdefault', 'smartCanonicalTitle', 'summary', 'titlesRefs', 'update', 'values']


In [17]:
print(movie.keys())

['localized title', 'original title', 'cast', 'genres', 'runtimes', 'countries', 'country codes', 'language codes', 'color info', 'aspect ratio', 'sound mix', 'box office', 'certificates', 'original air date', 'rating', 'votes', 'cover url', 'imdbID', 'plot outline', 'languages', 'title', 'year', 'kind', 'directors', 'writers', 'producers', 'composers', 'cinematographers', 'editors', 'editorial department', 'casting directors', 'production designers', 'art directors', 'set decorators', 'costume designers', 'make up department', 'production managers', 'assistant directors', 'art department', 'sound department', 'special effects', 'visual effects', 'stunts', 'camera department', 'animation department', 'casting department', 'costume departmen', 'location management', 'music department', 'script department', 'transportation department', 'miscellaneous', 'akas', 'writer', 'director', 'top 250 rank', 'production companies', 'distributors', 'special effects companies', 'other companies', 'pl

In [18]:
from pprint import pprint

In [19]:
signature(pprint)

<Signature (object, stream=None, indent=1, width=80, depth=None, *, compact=False, sort_dicts=True)>

In [20]:
pprint(movie.data, width = 140, compact = True)

{'akas': ['Matrix (Japan, English title)', 'Matrix (France)', 'Matrix (Germany)', 'Matrix (Spain)', '黑客帝国 (China, Mandarin title)'],
 'animation department': [<Person id:0863078[http] name:_Trevor Tighe_>],
 'art department': [<Person id:0054311[http] name:_Tony Bardolph_>, <Person id:0058785[http] name:_Brett Bartlett_>,
                    <Person id:0072059[http] name:_Shane Bennett_>, <Person id:0166744[http] name:_Jake Clifton_>,
                    <Person id:0170556[http] name:_Godric Cole_>, <Person id:0171861[http] name:_Peter Collias_>,
                    <Person id:0177134[http] name:_Jules Cook_>, <Person id:0185062[http] name:_James Cox_>,
                    <Person id:0201685[http] name:_Geofrey Darrow_>, <Person id:0263016[http] name:_Marianne Evans_>,
                    <Person id:2040855[http] name:_Marc Fambro_>, <Person id:0286082[http] name:_Trish Foreman_>,
                    <Person id:0309745[http] name:_Mark Gatt_>, <Person id:0331704[http] name:_Murray Goss

In [21]:
writers = [x['name'] for x in movie['writers']]
writers

['Lilly Wachowski', 'Lana Wachowski']

In [22]:
producers = [x['name'] for x in movie['producers']]
producers

['Bruce Berman',
 'Dan Cracchiolo',
 'Carol Hughes',
 'Andrew Mason',
 'Richard Mirisch',
 'Barrie M. Osborne',
 'Joel Silver',
 'Erwin Stoff',
 'Lana Wachowski',
 'Lilly Wachowski']

In [23]:
vis = [x['name'] for x in movie['visual effects']]
vis

['Jeff Allen',
 'Charlie Armstrong',
 'Al Arthur',
 'Gil Baron',
 'Jeremy Beadell',
 'Roy Berkowitz',
 'Maureen Blume',
 'George Borshukov',
 'John Breslin',
 'Steve Burg',
 'Mark Burns',
 'Allen Cappuccilli',
 'Elizabeth Carlon',
 'Lynne Cartwright',
 'Robin Cave',
 'Daniele Colajacomo',
 'J.D. Cowles',
 'Tim Crosbie',
 'Kate Crossley',
 'Charles Darby',
 'Art David',
 'Tom Davies',
 'Steve Demers',
 'Peter Doyle',
 'David Dulac',
 'Grant Everett',
 'Matt Farell',
 'Matthew Ferro',
 'Lindsay Fleay',
 'Rebecca Fox',
 'John Gaeta',
 'Diana Giorgiutti',
 'Sally Goldberg',
 'Nico Grey',
 'Ben Gunsberger',
 'Laura Hanigan',
 'Brent Hartshorn',
 'Naomi Hatchman',
 'Michael Hemschoot',
 'Charles Henrich',
 'David Hodson',
 'Rodney Iwashina',
 'Jay Mark Johnson',
 'Krista Jordan',
 'Daniel Klem',
 'Ivo Kos',
 'Alisoun F. Lamb',
 'Maryanne Lauric',
 'John Lee',
 'Mary Leitz',
 'Kim Libreri',
 'Joseph Littlejohn',
 'Sophia S. Longoria',
 'Stephen Lunn',
 'Jane Maguire',
 'Anthony Mark Viverito'

### How to store this kind of data
- text or xml (under 10,000 observations)
- binary (under 1000,000 observations)
- sql
- nosql

In [24]:
from pprint import pprint

my_json = {}

my_json['original title'] = my_movie['original title']

my_json['directors']=[]
for director in my_movie['directors']:
    my_json['directors'].append(director['name'])
    
my_json['genres']=[]
for genre in my_movie['genres']:
    my_json['genres'].append(genre)
    
my_json['cast']=[]
for actor in my_movie['cast']:
    my_json['cast'].append({'name':actor['name'], 'long imdb name':actor['long imdb name']})
    
pprint(my_json)

NameError: name 'my_movie' is not defined

In [109]:
import json

with open('sample_1.json', 'w') as file:
    file.write(json.dumps(my_json))
    
print(json.dumps(my_json))
    
with open('sample_1.json', 'r') as file:
    new_json = json.loads(file.read())

{"original title": "\u041c\u0430\u0442\u0440\u0438\u0446\u0430 (1999)", "directors": ["Lana Wachowski", "Lilly Wachowski"], "genres": ["Action", "Sci-Fi"], "cast": [{"name": "Keanu Reeves", "long imdb name": "Keanu Reeves"}, {"name": "Laurence Fishburne", "long imdb name": "Laurence Fishburne"}, {"name": "Carrie-Anne Moss", "long imdb name": "Carrie-Anne Moss"}, {"name": "Hugo Weaving", "long imdb name": "Hugo Weaving"}, {"name": "Gloria Foster", "long imdb name": "Gloria Foster"}, {"name": "Joe Pantoliano", "long imdb name": "Joe Pantoliano"}, {"name": "Marcus Chong", "long imdb name": "Marcus Chong"}, {"name": "Julian Arahanga", "long imdb name": "Julian Arahanga"}, {"name": "Matt Doran", "long imdb name": "Matt Doran"}, {"name": "Belinda McClory", "long imdb name": "Belinda McClory"}, {"name": "Anthony Ray Parker", "long imdb name": "Anthony Ray Parker"}, {"name": "Paul Goddard", "long imdb name": "Paul Goddard"}, {"name": "Robert Taylor", "long imdb name": "Robert Taylor"}, {"name"

In [110]:
import pickle

with open('sample_1.pkl', 'wb') as file:
    file.write(pickle.dumps(my_json))
    
print(pickle.dumps(my_json))
    
with open('sample_1.pkl', 'rb') as file:
    new_json = pickle.loads(file.read())

b'\x80\x03}q\x00(X\x0e\x00\x00\x00original titleq\x01X\x15\x00\x00\x00\xd0\x9c\xd0\xb0\xd1\x82\xd1\x80\xd0\xb8\xd1\x86\xd0\xb0 (1999)q\x02X\t\x00\x00\x00directorsq\x03]q\x04(X\x0e\x00\x00\x00Lana Wachowskiq\x05X\x0f\x00\x00\x00Lilly Wachowskiq\x06eX\x06\x00\x00\x00genresq\x07]q\x08(X\x06\x00\x00\x00Actionq\tX\x06\x00\x00\x00Sci-Fiq\neX\x04\x00\x00\x00castq\x0b]q\x0c(}q\r(X\x04\x00\x00\x00nameq\x0eX\x0c\x00\x00\x00Keanu Reevesq\x0fX\x0e\x00\x00\x00long imdb nameq\x10h\x0fu}q\x11(h\x0eX\x12\x00\x00\x00Laurence Fishburneq\x12h\x10h\x12u}q\x13(h\x0eX\x10\x00\x00\x00Carrie-Anne Mossq\x14h\x10h\x14u}q\x15(h\x0eX\x0c\x00\x00\x00Hugo Weavingq\x16h\x10h\x16u}q\x17(h\x0eX\r\x00\x00\x00Gloria Fosterq\x18h\x10h\x18u}q\x19(h\x0eX\x0e\x00\x00\x00Joe Pantolianoq\x1ah\x10h\x1au}q\x1b(h\x0eX\x0c\x00\x00\x00Marcus Chongq\x1ch\x10h\x1cu}q\x1d(h\x0eX\x0f\x00\x00\x00Julian Arahangaq\x1eh\x10h\x1eu}q\x1f(h\x0eX\n\x00\x00\x00Matt Doranq h\x10h u}q!(h\x0eX\x0f\x00\x00\x00Belinda McCloryq"h\x10h"u}q#(h\x0eX\x1

In [116]:
!du -sh *

4.0K	Lecture 0 online.ipynb
192K	Lecture 0_1.ipynb
124K	Lecture 0_2.ipynb
 32K	Lecture 1_1.ipynb
 24K	Lecture 1_2.ipynb
 44K	Lecture 1_3.ipynb
4.2M	Mamedli_lecture1.pptx
605M	data.tsv
8.0K	sample_1.db
4.0K	sample_1.json
4.0K	sample_1.pkl


### what is sql
SQL is like a dataframe but it does not have to fit into memory, queries are processed incrementally.

the price for this is uglier interface through special queries like "SELECT * from table"

for medium-size datasets (~10Gb fit on hard drive) I personally use **sqlite**

In [136]:
import sqlite3

In [137]:
conn = sqlite3.connect('sample_1.db')
c = conn.cursor()

c.execute("DROP TABLE IF EXISTS movies")
c.execute("CREATE TABLE movies (name text, directors text, cast text, genres text)")

c.execute("INSERT INTO movies VALUES ('Terminator','James Cameron',NULL,NULL)")
c.execute("INSERT INTO movies VALUES ('Елки',NULL,NULL,NULL)")
c.execute("INSERT INTO movies VALUES ('Матрица (1999)','Lana Wachowski & Lilly Wachowski','Action & Sci-Fi',NULL)")

conn.commit()
conn.close()

In [138]:
data = []

In [139]:
conn = sqlite3.connect('sample_1.db')

c = conn.cursor()
c.execute("SELECT * from movies")

for f in c.fetchall():
    data.append(f)

conn.commit()
conn.close()

In [140]:
pd.DataFrame(data, columns = ['name', 'directors', 'cast', 'genres'])

Unnamed: 0,name,directors,cast,genres
0,Terminator,James Cameron,,
1,Елки,,,
2,Матрица (1999),Lana Wachowski & Lilly Wachowski,Action & Sci-Fi,


### what is nosql

In [32]:
!brew install mongodb-community@5.0

zsh:1: command not found: brew


In [28]:
!brew services start mongodb-community@5.0

zsh:1: command not found: brew


In [174]:
from pymongo import MongoClient
client = MongoClient()

In [175]:
client.drop_database('tutorial')
client.list_database_names()

['admin', 'config', 'local']

In [176]:
db = client['tutorial']
collection = db['movies']
collection.insert_one(my_json)

<pymongo.results.InsertOneResult object at 0x7fcbbc8e5460>

In [177]:
result = client.tutorial.movies.find()

In [178]:
next(result)

{'_id': ObjectId('61371d8ca85757636a7b04fe'), 'original title': 'Матрица (1999)', 'directors': ['Lana Wachowski', 'Lilly Wachowski'], 'genres': ['Action', 'Sci-Fi'], 'cast': [{'name': 'Keanu Reeves', 'long imdb name': 'Keanu Reeves'}, {'name': 'Laurence Fishburne', 'long imdb name': 'Laurence Fishburne'}, {'name': 'Carrie-Anne Moss', 'long imdb name': 'Carrie-Anne Moss'}, {'name': 'Hugo Weaving', 'long imdb name': 'Hugo Weaving'}, {'name': 'Gloria Foster', 'long imdb name': 'Gloria Foster'}, {'name': 'Joe Pantoliano', 'long imdb name': 'Joe Pantoliano'}, {'name': 'Marcus Chong', 'long imdb name': 'Marcus Chong'}, {'name': 'Julian Arahanga', 'long imdb name': 'Julian Arahanga'}, {'name': 'Matt Doran', 'long imdb name': 'Matt Doran'}, {'name': 'Belinda McClory', 'long imdb name': 'Belinda McClory'}, {'name': 'Anthony Ray Parker', 'long imdb name': 'Anthony Ray Parker'}, {'name': 'Paul Goddard', 'long imdb name': 'Paul Goddard'}, {'name': 'Robert Taylor', 'long imdb name': 'Robert Taylor'

In [179]:
!brew services stop mongodb-community@5.0

Stopping `mongodb-community`... (might take a while)
[34m==>[0m [1mSuccessfully stopped `mongodb-community` (label: homebrew.mxcl.mongodb-commu[0m
