# ITNPBD2: Representing and Manipulating Data
## University of Stirling
## Dr. Kevin Swingler

# Python Dictionaries
- Data structures indexed by named keys
- Basic structure is an unsorted list of key-value pairs
- Stored as a hashmap for speed of access by key
- Enclosed in `{}` braces

In [43]:
modules = {'ITNPBD1':'Mathematics for Big Data',
        'ITNPBD2':'Representing and Manipulating Data',
        'ITNPBD3':'Relational and Non-Relational Databases'}

print(modules)

values = {"string" : '\"This is an example of a string\"',
        'float' : '1.0', # we can use either '' or "" for both keys and values
         "integer" : "1"}

display(values)

{'ITNPBD1': 'Mathematics for Big Data', 'ITNPBD2': 'Representing and Manipulating Data', 'ITNPBD3': 'Relational and Non-Relational Databases'}


{'string': '"This is an example of a string"', 'float': '1.0', 'integer': '1'}

## Access a single entry by its key

In [44]:
print(modules['ITNPBD2'])

print(modules['ITNPBD3'])

print(values['string'])
print(values['float'])
print(values['integer'])

Representing and Manipulating Data
Relational and Non-Relational Databases
"This is an example of a string"
1.0
1


## List all the keys and values

In [46]:
for mcode, description in modules.items():
    print(f"The module {mcode} is called {description}")
    
for v, w in values.items():
    print(f"Example of a '{v}' could be {w}")

The module ITNPBD1 is called Mathematics for Big Data
The module ITNPBD2 is called Representing and Manipulating Data
The module ITNPBD3 is called Relational and Non-Relational Databases
Example of a 'string' could be "This is an example of a string"
Example of a 'float' could be 1.0
Example of a 'integer' could be 1


## Add a new entry with `[ ]`

In [60]:
modules['ITNPBD4'] = 'Scientific and Commercial Applications of Big Data'

values['character'] = '\'c\''

for key, value in values.items():
    print(f"values = key is {key} : value is {value}")

values = key is string : value is "This is an example of a string"
values = key is float : value is 1.0
values = key is integer : value is 1
values = key is character : value is 'c'


## Loop through just the keys

In [72]:
for mcode in modules:
    print(mcode)
    print(modules[mcode])  # Can also use mcode to access value
        
print("\nPrint just the dictionary's key")
for key in values:
    print(key)

print("\nPrint just the dictionary's values")
for v in values:
    print(values[v])
    
print("\nPrint the dictionary's keys and values")    
for key in values:
    print(key, ":", values[v])

    


ITNPBD1
Mathematics for Big Data
ITNPBD2
Representing and Manipulating Data
ITNPBD3
Relational and Non-Relational Databases
ITNPBD4
Scientific and Commercial Applications of Big Data

Print just the dictionary's key
string
float
integer
character

Print just the dictionary's values
"This is an example of a string"
1.0
1
'c'

Print the dictionary's keys and values
string : "This is an example of a string"
float : 1.0
integer : 1
character : 'c'


## Nesting objects
- Values can be any Python object
- For example lists or dictionaries

In [None]:
modules['ITNPBD6'] = ['Data Mining', 'Machine Learning', 'Data Visualisation']
print(modules['ITNPBD6'])
print(modules['ITNPBD6'][0])

In [None]:
bd6_dict = {'name':'Data Analytics','topics':['Data Mining','Machine Learning','Data Visualisation'],'Lecturer':'Kevin Swingler'}
print(bd6_dict)

In [None]:
modules['ITNPBD6'] = bd6_dict
print(modules['ITNPBD6'])
print(modules['ITNPBD6']['Lecturer'])
print(modules['ITNPBD6']['topics'][1])

# JSON
Dictionaries look a lot like JSON there are some differences:
- JSON is just a string based data representation, but a dictionary is a data structure
- The key in a dictionary is hashed to aid fast in memory access
- The key in JSON must be a string. In a dictionary, it can be any hashable type
- Single quotes in a dictionary, double quotes in JSON
## Main Python methods
- `json.loads()` Produces dictionary from json string
- `json.dumps()` Produces json string from object

In [8]:
import json

json_string = '{"Name":"Kevin", "Age":50}'
dict_obj = {"Name":"Kevin", "Age":50}

dict_from_string = json.loads(json_string)
string_from_dict = json.dumps(dict_obj)

print(json_string, type(json_string))
print(dict_obj, type(dict_obj))
print(dict_from_string, type(dict_from_string))
print(string_from_dict, type(string_from_dict))


{"Name":"Kevin", "Age":50} <class 'str'>
{'Age': 50, 'Name': 'Kevin'} <class 'dict'>
{'Age': 50, 'Name': 'Kevin'} <class 'dict'>
{"Age": 50, "Name": "Kevin"} <class 'str'>


In [None]:
mod_str=json.dumps(modules, indent=2)
print(mod_str)

### To do the same with files, not strings, drop the `s`, so `dump` or `load`

In [9]:
with open('data/MovieData.json') as f:
    movies = json.load(f)

print(movies)



## Here we have loaded an array of json objects about movies
- Let's find all the top level keys

In [10]:
keyset = set()
for mov in movies:
    for k in mov:
        keyset.add(k)
        
print(keyset)

{'writers', 'poster', 'awards', 'runtime', 'year', 'metacritic', 'type', 'actors', 'director', 'genres', 'plot', 'imdb', 'title', 'rated', 'countries', 'tomato'}


In [11]:
display(movies[0])

{'actors': ['Claudia Cardinale',
  'Henry Fonda',
  'Jason Robards',
  'Charles Bronson'],
 'awards': {'nominations': 5, 'text': '4 wins & 5 nominations.', 'wins': 4},
 'countries': ['Italy', 'USA', 'Spain'],
 'director': 'Sergio Leone',
 'genres': ['Western'],
 'imdb': {'id': 'tt0064116', 'rating': 8.6, 'votes': 201283},
 'metacritic': 80,
 'plot': 'Epic story of a mysterious stranger with a harmonica who joins forces with a notorious desperado to protect a beautiful widow from a ruthless assassin working for the railroad.',
 'poster': 'http://ia.media-imdb.com/images/M/MV5BMTEyODQzNDkzNjVeQTJeQWpwZ15BbWU4MDgyODk1NDEx._V1_SX300.jpg',
 'rated': 'PG-13',
 'runtime': 175,
 'title': 'Once Upon a Time in the West',
 'tomato': {'consensus': 'A landmark Sergio Leone spaghetti western masterpiece featuring a classic Morricone score.',
  'fresh': 53,
  'image': 'certified',
  'meter': 98,
  'rating': 9,
  'reviews': 54,
  'userMeter': 95,
  'userRating': 4.3,
  'userReviews': 64006},
 'type': 

## Pick out a single field

In [13]:
print(movies[0]['genres'])

['Western']


## Now pull out all the genres with a similar pattern of code:

In [15]:
genres = set()
for mov in movies:
    genres.add(mov['genres'])  # Won't work - you cannot have a set of lists
        
print(genres)

TypeError: unhashable type: 'list'

## You cannot have a set of mutable lists - must convert to an imutable type, e.g. tuple
- Discuss this for a minute or two!!

In [16]:
genres = set()
for mov in movies:
    genres.add(tuple(mov['genres']))  # Won't work - you cannot have a set of lists
        
print(genres)

{('Action', 'Drama'), ('History',), ('Documentary', 'Biography', 'Crime'), ('Short', 'Family'), ('Comedy', 'Family', 'Sport'), ('Drama', 'Romance', 'War'), ('Documentary', 'War'), ('Comedy', 'Music', 'War'), ('Action', 'Drama', 'Thriller'), ('Comedy', 'Drama', 'Thriller'), ('Adventure', 'Comedy', 'Drama'), ('Mystery',), ('Documentary', 'Short', 'History', 'News', 'War'), ('Horror', 'Comedy'), ('Fantasy', 'Sci-Fi'), ('Crime', 'Drama', 'War'), ('Drama', 'Mystery', 'Sci-Fi'), ('Sport',), ('Action',), ('Adventure', 'Comedy', 'Fantasy'), ('Reality-TV',), ('Drama', 'Comedy', 'Sci-Fi'), ('Family', 'Animation', 'Short'), ('Drama', 'Mystery', 'Romance'), ('Adventure', 'Musical', 'Romance'), ('Crime', 'Mystery', 'Drama'), ('Action', 'Western'), ('Action', 'Sci-Fi'), ('Biography', 'Drama', 'War'), ('Crime', 'Drama', 'Musical'), ('Western', 'Adventure', 'Comedy'), ('Documentary', 'Talk-Show'), ('Adventure', 'Drama'), ('Drama', 'Sci-Fi', 'Thriller'), ('Biography', 'Comedy', 'Crime'), ('Comedy', 'Ac

## Maybe that is not what we really want,
- Try again for a list of genres

In [17]:
genres = set()
for mov in movies:
    for genre in mov['genres']:
        genres.add(genre)
        
print(genres)

{'Crime', 'Mystery', 'Short', 'Documentary', 'Film-Noir', 'Action', 'Thriller', 'History', 'News', 'Horror', 'Reality-TV', 'Sci-Fi', 'Music', 'Western', 'Musical', 'War', 'Comedy', 'Talk-Show', 'Romance', 'Drama', 'Adult', 'Biography', 'Fantasy', 'Family', 'Adventure', 'Sport', 'Game-Show', 'Animation'}


## That is more like it.
- Something more challenging now - find the average rating by genre
- This time we will build a list of dicts of the form `{genrename: {'num':number of examples, 'rating': av rating}}` 

In [34]:
genres = {}
for mov in movies:
    #print(mov['imdb'])
    for genre in mov['genres']:
        if mov['imdb']['rating'] is not None:    # Delete this first to see the problem it is fixing!
            if genre not in genres:
                genres[genre] = {'num':1, 'rating':int(mov['imdb']['rating'])}
            else:
                genres[genre]['num'] += 1
                genres[genre]['rating'] += int(mov['imdb']['rating'])
        
print(genres)
for genre in genres:
    genres[genre]['rating'] = genres[genre]['rating']/genres[genre]['num']
print(genres)

{'News': {'rating': 14, 'num': 2}, 'Crime': {'rating': 1080, 'num': 171}, 'Mystery': {'rating': 281, 'num': 43}, 'Comedy': {'rating': 3877, 'num': 656}, 'Film-Noir': {'rating': 31, 'num': 4}, 'Short': {'rating': 962, 'num': 154}, 'Documentary': {'rating': 1213, 'num': 187}, 'Romance': {'rating': 1336, 'num': 222}, 'Action': {'rating': 1463, 'num': 251}, 'Drama': {'rating': 3897, 'num': 619}, 'Thriller': {'rating': 668, 'num': 117}, 'History': {'rating': 163, 'num': 25}, 'Biography': {'rating': 399, 'num': 60}, 'Horror': {'rating': 568, 'num': 107}, 'Fantasy': {'rating': 522, 'num': 86}, 'Reality-TV': {'rating': 9, 'num': 2}, 'Sci-Fi': {'rating': 552, 'num': 97}, 'Family': {'rating': 681, 'num': 114}, 'Adult': {'rating': 35, 'num': 6}, 'Adventure': {'rating': 1162, 'num': 192}, 'Sport': {'rating': 80, 'num': 14}, 'Music': {'rating': 504, 'num': 74}, 'Western': {'rating': 207, 'num': 33}, 'Animation': {'rating': 582, 'num': 93}, 'Musical': {'rating': 257, 'num': 42}, 'War': {'rating': 20

# Sets, lists and dicts Summary
- You can have a list of dicts
- You can have a list of sets
- Both are mutable
- You cannot have a set of mutable objects like lists or dicts
- You can have a immutable objects like tuples

In [36]:
list_of_dicts = [{'a':1},{'b':2}]
list_of_dicts

[{'a': 1}, {'b': 2}]

In [39]:
list_of_sets = [set((1,2,3)),set((3,3,4))]
list_of_sets

[{1, 2, 3}, {3, 4}]

In [41]:
set_of_dicts = set(({'a':1},{'b':2}))


TypeError: unhashable type: 'dict'

In [43]:
set_of_lists = set(([1, 2, 4],[1,2,3]))


TypeError: unhashable type: 'list'

In [45]:
set_of_tuples = set(((1, 2, 4),(1,2,3)))
set_of_tuples

{(1, 2, 3), (1, 2, 4)}