In [1]:
import json
import pandas as pd


In [2]:
import os

json_files = []
for root, dirs, files in os.walk("./train_metadata"):  
    for filename in files:
        json_files.append(filename)

json_files[0:5]

['0008c5398-1.json',
 '0008c5398-2.json',
 '0008c5398-3.json',
 '0008c5398-4.json',
 '0008c5398-5.json']

In [None]:
total = pd.DataFrame()
for file in json_files:
    filename = 'D:/Drexel/LeBow/Python/Project/train_metadata/' + file
    with open(filename, "r", encoding = "utf8") as read_file:
        data = json.load(read_file)
        labels = data['labelAnnotations']
        df2 = pd.DataFrame(labels)
        df2['image'] = file[1:file.find('.')]
        total = total.append(df2)

In [9]:
total.head()

Unnamed: 0,description,mid,score,topicality,image
0,cat,/m/01yrx,0.99437,0.99437,008c5398-1
1,small to medium sized cats,/m/07k6w8,0.92139,0.92139,008c5398-1
2,whiskers,/m/01l7qd,0.917497,0.917497,008c5398-1
3,cat like mammal,/m/0307l,0.897073,0.897073,008c5398-1
4,eye,/m/014sv8,0.800121,0.800121,008c5398-1


In [None]:
total.to_csv("labelAnnotations.csv")

## Another approach - a better one

In [4]:
import pandas as pd
import json
from pandas.io.json import json_normalize

In [4]:
with open("../Project/train_metadata/000a290e4-1.json", "r") as f:
    data = json.load(f)
    
type(data)

dict

In [5]:
df = json_normalize(data)
df.head()

Unnamed: 0,cropHintsAnnotation.cropHints,imagePropertiesAnnotation.dominantColors.colors,labelAnnotations
0,"[{'boundingPoly': {'vertices': [{}, {'x': 359}...","[{'color': {'red': 155, 'green': 112, 'blue': ...","[{'mid': '/m/0bt9lr', 'description': 'dog', 's..."


### Here we see three columns each containing nested json. We will try to get individual json structures in structured formats and then combine each

In [7]:
df1 = json_normalize(data = data['labelAnnotations'])
df1.head()

Unnamed: 0,description,mid,score,topicality
0,dog,/m/0bt9lr,0.964141,0.964141
1,dog breed,/m/0kpmf,0.941975,0.941975
2,dog like mammal,/m/01z5f,0.92154,0.92154
3,dog breed group,/m/02xl47d,0.899459,0.899459
4,phalÃ¨ne,/m/0393qn,0.717898,0.717898


In [11]:
df2 = json_normalize(data = data['imagePropertiesAnnotation']['dominantColors']['colors'])
df2.head()

Unnamed: 0,color.blue,color.green,color.red,pixelFraction,score
0,91,112,155,0.015744,0.138713
1,129,146,165,0.196276,0.072678
2,21,23,35,0.008152,0.061905
3,56,75,109,0.011588,0.129732
4,34,47,76,0.008711,0.111193


In [22]:
df3 = json_normalize(data = data['cropHintsAnnotation']['cropHints'], record_path=['boundingPoly'], meta=['confidence', 'importanceFraction'])
df3.head()

Unnamed: 0,0,confidence,importanceFraction
0,vertices,0.8,1


The column 'score' appears twice - first in df1 and then in df2. Needs to be taken care

In [23]:
df2['colorScore'] = df2['score']
del df2['score']

In [27]:
df2.head()

Unnamed: 0,color.blue,color.green,color.red,pixelFraction,colorScore
0,91,112,155,0.015744,0.138713
1,129,146,165,0.196276,0.072678
2,21,23,35,0.008152,0.061905
3,56,75,109,0.011588,0.129732
4,34,47,76,0.008711,0.111193


In [31]:
df2 = df2.rename(columns = {'colorScore': 'colorscore'})
df2.head()

Unnamed: 0,color.blue,color.green,color.red,pixelFraction,colorscore
0,91,112,155,0.015744,0.138713
1,129,146,165,0.196276,0.072678
2,21,23,35,0.008152,0.061905
3,56,75,109,0.011588,0.129732
4,34,47,76,0.008711,0.111193


### Combine both methods to achieve desired result

In [2]:
import pandas as pd
import json
from pandas.io.json import json_normalize
import os

In [None]:
json_files = []
for root, dirs, files in os.walk("./train_metadata"):  
    for filename in files:
        json_files.append(filename)

json_files[0:5]

In [20]:
labels_df = pd.DataFrame()
colors_df = pd.DataFrame()
for file in json_files[1:200]:
    filename = 'D:/Drexel/LeBow/Python/Project/train_metadata/' + file
    with open(filename, "r", encoding = "utf8") as read_file:
        data = json.load(read_file)
        if "labelAnnotations" in data.keys():
            df1 = json_normalize(data = data['labelAnnotations'])
        if "imagePropertiesAnnotation" in data.keys():        
            df2 = json_normalize(data = data['imagePropertiesAnnotation']['dominantColors']['colors'])
        
        df1['image'] = df2['image'] = file[1:file.find(".")]
        labels_df = labels_df.append(df1)
        colors_df = colors_df.append(df2)

In [21]:
labels_df.head()

Unnamed: 0,description,mid,score,topicality,image
0,cat,/m/01yrx,0.993564,0.993564,008c5398-2
1,small to medium sized cats,/m/07k6w8,0.921133,0.921133,008c5398-2
2,cat like mammal,/m/0307l,0.888296,0.888296,008c5398-2
3,whiskers,/m/01l7qd,0.845597,0.845597,008c5398-2
4,domestic short haired cat,/m/012c9l,0.79756,0.79756,008c5398-2


In [14]:
colors_df.head()

Unnamed: 0,color.blue,color.green,color.red,pixelFraction,score,image
0,149,159,167,0.123551,0.201247,008c5398-2
1,35,75,120,0.027811,0.126252,008c5398-2
2,85,109,154,0.014785,0.088672,008c5398-2
3,15,18,24,0.261488,0.024348,008c5398-2
4,101,166,153,0.003117,0.001488,008c5398-2


In [1]:
labels_df.to_csv("labelAnnotations.csv", sep=",")
colors_df.to_csv("colorAnnotations.csv", sep=",")

NameError: name 'labels_df' is not defined

In [None]:
json_files = []
for root, dirs, files in os.walk("./train_metadata"):  
    for filename in files:
        json_files.append(filename)

json_files[0:5]