In [8]:
# Working with JSON
# Python's standard-library json module converts JSON objects to and from dictionaries

import json
from io import StringIO

import pandas as pd
# Sample record: a nested dict, which corresponds one-to-one to a JSON object.
data = {"president": dict(name="Zaphod Beeblebrox", species="Betelgeusian")}

# Serialize the dict to JSON text and store it in a file.
with open("data_file.json", "w") as write_file:
    write_file.write(json.dumps(data))

# Read the text back and parse it; the result is a plain dict again.
with open("data_file.json", "r") as read_file:
    data = json.loads(read_file.read())

print(data)
print(type(data))


# Load JSON text directly into a pandas DataFrame

jsonStr = '''{"Index0":{"Courses": "Pandas","Discount": "1200"},
           "Index1":{"Courses": "Hadoop","Discount": "1500"},
           "Index2":{"Courses": "Spark","Discount": "1800"}
          }'''

# Convert JSON to DataFrame using read_json().
# read_json expects a path or a file-like object; passing the raw JSON text
# itself is deprecated (pandas >= 2.1), so the string is wrapped in StringIO.
# orient='index' turns each top-level key (Index0, Index1, ...) into a row label.
df2 = pd.read_json(StringIO(jsonStr), orient='index')
print(df2)

# Convert the dictionary loaded from data_file.json into a DataFrame.
# With orient='index' each outer key ('president') becomes a row label and the
# inner keys ('name', 'species') become the columns.
# (The bare `data['president']` lookup that used to sit here was dropped: in the
# middle of a cell its value is discarded, so it displayed nothing.)
df3 = pd.DataFrame.from_dict(data, orient='index')

{'president': {'name': 'Zaphod Beeblebrox', 'species': 'Betelgeusian'}}
<class 'dict'>
       Courses  Discount
Index0  Pandas      1200
Index1  Hadoop      1500
Index2   Spark      1800


In [None]:
# Working with CSV
# Read the file once: 'Name' becomes the row index and 'Hire Date' is parsed
# into datetimes. (An earlier plain read without parse_dates was overwritten
# immediately, so it only parsed the same file a second time.)
df = pd.read_csv('data/hrdata.csv', index_col='Name', parse_dates=['Hire Date'])

# Write the frame, index included, to a new CSV file.
df.to_csv('data/hrdata_modified.csv')

In [None]:
# working with xlsx 
# install openpyxl : pip install openpyxl

from openpyxl import Workbook

# Create a new in-memory workbook and take its active (default) worksheet.
workbook = Workbook()
sheet = workbook.active

# Fill the first row cell by cell, then persist the workbook to disk.
for cell_ref, text in (("A1", "hello"), ("B1", "world!")):
    sheet[cell_ref] = text
workbook.save(filename="hello_world.xlsx")

# Reading an Excel file
from openpyxl import load_workbook
workbook = load_workbook(filename="data/sample-xlsx-file.xlsx")
# `workbook.sheetnames` is a plain list of sheet titles, so indexing it with a
# string raises TypeError. A worksheet is fetched by title from the workbook
# itself. Assumes the file contains a sheet titled 'Sheet 1' — TODO confirm.
worksheet = workbook["Sheet 1"]

# Load the spreadsheet into a pandas DataFrame
excel_df = pd.read_excel(io='data/sample-xlsx-file.xlsx')

# Save the DataFrame to a new xlsx file
excel_df.to_excel(excel_writer='data/sample-xlsx-file-modifeid.xlsx')

In [None]:
# Working with avro 
# Apache Avro is a data serialization format. We can store data as .avro files on disk.
#Avro files are typically used with Spark but Spark is completely independent of Avro.
#Avro is a row-based format that is suitable for evolving data schemas. 
#One benefit of using Avro is that schema and metadata travels with the data.
# pip install avro-python3

# Python 3 with `avro-python3` package available
import copy
import json
import avro
from avro.datafile import DataFileWriter, DataFileReader
from avro.io import DatumWriter, DatumReader

# The record name below is the "full name": namespace and name joined together.
user_fields = [
    {'name': 'name', 'type': 'string'},
    {'name': 'age', 'type': 'int'},
]
schema = dict(name='avro.example.User', type='record', fields=user_fields)

# The parser takes the schema as JSON text, so serialize the dict first.
schema_json = json.dumps(schema)
schema_parsed = avro.schema.Parse(schema_json)


# Append each user record to users.avro using the parsed schema
user_records = (
    {'name': 'Pierre-Simon Laplace', 'age': 77},
    {'name': 'John von Neumann', 'age': 53},
)
with open('users.avro', 'wb') as f:
    writer = DataFileWriter(f, DatumWriter(), schema_parsed)
    for record in user_records:
        writer.append(record)
    writer.close()

# Load the records and the embedded schema back from users.avro
with open('users.avro', 'rb') as f:
    reader = DataFileReader(f, DatumReader())
    # Copy the file's metadata before the reader is closed.
    metadata = copy.deepcopy(reader.meta)
    raw_schema = metadata['avro.schema']
    schema_from_file = json.loads(raw_schema)
    # Iterating the reader yields one record per appended user.
    users = list(reader)
    reader.close()

# Show the schema at each stage next to the records that were read back.
print(f'Schema that we specified:\n {schema}')
print(f'Schema that we parsed:\n {schema_parsed}')
print(f'Schema from users.avro file:\n {schema_from_file}')
print(f'Users:\n {users}')

# using pandas 
#pip install pandavro
import copy
import json
import pandas as pd
import pandavro as pdx
from avro.datafile import DataFileReader
from avro.io import DatumReader

# Data to be saved
users = [{'name': 'Pierre-Simon Laplace', 'age': 77},
         {'name': 'John von Neumann', 'age': 53}]
users_df = pd.DataFrame.from_records(users)
print(users_df)

# Write the DataFrame with pandavro and keep the path in one place, so the
# schema check below inspects the file that was just written. Previously the
# check re-opened 'users.avro' from the earlier section, so the schema produced
# by pandavro was never looked at.
avro_path = 'data/users_test.avro'
pdx.to_avro(avro_path, users_df)

# Check the schema stored in the file written by pandavro
with open(avro_path, 'rb') as f:
    reader = DataFileReader(f, DatumReader())
    # Copy the metadata before the reader is closed.
    metadata = copy.deepcopy(reader.meta)
    schema_from_file = json.loads(metadata['avro.schema'])
    reader.close()
print(schema_from_file)
