In [None]:
# Reading a text file
filename = 'huck_finn.txt'

# Manual open/close: mode='r' opens the file read-only.
file = open(filename, mode='r')
try:
    text = file.read()
finally:
    # Release the handle even if read() raises.
    file.close()

# NOTE: mode='w' opens a file for writing and truncates it immediately.
# Opening huck_finn.txt that way would erase it (and leave the handle open),
# so the write example is shown as a comment against a different file:
#     with open('output.txt', mode='w') as out:
#         out.write(text)

# Preferred: a context manager closes the file automatically on exit.
with open(filename, mode='r') as file:
    print(file.read())

In [None]:
# Customizing a NumPy text import
import numpy as np

filename = 'MNIST_header.txt'

# Comma-delimited file: skip the first row, keep only columns 0 and 2,
# and load every value as a string.
data = np.loadtxt(
    filename,
    delimiter=',',
    skiprows=1,
    usecols=[0, 2],
    dtype='str',
)
print(data)

# np.recfromcsv() - for csv files
# NOTE(review): recfromcsv appears to be deprecated in NumPy 2.x - confirm,
# and consider np.genfromtxt(..., delimiter=',') instead.

Manipulating pandas DataFrames
- Exploratory data analysis
- Data wrangling
- Data preprocessing
- Building models
- Visualization
- Using pandas for these tasks is standard best practice

In [None]:
# Importing a flat file using pandas
import pandas as pd

filename = 'winequality-red.csv'
data = pd.read_csv(filename)

# NumPy array holding the DataFrame's values.
data_array = data.values

# A notebook cell only displays its LAST expression, so the preview goes at
# the end; as a mid-cell statement its result was silently discarded.
data.head()

- `'rb'` - open the file read-only, in binary mode (pickle files are binary)

In [None]:
# Pickled files
import pickle

# SECURITY: unpickling can execute arbitrary code - only load files you trust.
# 'rb' opens the file read-only in binary mode.
with open('pickled_fruit.pkl', 'rb') as pkl_file:
    data = pickle.load(pkl_file)

print(data)

In [None]:
# Importing Excel spreadsheets
import pandas as pd

file = 'urbanpop.xlsx'

# ExcelFile keeps the workbook open; the context manager closes it once all
# needed sheets have been parsed.
with pd.ExcelFile(file) as data:
    # List the names of every sheet in the workbook.
    print(data.sheet_names)
    df1 = data.parse('1960-1966')  # select a sheet by name (a string)
    df2 = data.parse(0)            # select a sheet by position (an integer, 0-based)

SAS and Stata files
- SAS: Statistical Analysis System
- Stata: "Statistics" + "data"
- SAS: business analytics and biostatistics
- Stata: academic social sciences research

In [None]:
# Importing a SAS file
import pandas as pd
from sas7bdat import SAS7BDAT

# Open the SAS file in a context manager and convert it to a DataFrame
# before the block exits.
with SAS7BDAT('urban.sas7bdat') as sas_file:
    df_sas = sas_file.to_data_frame()

In [None]:
# Importing a Stata (.dta) file straight into a DataFrame
import pandas as pd

data = pd.read_stata("urbanpop.dta")

HDF5 files
- Hierarchical Data Format version 5
- Standard for storing large quantities of numerical data
- Datasets can be hundreds of gigabytes or terabytes
- HDF5 can scale to exabytes

In [None]:
# Importing an HDF5 file
import h5py
import numpy as np  # used by np.array() below; don't rely on an earlier cell's import

filename = 'H-Hi_LOSC_4_V1-815411200-4096.hdf5'

# 'r' opens the file read-only. The context manager closes it when the block
# exits, so everything that reads from `data` must happen inside the block.
with h5py.File(filename, 'r') as data:
    print(type(data))

    # Structure of the HDF5 file: top-level keys
    for key in data.keys():
        print(key)

    # Keys inside the 'meta' group
    for key in data['meta'].keys():
        print(key)

    # Convert the entries to NumPy arrays while the file is still open.
    print(np.array(data['meta']['Description']), np.array(data['meta']['Detector']))


Using SciPy to load MATLAB data
- scipy.io.loadmat() - read .mat files
- scipy.io.savemat() - write .mat files

In [None]:
# Importing a MATLAB .mat file
import scipy.io

filename = 'workspace.mat'

# Read the .mat file and show the type of the object loadmat() returns.
mat = scipy.io.loadmat(file_name=filename)
print(type(mat))

Creating a database engine
- SQLite database
    - Fast and simple
- SQLAlchemy
    - Works with many Relational Database Management Systems

In [None]:
from sqlalchemy import create_engine, inspect

# Create an engine for the SQLite database file Northwind.sqlite.
engine = create_engine('sqlite:///Northwind.sqlite')

# Getting table names
# Engine.table_names() is deprecated in SQLAlchemy 1.4 and removed in 2.0;
# the Inspector API is the supported way to list tables.
tables_name = inspect(engine).get_table_names()
print(tables_name)

Workflow of SQL querying
- Import packages and functions
- Create the database engine
- Connect to the engine
- Query the database
- Save query results to a DataFrame
- Close the connection

In [None]:
# First query
from sqlalchemy import create_engine, text
import pandas as pd

engine = create_engine('sqlite:///Northwind.sqlite')

# Open the connection explicitly; it must be closed by hand afterwards.
con = engine.connect()
try:
    # Raw SQL strings must be wrapped in text(): SQLAlchemy 2.0 no longer
    # accepts a plain str in Connection.execute().
    rs = con.execute(text('SELECT * FROM Orders'))
    # Pass the column names at construction time so an empty result still
    # produces a correctly labelled (empty) DataFrame.
    df = pd.DataFrame(rs.fetchall(), columns=list(rs.keys()))
finally:
    # Close the connection even if the query raises.
    con.close()
print(df.head())

In [None]:
# Using the context manager
# No need to close the connection: the with-block closes it on exit.
from sqlalchemy import create_engine, text
import pandas as pd

engine = create_engine('sqlite:///Northwind.sqlite')

with engine.connect() as con:
    # Raw SQL strings must be wrapped in text(): SQLAlchemy 2.0 no longer
    # accepts a plain str in Connection.execute().
    rs = con.execute(text('SELECT OrderID, OrderDate, ShipName FROM Orders'))
    # Fetch only 5 rows instead of all; passing the column names up front
    # keeps the DataFrame valid even when no rows come back.
    df = pd.DataFrame(rs.fetchmany(size=5), columns=list(rs.keys()))

In [None]:
# The pandas way to query: one call runs the SQL and builds the DataFrame.
import pandas as pd

# `engine` is not created here; it must already exist from an earlier cell.
df = pd.read_sql_query(sql='SELECT * FROM Orders', con=engine)

In [None]:
# INNER JOIN in Python
from sqlalchemy import create_engine
import pandas as pd

engine = create_engine('sqlite:///Northwind.sqlite')

# Adjacent string literals are concatenated into a single SQL statement;
# the trailing spaces keep the clauses separated.
query = (
    'SELECT OrderID, CompanyName '
    'FROM Orders '
    'INNER JOIN Customers on Orders.CustomerID = Customers.CustomerID'
)
df = pd.read_sql_query(query, engine)
print(df.head())