# Input and Output in Python, Formatting of Strings & Dates

## Libraries and settings

In [None]:
# Libraries
import os
import re
import pytz
import json
import folium
import sqlite3
import pandas as pd
import matplotlib.pyplot as plt

from datetime import date
from datetime import time

from zipfile import ZipFile
from bs4 import BeautifulSoup

from PyPDF2 import PdfReader

from reportlab.lib.units import inch
from reportlab.lib.colors import blue
from reportlab.lib.pagesizes import LETTER
from reportlab.pdfgen.canvas import Canvas

# Ignore warnings
# (blanket filter: warnings of every category are suppressed from here on)
import warnings
warnings.filterwarnings('ignore')

# Show current working directory
# (the relative file names used in this notebook are resolved against it)
print(os.getcwd())

## Read & write data from/to a database
Most data-driven companies store their data in a database management system. <b style="color:yellowgreen">SQLite</b> is a lightweight relational database management system (RDBMS). With Python you can connect to an SQLite DB and run queries using SQL.

### Write data to a database

In [None]:
# Create a new db (sqlite3.connect() creates the file if it does not exist yet)
conn = sqlite3.connect('example_sqlite.db')

# Close connection to db
conn.close()

In [None]:
# Open connection to db
conn = sqlite3.connect('example_sqlite.db')

# The try/finally makes sure the connection is released even if reading the
# Excel file or writing the table fails.
try:
    # Define variables and data types for the (empty) table
    conn.execute('''CREATE TABLE IF NOT EXISTS COMPANY
             (ID INT PRIMARY KEY     NOT NULL,
             NAME           TEXT    NOT NULL,
             AGE            INT     NOT NULL,
             CITY        CHAR(50),
             SALARY         REAL);''')

    # Read data from a file and write to data frame
    data = pd.read_excel("db_data.xlsx", sheet_name = "Sheet1")
    print(data)

    # Write data to the data base table named 'COMPANY'.
    # NOTE: if_exists='replace' drops the table created above and rebuilds it
    # from the data frame, so the column types and the primary key declared in
    # the CREATE TABLE statement are not kept. The data frame index is stored
    # as the column 'index', which the query cell of this notebook reads back.
    data.to_sql('COMPANY', conn, if_exists='replace')

    # Commit the changes to the table
    conn.commit()
finally:
    # Close connection to db
    conn.close()

### Query the database using SQL and write result to a pandas data frame 

In [None]:
# Connection to db
conn = sqlite3.connect("example_sqlite.db")

try:
    # Read data: run the SQL query and use the stored 'index' column as the
    # index of the resulting data frame
    df_sub = pd.read_sql("SELECT * FROM COMPANY WHERE AGE <= 26",
                         con=conn,
                         index_col=['index'])
    print(df_sub)
finally:
    # Close connection to db, also when the query fails
    conn.close()

## Read & write data from/to files

### Common data/file formats

Data formats in information technology may refer to:

- Data type, constraint placed upon the interpretation of data in a type system
- Signal (electrical engineering), a format for signal data used in signal processing
- Recording format, a format for encoding data for storage on a storage medium
- <b style="color:yellowgreen">File format, a format for encoding data for storage in a computer file</b>
- Container format (digital), a format for encoding data for storage by means of a standardized audio/video codecs file format
- Content format, a format for representing media content as data
- Audio format, a format for encoded sound data
- Video format, a format for encoded video data

Wikipedia: https://en.wikipedia.org/wiki/Data_format

This section provides common <b style="color:yellowgreen">file formats</b> a data scientist or a data engineer must be aware of. Later, we’ll see how to read these file formats in Python.

List with common file formats explained in this notebook:
- CSV
- TXT
- JSON
- XML
- HTML
- ZIP
- XLSX
- PDF
- Image files (e.g. JPEG)

### CSV (comma separated value)

- A comma-separated values (CSV) file is a delimited text file.
- Each line of the file is a data record.
- Each record consists of one or more fields, separated by a separator (default = comma).
- The use of the comma as a field separator is the source of the name for this file format.
- The separator can also be user-defined, e.g. you can use a semicolon instead of a comma.
- A CSV file typically stores tabular data (numbers and text).

In [None]:
# Read data from .csv-file using pandas (fields are separated by semicolons)
data = pd.read_csv("example.csv", sep=";")

# Print the header info of data (first five rows)
print(data.head(5))

# Write data to csv, again semicolon-separated
# (with pandas' default settings the row index is written as the first column)
data.to_csv("example_write.csv", sep=";")

### TXT (plain text)

- In Plain Text file format, everything is written in plain text
- Usually, this text is in unstructured form and there is no meta-data associated with it
- The TXT file format can easily be read by any program

In [None]:
# Open a connection to the text-file; the with-block closes the file again
# as soon as the content has been read
with open("example.txt", "r", encoding='utf-8') as text_file:
    # Read data from .txt file (the whole content as one string)
    lines = text_file.read()

# Show type
print(type(lines))

# Print the data
print(lines)

In [None]:
# Write data to .txt
lines = ['Dorothy lived in the midst of the great Kansas prairies',
         'with Uncle Henry, who was a farmer ...']

# writelines() does not add line separators, so each line is terminated
# explicitly; otherwise both strings end up glued together on one line.
with open('example_write.txt', 'w', encoding='utf-8') as f:
    f.writelines(f'{line}\n' for line in lines)

# Check whether file exists. The name is compared exactly: re.match() would
# treat '.' as a wildcard and only anchor at the start of the name.
files = [f for f in os.listdir('.') if f == 'example_write.txt']
print(files)

### JSON (JavaScript Object Notation)

- JSON is a syntax for storing and exchanging data
- JSON is text, written with JavaScript object notation

In [None]:
# Read data with the json module. The encoding is given explicitly so that
# the result does not depend on the platform's default encoding.
with open('example.json', 'r', encoding='utf-8') as f:
    data = json.load(f)
print(data)

# Read data to a data frame using the pandas library
data = pd.read_json("example.json")

# Print the data
print(data)

In [None]:
# Write data to .json
data.to_json('example_write.json')

# Check whether the file exists. The name is compared exactly: re.match()
# would treat '.' as a wildcard and only anchor at the start of the name.
files = [f for f in os.listdir('.') if f == 'example_write.json']
print(files)

### XML (extensible markup language)

- XML stands for eXtensible Markup Language
- XML is a markup language much like HTML
- XML was designed to store and transport data
- XML was designed to be self-descriptive
- XML is a W3C Recommendation

In [None]:
# First option: reading the xml file with BeautifulSoup
# (the with-block closes the file handle once the document is parsed)
with open('example.xml') as xml_file:
    bs = BeautifulSoup(xml_file, 'html.parser')
print(bs.prettify())

# Second option: using pandas and convert the xml file to a data frame
data = pd.read_xml("example.xml")
print("------------------------")
print(data[["name","price"]])

In [None]:
# Second option: reading xml using .read_xml() from pandas
data = pd.read_xml("example.xml")
print(data)

# Write data to .xml
data.to_xml('example_write.xml')

# Check whether file exists. The name is compared exactly: re.match() would
# treat '.' as a wildcard and only anchor at the start of the name.
files = [f for f in os.listdir('.') if f == 'example_write.xml']
print(files)

### HTML (hyper text markup language)

- HTML stands for Hyper Text Markup Language
- HTML is the standard markup language for creating Web pages
- HTML describes the structure of a Web page
- HTML consists of a series of elements
- HTML elements tell the browser how to display the content
- HTML elements label pieces of content such as "this is a heading", "this is a paragraph", "this is a link", etc.

In [None]:
# Read data from .html (the with-block closes the file after reading)
filename = 'example.html'
with open(filename, "r") as html_file:
    html = html_file.read()
print(html)

In [None]:
# Write data to .html (taking the html-file from above)
with open('example_write.html', 'w') as f:
    # html is a single string, so write() is the matching call;
    # writelines() would iterate over it character by character
    f.write(html)

# Check whether file exists. The name is compared exactly: re.match() would
# treat '.' as a wildcard and only anchor at the start of the name.
files = [f for f in os.listdir('.') if f == 'example_write.html']
print(files)

### ZIP (archive file format)

- ZIP is an archive file format that supports lossless data compression
- A ZIP file may contain one or more files or directories that may have been compressed
- The ZIP file format permits a number of compression algorithms, though DEFLATE is the most common
- The name "zip" (meaning "move at high speed") was suggested by R. Mahoney
- The name was meant to imply that the product would be faster than ARC and other compression formats of the time

In [None]:
# Pandas supports zip file reads
# NOTE(review): presumably the archive holds exactly one csv file - confirm
data = pd.read_csv("archive.zip", sep=";")
# Last expression of the cell: the notebook renders the first five rows
data.head(5)

In [None]:
# Create an empty Zip-archive; the with-block closes the archive
# even if adding one of the files fails
with ZipFile('example_write.zip', 'w') as zipObj:
    # Add selected files to the zip archive
    zipObj.write('example.csv')
    zipObj.write('example.html')
    zipObj.write('example.json')

# Check whether zip-file exists. The name is compared exactly: re.match()
# would treat '.' as a wildcard and only anchor at the start of the name.
files = [f for f in os.listdir('.') if f == 'example_write.zip']
print(files)

### XLSX (Microsoft Excel Open XML file format)

- It is an XML-based file format created by Microsoft Excel 
- The XLSX format was introduced with Microsoft Office 2007
- In XLSX, data is organized in cells, arranged in rows and columns, within a sheet
- Each XLSX file may contain one or more sheets
- A single workbook can contain multiple sheets

In [None]:
# Read data from an example .xlsx-file (only the worksheet named "sheet1")
data = pd.read_excel("example.xlsx", sheet_name = "sheet1")

# Print the data (first five rows, rendered as the cell's last expression)
data.head(5)

In [None]:
# Write data to xlsx (into a worksheet named "sheet1")
data.to_excel('example_write.xlsx', sheet_name = "sheet1")

# Check whether file exists. The name is compared exactly: re.match() would
# treat '.' as a wildcard and only anchor at the start of the name.
files = [f for f in os.listdir('.') if f == 'example_write.xlsx']
print(files)

### PDF (portable document format)

- PDF is a file format developed by Adobe in the 1990s to present documents, including text formatting and images, in a manner independent of application software, hardware, and operating systems
- Based on the PostScript language, each PDF file encapsulates a complete description of a fixed-layout flat document, including the text, fonts, vector graphics, raster images and other information needed to display it

In [None]:
# Reading metadata
reader = PdfReader("example.pdf")
meta = reader.metadata
print(len(reader.pages))

# All of the following could be None! The metadata object itself is None as
# well when the PDF carries no document information, so the fields are read
# with getattr() and a None fallback instead of plain attribute access.
for field in ("author", "creator", "producer", "subject"):
    print(getattr(meta, field, None))

# Extract text (first page only)
page = reader.pages[0]
print('\n')
print(page.extract_text())

# Number of pages
print('\n')
print(f'Number of pages in PDF: {len(reader.pages)}')


In [None]:
# Create a canvas in US-letter size that is saved as "example_write.pdf"
canvas = Canvas("example_write.pdf", pagesize = LETTER)

# Set font to Times New Roman with 36-point size
canvas.setFont("Times-Roman", 36)

# Draw blue text one inch from the left and ten inches from the bottom
canvas.setFillColor(blue)
canvas.drawString(1 * inch, 10 * inch, "This is a PDF file ...")

# Save the PDF file
canvas.save()

# Check whether file exists. The name is compared exactly: re.match() would
# treat '.' as a wildcard and only anchor at the start of the name.
files = [f for f in os.listdir('.') if f == 'example_write.pdf']
print(files)

### Image file formats

- Image files consist of pixels and the metadata associated with them
- Typical colour images are 3-dimensional: height × width × colour channels (RGB values)
- Image files can also be 2-dimensional (grayscale) or 4-dimensional (having intensity)
- Each image consists of one or more frames of pixels
- Each frame is made up of a two-dimensional array of pixel values
- Pixel values can be of any intensity 
- Meta-data associated with an image, can be an image type (.png) or pixel dimensions
- The different formats (JPEG, PNG, TIFF, GIF, ...) are used to organize and store digital images in a different way

In [None]:
# Read image (the file content is loaded into the variable `image`)
image = plt.imread('example.jpeg')

# Plot image on a new figure of 6 x 4 inches
plt.figure(figsize=(6,4))
plt.imshow(image)

### Spatial data

In [None]:
# Location of the example data sets in the folium repository
url = "https://raw.githubusercontent.com/python-visualization/folium/master/examples/data"

# State borders (GeoJSON) and unemployment figures (CSV) live below that URL
state_geo = f"{url}/us-states.json"
state_unemployment = f"{url}/US_Unemployment_Oct2012.csv"

# Load the unemployment figures into a data frame
state_data = pd.read_csv(state_unemployment)

# Base map the layers are added to
m = folium.Map(location=[48, -102], zoom_start=3)

# Choropleth layer: the "State" column is matched against "feature.id" of the
# GeoJSON and the "Unemployment" column determines the fill colour
choropleth = folium.Choropleth(
    geo_data=state_geo,
    name="choropleth",
    data=state_data,
    columns=["State", "Unemployment"],
    key_on="feature.id",
    fill_color="YlGn",
    fill_opacity=0.7,
    line_opacity=0.2,
    legend_name="Unemployment Rate (%)",
)
choropleth.add_to(m)

# Control element to switch layers on and off
layer_control = folium.LayerControl()
layer_control.add_to(m)

# Show map
m

# String formatting in Python

## %-formatting

In [None]:
# Example (1): a single value is placed directly after the % operator
name = "Eric"
print("Hello, %s." % name)

# Example (2): several values are passed as a tuple; %s also accepts the integer age
name = "Eric"
age = 74
print("Hello, %s. You are %s." % (name, age))

# Example (3): the same sentence built with %-formatting (s3) and with str.format() (s4)
s1 = "Peter"
s2 = "Mary"
s3 = "%s and %s are living together." % (s1, s2)
s4 = "{} and {} are living together.".format(s1, s2)
print(s3)
print(s4)

# Example (4): with many placeholders the values are matched purely by position
first_name = "Eric"
last_name = "Idle"
age = 74
profession = "comedian"
affiliation = "Monty Python"
print("Hello, %s %s. You are %s. You are a %s. You were a member of %s." %
      (first_name, last_name, age, profession, affiliation))

## str.format()

In [None]:
# Example (1): empty braces are filled in the order of the arguments
name = 'Peter'
age = 45
print("Hello, {}. You are {}.".format(name, age))

# Example (2): numbered braces refer to the argument positions (0 = age, 1 = name)
print("Hello, {1}. You are {0}.".format(age, name))

# Example (3): named braces are filled from keyword arguments
person = {'name': 'Peter', 'age': 45}
print("Hello, {name}. You are {age}.".format(
    name=person['name'], age=person['age']))

# Example (4): ** unpacks the dictionary into keyword arguments
person = {'name': 'Eric', 'age': 74}
print("Hello, {name}. You are {age}.".format(**person))

# Example (5): a long template split over two string literals that are
# concatenated before .format() is called
first_name = "Eric"
last_name = "Idle"
age = 74
profession = "comedian"
affiliation = "Monty Python"
print(("Hello, {first_name} {last_name}. You are {age}. " +
       "You are a {profession}. You were a member of {affiliation}.")
      .format(first_name=first_name, last_name=last_name, age=age,
              profession=profession, affiliation=affiliation))

#### In order to print the format use the syntax: print("{:.2f}".format(3.1415926))

<table class="wp-block-table code" align="left">
    <tbody>
        <tr>
            <th width="16%">Number</th>
            <th width="16%">Format</th>
            <th width="16%">Output</th>
            <th>Description</th>
        </tr>
        <tr>
            <td>3.1415926</td>
            <td>{:.2f}</td>
            <td>3.14</td>
            <td>Format float 2 decimal places</td>
        </tr>
        <tr>
            <td>3.1415926</td>
            <td>{:+.2f}</td>
            <td>+3.14</td>
            <td>Format float 2 decimal places with sign</td>
        </tr>
        <tr>
            <td>-1</td>
            <td>{:+.2f}</td>
            <td>-1.00</td>
            <td>Format float 2 decimal places with sign</td>
        </tr>
        <tr>
            <td>2.71828</td>
            <td>{:.0f}</td>
            <td>3</td>
            <td>Format float with no decimal places</td>
        </tr>
        <tr>
            <td>5</td>
            <td>{:0&gt;2d}</td>
            <td>05</td>
            <td>Pad number with zeros (left padding, width 2)</td>
        </tr>
        <tr>
            <td>5</td>
            <td>{:x&lt;4d}</td>
            <td>5xxx</td>
            <td>Pad number with x’s (right padding, width 4)</td>
        </tr>
        <tr>
            <td>10</td>
            <td>{:x&lt;4d}</td>
            <td>10xx</td>
            <td>Pad number with x’s (right padding, width 4)</td>
        </tr>
        <tr>
            <td>1000000</td>
            <td>{:,}</td>
            <td>1,000,000</td>
            <td>Number format with comma separator</td>
        </tr>
        <tr>
            <td>0.25</td>
            <td>{:.2%}</td>
            <td>25.00%</td>
            <td>Format percentage</td>
        </tr>
        <tr>
            <td>1000000000</td>
            <td>{:.2e}</td>
            <td>1.00e+09</td>
            <td>Exponent notation</td>
        </tr>
        <tr>
            <td>13</td>
            <td>{:10d}</td>
            <td>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;13</td>
            <td>Right aligned (default, width 10)</td>
        </tr>
        <tr>
            <td>13</td>
            <td>{:&lt;10d}</td>
            <td>13</td>
            <td>Left aligned (width 10)</td>
        </tr>
        <tr>
            <td>13</td>
            <td>{:^10d}</td>
            <td>&nbsp;&nbsp;&nbsp;&nbsp;13</td>
            <td>Center aligned (width 10)</td>
        </tr>
    </tbody>
</table>

In [None]:
# Examples of number formatting with str.format()
# Two decimal places -> 3.14
print("{:.2f}".format(3.1415926))

# Percentage with two decimal places -> 25.00%
print("{:.2%}".format(0.25))

# Padded with zeros on the left to a width of 8 -> 00000042
print("{:0>8d}".format(42))

# Comma as thousands separator -> 1,000,000,000
print("{:,}".format(1000000000))

# Exponent notation with two decimal places -> 1.00e+09
print("{:.2e}".format(1000000000))

## f-strings

In [None]:
# Example (1): variables are referenced directly inside the braces
name = "Eric"
age = 74
print(f"Hello, {name}. You are {age}.")

# Example (2): an upper-case F prefix works as well
print(F"Hello, {name}. You are {age}.")

# Example (3): the braces may contain an arbitrary expression
print(f"The result is: {15 * 15 / 10}")

# Example (4): method calls are allowed inside the braces, too
print(f"{name.lower()} is funny.")

# Example (5): adjacent f-strings in parentheses are joined into one message
name = "Eric"
profession = "comedian"
affiliation = "Monty Python"
message = (f"Hi {name}. "
           f"You are a {profession}. "
           f"You were in {affiliation}.")
print(message)

#### Floating point values are formatted with the `f` presentation type. We can also specify the precision, i.e. the number of decimal places: it is written right after the dot character (e.g. `{pi:.2f}`).

In [None]:
# Format floats
# (the literal has more digits than a Python float can hold; the surplus digits are dropped)
pi = 3.1415926535897932384626433832795028841971693993751058209749445923078164062

# No specified format
print(f'{pi}')

# Zero decimal places
print(f'{pi:.0f}')

# Eight decimal places
print(f'{pi:.8f}')

# Two decimal places (applied to the result of the expression pi*10000)
print(f'{pi*10000:.2f}')

#### The width specifier sets the width of the value. The value may be filled with spaces or other characters if the value is shorter than the specified width.

In [None]:
# Format width: x, x squared and x to the power of 5 in columns of width 2, 3 and 6
for x in range(1, 11):
    print(f'{x:2} | {x**2:3} | {x**5:6}')

# Format width and decimals: the same columns as floats with two decimal places
print('\n')
for x in range(1, 11):
    print(f'{x:5.2f} | {x**2:6.2f} | {x**5:9.2f}')

#### By default, the strings are justified to the left. We can use the > character to justify the strings to the right. The > character follows the colon character.

In [None]:
# Justify string: four strings of growing length
s1 = '123'
s2 = '1234'
s3 = '12345'
s4 = '123456'

# Right-align each string in a field of width 10
for text in (s1, s2, s3, s4):
    print(f'{text:>10}')

#### The example displays a formatted current datetime. The datetime format specifiers follow the : character.

In [None]:
# Format_datetime
# NOTE: this import binds the name `datetime` to the module, hence the
# `datetime.datetime` spelling below
import datetime

now = datetime.datetime.now()
# The directives after the colon format the datetime as year-month-day hour:minute:second
print(f'{now:%Y-%m-%d %H:%M:%S}')

#### Numbers can be printed in various numeric notations, such as decimal, hexadecimal, octal or scientific.

In [None]:
# Numeric notations
a = 300

# hexadecimal (300 -> 12c)
print(f"{a:x}")

# octal (300 -> 454)
print(f"{a:o}")

# scientific (300 -> 3.000000e+02)
print(f"{a:e}")

# Basic date types in Python

In [None]:
# Use the dir() function to get a list containing all objects of a module.
# The module is imported in this cell so that the name `datetime` refers to
# the module itself, independent of what other cells have bound to that name.
import datetime

print(dir(datetime))

### Date object to represent a date

In [None]:
from datetime import date

# Build a calendar date from year, month and day
a = date(year=2022, month=3, day=7)
print(a)

# Ask the system clock for today's date
today = date.today()
print("Current date =", today)

### Time object to represent a time

In [None]:
from datetime import time

# time(hour = 0, minute = 0, second = 0): called without arguments
a = time()
print("a =", a)

# time(hour, minute and second) passed as keyword arguments
c = time(hour = 11, minute = 34, second = 56)
print("c =", c)

# time(hour, minute and second) passed as positional arguments
b = time(11, 34, 56)
print("b =", b)

### Datetime object to represent a datetime

In [None]:
from datetime import datetime

# Only year, month and day given: the time part is midnight
a = datetime(year=2022, month=12, day=31)
print(a)

# Full timestamp down to the second
b = datetime(year=2022, month=12, day=31, hour=23, minute=59, second=59)
print(b)

### Timedelta object

In [None]:
# Both classes used below are imported here so that the cell does not depend
# on imports made in other cells
from datetime import date, datetime

# Using date objects to calculate the time delta
# (subtracting two dates yields a timedelta counted in whole days)
t1 = date(year = 1984, month = 12, day = 31)
t2 = date(year = 2022, month = 12, day = 31)
t3 = t2 - t1
print(t3)
print("type of t3 =", type(t3))

# Using datetime objects to calculate the time delta
# (here the hours are taken into account as well)
t4 = datetime(year = 1984, month = 12, day = 31, hour = 12)
t5 = datetime(year = 2022, month = 12, day = 31, hour = 4)
t6 = t5 - t4
print(t6)
print("type of t6 =", type(t6))

### Format datetime using strftime()

In [None]:
from datetime import datetime

# Current date and time
now = datetime.now()

# H:M:S format
t = now.strftime("%H:%M:%S")
print("time:", t)

# mm/dd/YYYY, H:M:S format (%Y is the four-digit year)
s1 = now.strftime("%m/%d/%Y, %H:%M:%S")
print("s1:", s1)

# dd/mm/YYYY, H:M:S format (day and month swapped compared to s1)
s2 = now.strftime("%d/%m/%Y, %H:%M:%S")
print("s2:", s2)

### Handling time zone

In [None]:
from datetime import datetime

# Local time (datetime.now() is called without a time zone argument)
local = datetime.now()
print("Local:", local.strftime("%m/%d/%Y, %H:%M:%S"))

# NY time (the pytz time zone object is passed to datetime.now())
tz_NY = pytz.timezone('America/New_York')
datetime_NY = datetime.now(tz_NY)
print("New York:", datetime_NY.strftime("%m/%d/%Y, %H:%M:%S"))

### Jupyter notebook --footer info-- (please always provide this at the end of each notebook)

In [None]:
# Footer: prints operating system, current date/time and Python version
# NOTE(review): `socket` is imported but not used in this cell
import os
import platform
import socket
from platform import python_version
from datetime import datetime

print('-----------------------------------')
# Name of the operating-system dependent module in upper case
print(os.name.upper())
# Operating system name and release
print(platform.system(), '|', platform.release())
# Timestamp of the moment the cell is executed
print('Datetime:', datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
print('Python Version:', python_version())
print('-----------------------------------')