# Data Loading, Storage, and File Formats


# Data Loading


In [None]:
## Reading text files:

pandas features a number of functions for reading tabular data into a
DataFrame object.

In [4]:
import pandas as pd

# Read the comma-separated file exp1.csv into a DataFrame and display it.
data = pd.read_csv('data/exp1.csv')
data

In [5]:
# read_table can read the same text file, but its default delimiter is a tab,
# so a comma-separated file is loaded as a single column unless `sep` is given.

data = pd.read_table('data/exp1.csv')
data 

Unnamed: 0,"c1,c2,c3,c4,c5"
0,"a,b,c,d,message"
1,"1,2,3,4,hello"
2,"5,6,7,8,world"
3,9101112


In [21]:
# The delimiter can be specified explicitly with `sep`, in both read_csv and read_table.

data = pd.read_csv('data/exp1.csv', sep = ',')
data 

Unnamed: 0,c1,c2,c3,c4,c5
0,a,b,c,d,message
1,1,2,3,4,hello
2,5,6,7,8,world
3,9,10,11,12,foo


In [14]:
# Header: a file will not always have a header row.

# Default behaviour: the first line of the file is used as the column names.
print(pd.read_csv('data/mytest.csv'))

# header=None: every line is treated as data and the columns are numbered instead.
data = pd.read_csv('data/mytest.csv', header=None)
print(data)

   1  2  3
0  4  5  6
1  7  8  9
   0  1  2
0  1  2  3
1  4  5  6
2  7  8  9


In [12]:
# Supplying our own column names for a file that has no header row

header_names = ['header1', 'header2', 'header3']
pd.read_csv('data/mytest.csv', names=header_names)

Unnamed: 0,header1,header2,header3
0,1,2,3
1,4,5,6
2,7,8,9


In [15]:
## Index column: we can choose which column becomes the row index.
## By default the rows are numbered 0, 1, 2, ...; here the 'header3' column is used as the index instead.

pd.read_csv('data/mytest.csv', names=['header1', 'header2', 'header3'] , index_col = 'header3' )

Unnamed: 0_level_0,header1,header2
header3,Unnamed: 1_level_1,Unnamed: 2_level_1
3,1,2
6,4,5
9,7,8


In [45]:

## index_col can also be a list of columns: key1 and key2 together form the
## row index, much like a composite primary key.
test_table = pd.read_csv('data/mindex.csv' , index_col = ['key1','key2'])
test_table

Unnamed: 0_level_0,Unnamed: 1_level_0,value1,value2
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
one,a,1,2
one,b,3,4
one,c,5,6
one,d,7,8
two,a,9,10
two,b,11,12
two,c,13,14
two,d,15,16


In [59]:
## Skipping rows
# The 'skiprows' parameter takes 0-based line numbers to skip; here the first,
# third, and fourth lines of the file are skipped:
data = pd.read_csv('data/exp1.csv', skiprows=[0, 2, 3])
data

Unnamed: 0,a,b,c,d,message
0,9,10,11,12,


# Null Checking


In [33]:
## Null checking in the data
data = pd.read_csv('data/test.csv', names=['col1', 'col2', 'col3'])
print(data)

## isnull(): True wherever a value is missing
print("-----Isnull()----------")
print(data.isnull())

## Exercise: how would you check for non-missing values with notnull()?
print("------notnull()---------")
## Print() ??

      col1  col2 col3
0      1.0     2    a
1      4.0     5    b
2      5.0     6    b
3      6.0     7    b
4      7.0     8    b
5      8.0     9    b
6      9.0    10    b
7     10.0    11    b
8     11.0    12    b
9     12.0    13    b
10    13.0    14    b
11    14.0    15    b
12    15.0    16    b
13    16.0    17    b
14    17.0    18    b
15    18.0    19    b
16    19.0    20    b
17    20.0    21    b
18    21.0    22    b
19    22.0    23    b
20    23.0    24    b
21    24.0    25    b
22    25.0    26    b
23    26.0    27    b
24    27.0    28    b
25    28.0    29    b
26    29.0    30    b
27    30.0    31    b
28    31.0    32    b
29    32.0    33    b
...    ...   ...  ...
1267   NaN  1271    t
1268   NaN  1272    t
1269   NaN  1273  NaN
1270   NaN  1274  NaN
1271   NaN  1275  NaN
1272   NaN  1276  NaN
1273   NaN  1277  NaN
1274   NaN  1278  NaN
1275   NaN  1279  NaN
1276   NaN  1280  NaN
1277   NaN  1281  NaN
1278   NaN  1282  NaN
1279   NaN  1283  NaN
1280   NaN

In [117]:
# We can use any() to check whether the DataFrame contains at least one missing value.
data.isnull().values.any()

AttributeError: 'dict' object has no attribute 'isnull'

In [37]:
# Count how many null/NaN values are present in each column.
data.isnull().sum()

col1    237
col2      0
col3     28
dtype: int64

In [38]:
# Exercise: what if we want to count the missing values in one particular column?
# ????

237

In [39]:
# What if more than one representation should count as a missing value?
# The 'na_values' option takes either a list or a set of strings to treat as missing values:
data = pd.read_csv('data/exp1.csv', na_values=['NULL'])
data

Unnamed: 0,c1,c2,c3,c4,c5
0,a,b,c,d,message
1,1,2,3,4,hello
2,5,6,7,8,world
3,9,10,11,12,


In [44]:
# Different NA sentinels can be specified per column by passing a dict that maps
# each column name to the list of strings to treat as missing in that column:

sentinels = {'c5': ['NULL'], 'c4': ['d']}

data = pd.read_csv('data/exp1.csv', na_values = sentinels)
data


Unnamed: 0,c1,c2,c3,c4,c5
0,a,b,c,,message
1,1,2,3,4.0,hello
2,5,6,7,8.0,world
3,9,10,11,12.0,


# Reading File Into Pieces

In [113]:
## Exercise: read only the first two lines of the file (hint: use the `nrows` parameter)
## ???

In [115]:
# Read the file in chunks of 10 rows, then stitch the chunks back together
# into a single DataFrame with a freshly numbered index.
chunker = pd.read_csv('data/test.csv', chunksize=10)
chunks = list(chunker)
df2 = pd.concat(chunks, ignore_index=True)
df2


Unnamed: 0,1,2,a
0,4.0,5,b
1,5.0,6,b
2,6.0,7,b
3,7.0,8,b
4,8.0,9,b
5,9.0,10,b
6,10.0,11,b
7,11.0,12,b
8,12.0,13,b
9,13.0,14,b


# Writing Data Out to Text Format

In [49]:
# DataFrame.to_csv writes the frame out as a comma-separated file.
# Here `data` is written to out.csv and the file is read back to inspect the result.

data.to_csv('data/out.csv')
round_trip = pd.read_csv('data/out.csv')
print(round_trip)


None
   Unnamed: 0 c1  c2  c3  c4       c5
0           0  a   b   c   d  message
1           1  1   2   3   4    hello
2           2  5   6   7   8    world
3           3  9  10  11  12      NaN


In [52]:
import sys
# Passing sys.stdout as the target prints the CSV text instead of writing a file;
# `sep` sets the field delimiter (here '|'):
data.to_csv(sys.stdout, sep='|')

|c1|c2|c3|c4|c5
0|a|b|c|d|message
1|1|2|3|4|hello
2|5|6|7|8|world
3|9|10|11|12|


In [53]:
# To write some text in place of missing values, pass it as `na_rep`:
data.to_csv(sys.stdout, na_rep='NULL')

,c1,c2,c3,c4,c5
0,a,b,c,d,message
1,1,2,3,4,hello
2,5,6,7,8,world
3,9,10,11,12,NULL it is


In [62]:
# By default the column names and the index column are written; both the row and column labels can be disabled:

print(data)
print("-----------------")
data.to_csv(sys.stdout, index=False, header=False)
# To write a placeholder such as 'NULL' instead of a blank value:
# data.to_csv(sys.stdout, index=False, header=False ,  na_rep='NULL')

   a   b   c   d  message
0  9  10  11  12      NaN
-----------------
9,10,11,12,


In [64]:
# To write a subset of the columns, in an order of your choosing, pass `columns`.
# The frame is loaded here (skipping the first, third and fourth lines of the
# file so that the line a,b,c,d,message becomes the header) instead of relying
# on whatever `data` currently holds: the columns 'a', 'b' and 'c' only exist
# in this variant of the file, so reusing `data` fails when the notebook is run
# from top to bottom.
subset_source = pd.read_csv('data/exp1.csv', skiprows=[0, 2, 3])
subset_source.to_csv(sys.stdout, index=False, columns=['b', 'c', 'a'])


b,c,a
10,11,9


# Manually Working with Delimited Formats

In [91]:
import csv

# The csv module expects its file to be opened with newline='' so that
# newlines embedded inside quoted fields are handled correctly.
# The handle is kept open because `reader` reads from it lazily when iterated.
f = open('data/exp1.csv', newline='')
reader = csv.reader(f)
reader

<_csv.reader at 0x209dde37320>

In [93]:
# Iterate through each line; every row comes back as a list of strings.

for line in reader:
    print(line)

# The reader is exhausted now, so release the underlying file handle.
f.close()

['c1', 'c2', 'c3', 'c4', 'c5']
['a', 'b', 'c', 'd', 'message']
['1', '2', '3', '4', 'hello']
['5', '6', '7', '8', 'world']
['9', '10', '11', '12', '']


In [96]:
# Build a dict of columns: each key is a column header and each value is a
# tuple holding that column's values.

# The with-block closes the file as soon as all rows have been read.
with open('data/exp1.csv', newline='') as csv_file:
    lines = list(csv.reader(csv_file))

# The first row is the header; zip(*values) transposes the remaining rows into columns.
header, values = lines[0], lines[1:]
data_dict = dict(zip(header, zip(*values)))
data_dict

{'c1': ('a', '1', '5', '9'),
 'c2': ('b', '2', '6', '10'),
 'c3': ('c', '3', '7', '11'),
 'c4': ('d', '4', '8', '12'),
 'c5': ('message', 'hello', 'world', '')}

# JSON Data

Working with JSON datasets can be a pain, particularly when the files are large.

In cases like this, a combination of command-line tools and Python can be an efficient way to explore and analyze the data.

In [89]:
import json

# A JSON document held in a Python string: an object with a string field,
# a list of strings, and a list of nested objects.
obj = '''{"name" : "Tom", 
         "place" : ["US","UK"], 
         "friends":[{"name":"Harry", "age":20},
                    {"name":"Peter", "age":25}] 
        }'''

# json.loads() converts a JSON string into the corresponding Python objects
# (here a dict containing lists and nested dicts).
result = json.loads(obj)
print(result)

{'name': 'Tom', 'place': ['US', 'UK'], 'friends': [{'name': 'Harry', 'age': 20}, {'name': 'Peter', 'age': 25}]}


In [92]:
# To select a subset of the JSON object: build a DataFrame from the list of
# friend records, with 'name' and 'age' as the columns.
friends = pd.DataFrame(result['friends'], columns=['name', 'age'])
friends


Unnamed: 0,name,age
0,Harry,20
1,Peter,25


In [105]:
# To read a JSON file saved on the local machine or directly from the internet
ct = pd.read_json('data/example_2.json' , orient = 'columns')
ct

Unnamed: 0,quiz
maths,"{'q1': {'question': '5 + 7 = ?', 'options': ['..."
sport,{'q1': {'question': 'Which one is correct team...


# XML and HTML: Web Scraping

Web scraping refers to extracting data elements from webpages.

In [65]:
from lxml.html import parse
from urllib.request import urlopen

# Download the page and parse it into an element tree. The with-block closes
# the HTTP response once lxml has finished reading from it.
with urlopen('http://finance.yahoo.com/q/op?s=AAPL+Options') as response:
    parsed = parse(response)

doc = parsed.getroot()
# Collect every <a> (anchor) element in the document.
links = doc.findall('.//a')
# links

In [66]:
# `links` holds element objects; extract the target and the visible text of one of them.
link = links[11]

# The href attribute holds the link target.
link_target = link.get('href')
print(link_target)

# text_content() gives the text shown for the link, i.e. the tab it is associated with.
link_label = link.text_content()
print(link_label)

/quote/AAPL/holders?p=AAPL
Holders


In [128]:
# A list of all URLs in the document can be built with a list comprehension:

anchors = doc.findall('.//a')
urls = [anchor.get('href') for anchor in anchors]

# Question : To see first 10 items: 
# urls[:10]

# Binary Data Formats

In [118]:
# To read a file stored inside a zip archive:
import zipfile

# The with-block closes the archive once the member has been read.
with zipfile.ZipFile('data/T.zip') as archive:
    # ZipFile.read returns the member's raw bytes, not a parsed DataFrame.
    df = archive.read('mindex-c.csv')

In [142]:
# To read an Excel file, first open the workbook
xls_file = pd.ExcelFile('data/excel_example.xlsx')

# The data stored in a sheet can then be read into a DataFrame using parse()
sheet_name = 'Sheet1'
table = xls_file.parse(sheet_name)
table

Unnamed: 0,1,10,100
0,2,11,101
1,3,12,102
2,4,13,103
3,5,14,104
4,6,15,105
5,7,16,106
6,8,17,107
7,9,18,108
8,10,19,109
9,11,20,110


# Interacting with Databases

In [145]:
import sqlite3

# Open an in-memory SQLite database.
con = sqlite3.connect(':memory:')

## Statement that creates the table
query = """
        CREATE TABLE test
        (a VARCHAR(20), 
         b VARCHAR(20),
         c REAL, 
         d INTEGER
        );"""

# Execute the statement and commit.
con.execute(query)
con.commit()

In [146]:
# Now let's insert a few rows into the table

data = [('Atlanta', 'Georgia', 1.25, 6),
        ('Tallahassee', 'Florida', 2.6, 3),
        ('Sacramento', 'California', 1.7, 5)]

stmt = "INSERT INTO test VALUES(?, ?, ?, ?)"
# con.execute runs the statement once; con.executemany runs it once per entry in `data`.
# Using the connection as a context manager commits when the block succeeds and
# rolls back if an insert fails, so a failed batch is never left half-applied.
with con:
    con.executemany(stmt, data)

In [159]:
# To view the data stored in the table, run a SELECT through pandas and
# display the returned rows as a DataFrame.
import pandas.io.sql as sql

sql.read_sql_query('select * from test',con)

Unnamed: 0,a,b,c,d
0,Atlanta,Georgia,1.25,6
1,Tallahassee,Florida,2.6,3
2,Sacramento,California,1.7,5
