# 03 - Creating and Persisting DataFrames

In [25]:
import pandas as pd
import numpy as np
import os

# Folder holding the notebook's data files: a "Data" directory located in the
# parent of the current working directory.
# os.path.join supplies the platform's separator and avoids the invalid "\D"
# escape sequence that the hand-written '\Data' literal contained.
dataPath = os.path.join(os.path.dirname(os.getcwd()), 'Data')

## Creating DataFrames from scratch

In [2]:
# Three parallel lists: position i in each list describes the same person.
fname = [
    "Paul",
    "John",
    "Richard",
    "George",
]
lname = [
    "McCartney",
    "Lennon",
    "Starkey",
    "Harrison",
]
birth = [
    1942,
    1940,
    1940,
    1943,
]

In [3]:
# Map each future column label to the list that holds that column's values.
people = dict(
    zip(
        ("first", "last", "birth"),
        (fname, lname, birth),
    )
)
people

{'first': ['Paul', 'John', 'Richard', 'George'],
 'last': ['McCartney', 'Lennon', 'Starkey', 'Harrison'],
 'birth': [1942, 1940, 1940, 1943]}

In [4]:
# Dictionary keys become column labels; each list becomes that column's values.
beatles = pd.DataFrame(data=people)
beatles

Unnamed: 0,first,last,birth
0,Paul,McCartney,1942
1,John,Lennon,1940
2,Richard,Starkey,1940
3,George,Harrison,1943


In [6]:
# Inspect the row labels; no index was supplied when the frame was built,
# so this shows the one pandas generated.
beatles.index

RangeIndex(start=0, stop=4, step=1)

In [7]:
# Same data as before, but with explicit row labels instead of the generated index.
beatles = pd.DataFrame(
    data=people,
    index=["a", "b", "c", "d"],
)
beatles

Unnamed: 0,first,last,birth
a,Paul,McCartney,1942
b,John,Lennon,1940
c,Richard,Starkey,1940
d,George,Harrison,1943


You can also create a DataFrame from a list of dictionaries:

In [5]:
# Each dictionary is one row; its keys supply the column labels.
pd.DataFrame(
    [
        {"first": "Paul", "last": "McCartney", "birth": 1942},
        {"first": "John", "last": "Lennon", "birth": 1940},
        {"first": "Richard", "last": "Starkey", "birth": 1940},
        {"first": "George", "last": "Harrison", "birth": 1943},
    ]
)

Unnamed: 0,first,last,birth
0,Paul,McCartney,1942
1,John,Lennon,1940
2,Richard,Starkey,1940
3,George,Harrison,1943


## Writing CSV

In [9]:
# Rebuild the frame from the dictionary without passing an index,
# so the rows are labelled by position again.
beatles = pd.DataFrame(data=people)
beatles

Unnamed: 0,first,last,birth
0,Paul,McCartney,1942
1,John,Lennon,1940
2,Richard,Starkey,1940
3,George,Harrison,1943


In [17]:
from io import StringIO

# Write the CSV into an in-memory text buffer instead of a file on disk,
# then print the buffer's contents to see exactly what was written.
fout = StringIO()
beatles.to_csv(path_or_buf=fout)
print(fout.getvalue())

,first,last,birth
0,Paul,McCartney,1942
1,John,Lennon,1940
2,Richard,Starkey,1940
3,George,Harrison,1943



## Reading large CSV files

The pandas library is an in-memory tool: your data must fit in memory before pandas can work with it. If you come across a large CSV file that you want to process, you have a few options. If you can process portions of it at a time, you can read it in chunks and process each chunk. Alternatively, if you know that you should have enough memory to load the file, there are a few hints to help pare down the amount of memory it needs.

Note that, in general, you should have three to ten times as much memory as the size of the DataFrame that you want to manipulate. The extra memory should give you enough space to perform many of the common operations.

How to do it...
In this section, we will look at the diamonds dataset. This dataset easily fits into the memory of my 2015 MacBook, but let's pretend that the file is a lot bigger than it is, or that the memory of my machine is limited such that when pandas tries to load it with the read_csv function, I get a memory error.

Determine how much memory the whole file will take up. We will use the nrows parameter of read_csv to limit how much data we load to a small sample:

In [47]:
# Load only the first 1,000 rows as a sample for estimating memory use.
# os.path.join builds the file path with the platform's separator and avoids
# the invalid "\d" escape sequence of the hand-written "\diamonds.csv" literal.
diamonds = pd.read_csv(os.path.join(dataPath, "diamonds.csv"), nrows=1000)
diamonds

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,0.29,Premium,I,VS2,62.4,58.0,334,4.20,4.23,2.63
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75
...,...,...,...,...,...,...,...,...,...,...
995,0.54,Ideal,D,VVS2,61.4,52.0,2897,5.30,5.34,3.26
996,0.72,Ideal,E,SI1,62.5,55.0,2897,5.69,5.74,3.57
997,0.72,Good,F,VS1,59.4,61.0,2897,5.82,5.89,3.48
998,0.74,Premium,D,VS2,61.8,58.0,2897,5.81,5.77,3.58


In [48]:
# List the dtype of every column; no dtypes were passed to read_csv for this frame.
diamonds.dtypes

carat      float64
cut         object
color       object
clarity     object
depth      float64
table      float64
price        int64
x          float64
y          float64
z          float64
dtype: object

The `info` output below shows that 1,000 rows use about 232 KB of memory (measured with `memory_usage='deep'`, which also counts the strings held in the object columns). If we had 1 billion rows, that would take about 232 GB of memory. It turns out that it is possible to rent machines in the cloud that have that much memory, but let's see if we can take it down a little.

In [49]:
# Report per-column dtypes and the total memory footprint of the sample;
# 'deep' asks pandas to measure the contents of object columns as well.
diamonds.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 10 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   carat    1000 non-null   float64
 1   cut      1000 non-null   object 
 2   color    1000 non-null   object 
 3   clarity  1000 non-null   object 
 4   depth    1000 non-null   float64
 5   table    1000 non-null   float64
 6   price    1000 non-null   int64  
 7   x        1000 non-null   float64
 8   y        1000 non-null   float64
 9   z        1000 non-null   float64
dtypes: float64(6), int64(1), object(3)
memory usage: 232.0 KB


In [45]:
# Re-read the same 1,000-row sample, this time telling read_csv which dtype to
# use for every column so that smaller types are used from the start.
# os.path.join builds the file path with the platform's separator and avoids
# the invalid "\d" escape sequence of the hand-written "\diamonds.csv" literal.
diamonds2 = pd.read_csv(
    os.path.join(dataPath, "diamonds.csv"),
    nrows=1000,
    dtype={
        # Measurements: 32-bit floats instead of the 64-bit default
        "carat": np.float32,
        "depth": np.float32,
        "table": np.float32,
        "x": np.float32,
        "y": np.float32,
        "z": np.float32,
        # NOTE(review): int16 tops out at 32,767 - confirm that every price in
        # the full file fits before loading more than this sample.
        "price": np.int16,
        # Text columns stored as categoricals rather than Python strings
        "cut": 'category',
        "color": 'category',
        "clarity": 'category',
    },
)

diamonds2.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.799999,61.0,326,3.89,3.84,2.31
2,0.23,Good,E,VS1,56.900002,65.0,327,4.05,4.07,2.31
3,0.29,Premium,I,VS2,62.400002,58.0,334,4.2,4.23,2.63
4,0.31,Good,J,SI2,63.299999,58.0,335,4.34,4.35,2.75


In [51]:
# Same report for the frame loaded with explicit dtypes, for comparison with the original sample.
diamonds2.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 10 columns):
 #   Column   Non-Null Count  Dtype   
---  ------   --------------  -----   
 0   carat    1000 non-null   float32 
 1   cut      1000 non-null   category
 2   color    1000 non-null   category
 3   clarity  1000 non-null   category
 4   depth    1000 non-null   float32 
 5   table    1000 non-null   float32 
 6   price    1000 non-null   int16   
 7   x        1000 non-null   float32 
 8   y        1000 non-null   float32 
 9   z        1000 non-null   float32 
dtypes: category(3), float32(6), int16(1)
memory usage: 30.4 KB


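Make sure that summary statistics are similar with our new dataset to the original (for example, by comparing `diamonds.describe()` with `diamonds2.describe()`), and measure how much memory the new dtypes save: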

In [61]:
# Total bytes used by each frame; deep=True includes the contents of object columns.
originalDF_size = diamonds.memory_usage(deep=True).sum()
updatedDF_size = diamonds2.memory_usage(deep=True).sum()

# Saving expressed as a percentage of the original footprint.
pctDecrease = ((originalDF_size - updatedDF_size) / originalDF_size) * 100

print(
    f'Original Size:{originalDF_size}',
    f'New Size:{updatedDF_size}',
    f"DataFrame was reduced by {pctDecrease.round()} percent",
    sep="\n",
)

Original Size:237538
New Size:31100
DataFrame was reduced by 87.0 percent


If we use int8 for the price, we will lose information. You can use the NumPy iinfo function to list limits for NumPy integer types, and the finfo function to do the same for floating-point types:

In [64]:
# Show the smallest and largest values an int8 can represent.
np.iinfo(np.int8)

iinfo(min=-128, max=127, dtype=int8)

In [66]:
# Show the resolution and the value range of the float16 type.
np.finfo(np.float16)

finfo(resolution=0.001, min=-6.55040e+04, max=6.55040e+04, dtype=float16)

Once you have your data in a format you like, you can save it in a binary format that tracks types, such as the Feather format (pandas leverages the pyarrow library to do this). This format is meant to enable in-memory transfer of structured data between languages and is optimized so that data can be used as is without internal conversion. Reading from this format is much quicker and easier once you have the types defined (see `DataFrame.to_feather` for writing and `pd.read_feather` for reading).

## Using Excel files