In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline


In [2]:
# hide

# Demo frame with three rows and one column per dtype of interest.
# Scalar entries are broadcast to the length implied by the
# Series / array / list entries.
dft = pd.DataFrame(
    {
        # Text stored three ways: plain Python str, "string" dtype, categorical.
        "str1": "foo",
        "str2": pd.Series(["bar"] * 3).astype("string"),
        "str3": pd.Series(["qux"] * 3, dtype="category"),
        # Numeric columns at default and reduced widths.
        "float64": np.random.rand(3),
        "int64": 1,
        "float32": pd.Series(np.random.rand(3), dtype="float32"),
        "int8": pd.Series([1, 1, 1]).astype("int8"),
        "bool": False,
        # Datetimes from a pandas scalar, a date range and a NumPy scalar.
        "date1": pd.Timestamp("20010102"),
        "date2": pd.date_range(start="1/1/2015", periods=3),
        "date3": np.datetime64('2011-06-24'),
        # Durations of 0, 1 and 2 days.
        "delta": [pd.Timedelta(days=n_days) for n_days in range(3)],
    }
)
dft

Unnamed: 0,str1,str2,str3,float64,int64,float32,int8,bool,date1,date2,date3,delta
0,foo,bar,qux,0.208548,1,0.143929,1,False,2001-01-02,2015-01-01,2011-06-24,0 days
1,foo,bar,qux,0.271333,1,0.710527,1,False,2001-01-02,2015-01-02,2011-06-24,1 days
2,foo,bar,qux,0.644316,1,0.352139,1,False,2001-01-02,2015-01-03,2011-06-24,2 days


In [3]:
# hide
# Round-trip the frame through CSV without the index column, then read it
# back with pandas' default type inference to see which dtypes survive.
dft.to_csv(path_or_buf='datatypes_file.csv', index=False)
dfc = pd.read_csv(filepath_or_buffer='datatypes_file.csv')
dfc.dtypes

str1        object
str2        object
str3        object
float64    float64
int64        int64
float32    float64
int8         int64
bool          bool
date1       object
date2       object
date3       object
delta       object
dtype: object

In [4]:
# hide
# Re-read the CSV, this time declaring the dtypes up front instead of relying
# on pandas' default inference.
dft = pd.read_csv(
    'datatypes_file.csv',
    dtype={
        "str2": "string",
        "str3": "category",
        "float32": np.float32,
        # Builtin bool instead of np.bool: the np.bool alias was deprecated in
        # NumPy 1.20 and removed in 1.24, where it raises AttributeError.
        "bool": bool,
    },
    parse_dates=['date1', 'date2'],
    converters={'delta': pd.to_timedelta},
)

# Explicitly convert dtype. astype returns a new Series, so the result has to
# be assigned back for the conversion to take effect.
dft['str2'] = dft['str2'].astype('string')
dft['int8'] = pd.to_numeric(dft['int8'], errors="coerce", downcast="integer") # Set invalid to nan and smallest dtype
dft['date3'] = pd.to_datetime(dft['date3'], format='%Y-%m-%d') # might be faster if date format is not standard
dft['delta'] = pd.to_timedelta(dft['delta']) # Another way

dft

Unnamed: 0,str1,str2,str3,float64,int64,float32,int8,bool,date1,date2,date3,delta
0,foo,bar,qux,0.208548,1,0.143929,1,False,2001-01-02,2015-01-01,2011-06-24,0 days
1,foo,bar,qux,0.271333,1,0.710527,1,False,2001-01-02,2015-01-02,2011-06-24,1 days
2,foo,bar,qux,0.644316,1,0.352139,1,False,2001-01-02,2015-01-03,2011-06-24,2 days


In [5]:
# hide
# Check the resulting dtype of every column after the typed read and the
# explicit conversions above.
dft.dtypes

str1                object
str2                string
str3              category
float64            float64
int64                int64
float32            float32
int8                  int8
bool                  bool
date1       datetime64[ns]
date2       datetime64[ns]
date3       datetime64[ns]
delta      timedelta64[ns]
dtype: object

In [6]:
# Bytes used per column by `dfc`, the frame loaded with default dtype inference.
# deep=True makes pandas inspect object-dtype columns and count the memory of
# the values they hold.
dfc.memory_usage(deep=True)

Index      128
str1       180
str2       180
str3       180
float64     24
int64       24
float32     24
int8        24
bool         3
date1      201
date2      201
date3      201
delta      189
dtype: int64

In [7]:
# Same measurement for `dft`, the frame loaded with explicit dtypes, for
# comparison against the `dfc` figures.
dft.memory_usage(deep=True)

Index      128
str1       180
str2       180
str3        63
float64     24
int64       24
float32     12
int8         3
bool         3
date1       24
date2       24
date3       24
delta       24
dtype: int64