In [1]:
import numpy as np
import pandas as pd

In [2]:
# file we'll use for this demostration:
# https://datacatalog.worldbank.org/dataset/education-statistics
# EdStats.csv
# 318 MB
# 886931 rows x 70 columns

In [3]:
%%time

# vanilla, with no tricks

df = pd.read_csv('.\\Edstats_csv\\EdStatsData.csv')
df.head()

Wall time: 7.2 s


Unnamed: 0,Country Name,Country Code,Indicator Name,Indicator Code,1970,1971,1972,1973,1974,1975,...,2060,2065,2070,2075,2080,2085,2090,2095,2100,Unnamed: 69
0,Arab World,ARB,"Adjusted net enrolment rate, lower secondary, ...",UIS.NERA.2,,,,,,,...,,,,,,,,,,
1,Arab World,ARB,"Adjusted net enrolment rate, lower secondary, ...",UIS.NERA.2.F,,,,,,,...,,,,,,,,,,
2,Arab World,ARB,"Adjusted net enrolment rate, lower secondary, ...",UIS.NERA.2.GPI,,,,,,,...,,,,,,,,,,
3,Arab World,ARB,"Adjusted net enrolment rate, lower secondary, ...",UIS.NERA.2.M,,,,,,,...,,,,,,,,,,
4,Arab World,ARB,"Adjusted net enrolment rate, primary, both sex...",SE.PRM.TENR,54.822121,54.894138,56.209438,57.267109,57.991138,59.36554,...,,,,,,,,,,


In [4]:
%%time

# if a file is compressed (gz, zip), no need to de-compress

df = pd.read_csv('.\\Edstats_csv\\EdStatsData.zip')
df.head()

Wall time: 13.1 s


Unnamed: 0,Country Name,Country Code,Indicator Name,Indicator Code,1970,1971,1972,1973,1974,1975,...,2060,2065,2070,2075,2080,2085,2090,2095,2100,Unnamed: 69
0,Arab World,ARB,"Adjusted net enrolment rate, lower secondary, ...",UIS.NERA.2,,,,,,,...,,,,,,,,,,
1,Arab World,ARB,"Adjusted net enrolment rate, lower secondary, ...",UIS.NERA.2.F,,,,,,,...,,,,,,,,,,
2,Arab World,ARB,"Adjusted net enrolment rate, lower secondary, ...",UIS.NERA.2.GPI,,,,,,,...,,,,,,,,,,
3,Arab World,ARB,"Adjusted net enrolment rate, lower secondary, ...",UIS.NERA.2.M,,,,,,,...,,,,,,,,,,
4,Arab World,ARB,"Adjusted net enrolment rate, primary, both sex...",SE.PRM.TENR,54.822121,54.894138,56.209438,57.267109,57.991138,59.36554,...,,,,,,,,,,


In [5]:
%%time

# take a peak at a fraction of the data to see what you might be able to filter out at read time

df = pd.read_csv('.\\Edstats_csv\\EdStatsData.csv', 
                 nrows=10)
df.head()

Wall time: 27 ms


Unnamed: 0,Country Name,Country Code,Indicator Name,Indicator Code,1970,1971,1972,1973,1974,1975,...,2060,2065,2070,2075,2080,2085,2090,2095,2100,Unnamed: 69
0,Arab World,ARB,"Adjusted net enrolment rate, lower secondary, ...",UIS.NERA.2,,,,,,,...,,,,,,,,,,
1,Arab World,ARB,"Adjusted net enrolment rate, lower secondary, ...",UIS.NERA.2.F,,,,,,,...,,,,,,,,,,
2,Arab World,ARB,"Adjusted net enrolment rate, lower secondary, ...",UIS.NERA.2.GPI,,,,,,,...,,,,,,,,,,
3,Arab World,ARB,"Adjusted net enrolment rate, lower secondary, ...",UIS.NERA.2.M,,,,,,,...,,,,,,,,,,
4,Arab World,ARB,"Adjusted net enrolment rate, primary, both sex...",SE.PRM.TENR,54.822121,54.894138,56.209438,57.267109,57.991138,59.36554,...,,,,,,,,,,


In [6]:
%%time

# select only the columns you really need 

#df.columns
df = pd.read_csv('.\\Edstats_csv\\EdStatsData.csv', 
                 usecols=['Country Name', 'Country Code', 'Indicator Name', 'Indicator Code','1970', '1971'])
df.head()

Wall time: 3.89 s


Unnamed: 0,Country Name,Country Code,Indicator Name,Indicator Code,1970,1971
0,Arab World,ARB,"Adjusted net enrolment rate, lower secondary, ...",UIS.NERA.2,,
1,Arab World,ARB,"Adjusted net enrolment rate, lower secondary, ...",UIS.NERA.2.F,,
2,Arab World,ARB,"Adjusted net enrolment rate, lower secondary, ...",UIS.NERA.2.GPI,,
3,Arab World,ARB,"Adjusted net enrolment rate, lower secondary, ...",UIS.NERA.2.M,,
4,Arab World,ARB,"Adjusted net enrolment rate, primary, both sex...",SE.PRM.TENR,54.822121,54.894138


In [7]:
%%time

# use only as much precision as you need 

#df.columns
#df.dtypes
df = pd.read_csv('.\\Edstats_csv\\EdStatsData.csv', 
                 usecols=['Country Name', 'Country Code', 'Indicator Name', 'Indicator Code','1970', '1971'], 
                 dtype={'1970':np.float16, '1971':np.float16})
df.head()

Wall time: 4.89 s


Unnamed: 0,Country Name,Country Code,Indicator Name,Indicator Code,1970,1971
0,Arab World,ARB,"Adjusted net enrolment rate, lower secondary, ...",UIS.NERA.2,,
1,Arab World,ARB,"Adjusted net enrolment rate, lower secondary, ...",UIS.NERA.2.F,,
2,Arab World,ARB,"Adjusted net enrolment rate, lower secondary, ...",UIS.NERA.2.GPI,,
3,Arab World,ARB,"Adjusted net enrolment rate, lower secondary, ...",UIS.NERA.2.M,,
4,Arab World,ARB,"Adjusted net enrolment rate, primary, both sex...",SE.PRM.TENR,54.8125,54.90625


In [8]:
%%time

# read only a random sample of the data

import random

fname = '.\\Edstats_csv\\EdStatsData.csv' 
n = sum(1 for ln in open(fname))-1                         # -1 to account for header
s = n//5                                                                    # sample size of 20%
skip = sorted(random.sample(range(1, n+1), n-s))  # 1 to n+1 to account for header 
df = pd.read_csv(fname, skiprows=skip)

print (n, s, df.shape)
df.head()

886930 177386 (177386, 70)
Wall time: 5.75 s


Unnamed: 0,Country Name,Country Code,Indicator Name,Indicator Code,1970,1971,1972,1973,1974,1975,...,2060,2065,2070,2075,2080,2085,2090,2095,2100,Unnamed: 69
0,Arab World,ARB,"Adjusted net enrolment rate, lower secondary, ...",UIS.NERA.2.M,,,,,,,...,,,,,,,,,,
1,Arab World,ARB,"Adjusted net enrolment rate, upper secondary, ...",UIS.NERA.3.GPI,,,,,,,...,,,,,,,,,,
2,Arab World,ARB,Adjusted net intake rate to Grade 1 of primary...,UIS.NIRA.1.F,44.342487,44.255711,44.446136,45.270443,46.171814,47.062019,...,,,,,,,,,,
3,Arab World,ARB,Adjusted net intake rate to Grade 1 of primary...,UIS.NIRA.1.M,60.220436,60.386391,60.493145,61.059116,61.874802,62.355183,...,,,,,,,,,,
4,Arab World,ARB,"Adult illiterate population, 15+ years, female...",UIS.LP.AG15T99.F,,,,,,,...,,,,,,,,,,


In [9]:
%%time

# read smaller chunks at a time

chunkiter = pd.read_csv('.\\Edstats_csv\\EdStatsData.csv', 
                 chunksize=100000)

chunklst = []  
for c in chunkiter:  
    # typically of course you're not going to append all of c as shown below 
    # (since that will likely still give you a memory error), 
    # but instead append some filtered (and therefore smaller) part of c
    chunklst.append(c)
df = pd.concat(chunklst)

df.head()

Wall time: 8.29 s


Unnamed: 0,Country Name,Country Code,Indicator Name,Indicator Code,1970,1971,1972,1973,1974,1975,...,2060,2065,2070,2075,2080,2085,2090,2095,2100,Unnamed: 69
0,Arab World,ARB,"Adjusted net enrolment rate, lower secondary, ...",UIS.NERA.2,,,,,,,...,,,,,,,,,,
1,Arab World,ARB,"Adjusted net enrolment rate, lower secondary, ...",UIS.NERA.2.F,,,,,,,...,,,,,,,,,,
2,Arab World,ARB,"Adjusted net enrolment rate, lower secondary, ...",UIS.NERA.2.GPI,,,,,,,...,,,,,,,,,,
3,Arab World,ARB,"Adjusted net enrolment rate, lower secondary, ...",UIS.NERA.2.M,,,,,,,...,,,,,,,,,,
4,Arab World,ARB,"Adjusted net enrolment rate, primary, both sex...",SE.PRM.TENR,54.822121,54.894138,56.209438,57.267109,57.991138,59.36554,...,,,,,,,,,,
