In [3]:
import pandas as pd

from io import StringIO

# Sample CSV text: one header line followed by three records.
data = (
    "col1,col2,col3\n"
    "a,b,1\n"
    "a,b,2\n"
    "c,d,3"
)

# Wrap the text in StringIO so read_csv can consume it like a file.
pd.read_csv(
    StringIO(data),
)

Unnamed: 0,col1,col2,col3
0,a,b,1
1,a,b,2
2,c,d,3


In [4]:
# Returning a subset of columns: usecols may be a callable that receives each
# column name and returns True for the columns to keep.
pd.read_csv(
    StringIO(data),
    usecols=lambda name: name.upper() in ("COL1", "COL3"),
)

Unnamed: 0,col1,col3
0,a,1
1,a,2
2,c,3


In [5]:
# Same three-record sample, written one CSV line per literal.
data = (
    "col1,col2,col3\n"
    "a,b,1\n"
    "a,b,2\n"
    "c,d,3"
)

# Display the full frame as the cell result.
pd.read_csv(
    StringIO(data),
)

Unnamed: 0,col1,col2,col3
0,a,b,1
1,a,b,2
2,c,d,3


In [6]:
# skiprows accepts a callable that is evaluated against each 0-based line number
# of the input (the header is line 0); lines for which it returns True are skipped.
# Here every odd-numbered line is dropped, leaving the header and the line at index 2.
pd.read_csv(
    StringIO(data),
    skiprows=lambda line_no: line_no % 2 != 0,
)

Unnamed: 0,col1,col2,col3
0,a,b,2


In [7]:
# Specifying column data types.
# A dtype can be given for the whole DataFrame or for individual columns.
import numpy as np

# The last record is one field short, so column "d" has no value on that row.
data = (
    "a,b,c,d\n"
    "1,2,3,4\n"
    "5,6,7,8\n"
    "9,10,11"
)

print(data)

a,b,c,d
1,2,3,4
5,6,7,8
9,10,11


In [8]:
# Force every column to the generic object dtype.
df = pd.read_csv(
    StringIO(data),
    dtype=object,
)

# Show the resulting frame.
df

Unnamed: 0,a,b,c,d
0,1,2,3,4.0
1,5,6,7,8.0
2,9,10,11,


In [9]:
# Per-column dtypes: columns absent from the mapping keep the inferred type.
# "Int64" (capital I) is pandas' nullable integer dtype.
df = pd.read_csv(
    StringIO(data),
    dtype={
        "b": object,
        "c": np.float64,
        "d": "Int64",
    },
)

# Confirm the dtype each column ended up with.
df.dtypes

a      int64
b     object
c    float64
d      Int64
dtype: object

In [10]:
# A single column whose values look like ints, a quoted letter, and a float.
data = (
    "col_1\n"
    "1\n"
    "2\n"
    "'A'\n"
    "4.22"
)

# The converter is applied to each value of the column, so all of them become str.
df = pd.read_csv(
    StringIO(data),
    converters={"col_1": str},
)

# Tally the Python type of each element in the column.
(
    df["col_1"]
    .apply(type)
    .value_counts()
)

col_1
<class 'str'>    4
Name: count, dtype: int64

In [11]:
# Switch back to the three-column sample for the categorical examples.
data = (
    "col1,col2,col3\n"
    "a,b,1\n"
    "a,b,2\n"
    "c,d,3"
)
pd.read_csv(
    StringIO(data),
)

Unnamed: 0,col1,col2,col3
0,a,b,1
1,a,b,2
2,c,d,3


In [12]:
# Dtypes pandas infers when no dtype argument is supplied.
(
    pd.read_csv(StringIO(data))
    .dtypes
)

col1    object
col2    object
col3     int64
dtype: object

In [13]:
# A single dtype string applies to every column: all are parsed as Categorical.
(
    pd.read_csv(
        StringIO(data),
        dtype="category",
    )
    .dtypes
)

col1    category
col2    category
col3    category
dtype: object

In [14]:
# Individual columns can be parsed as Categorical by passing a dict
# that maps column name -> dtype; unlisted columns are inferred as usual.
(
    pd.read_csv(
        StringIO(data),
        dtype={"col1": "category"},
    )
    .dtypes
)

col1    category
col2      object
col3       int64
dtype: object

In [15]:
# Naming and using columns
# A file may or may not have a header row; by default pandas treats the
# first row as the column names.
data = (
    "a,b,c\n"
    "1,2,3\n"
    "4,5,6\n"
    "7,8,9"
)

print(data)

a,b,c
1,2,3
4,5,6
7,8,9


In [16]:
# Default behaviour: the first line supplies the column names.
pd.read_csv(
    StringIO(data),
)

Unnamed: 0,a,b,c
0,1,2,3
1,4,5,6
2,7,8,9


In [17]:
# header=0 marks the first line as the header, and names= replaces it
# with the supplied column labels.
pd.read_csv(
    StringIO(data),
    header=0,
    names=["foo", "bar", "baz"],
)

Unnamed: 0,foo,bar,baz
0,1,2,3
1,4,5,6
2,7,8,9


In [18]:
# header=None means no line is treated as a header, so the original
# first line is kept as an ordinary data row under the supplied names.
pd.read_csv(
    StringIO(data),
    header=None,
    names=["foo", "bar", "baz"],
)

Unnamed: 0,foo,bar,baz
0,a,b,c
1,1,2,3
2,4,5,6
3,7,8,9


In [20]:
# Filtering columns (usecols)
# Four-column sample: three numeric columns plus one text column.
data = (
    "a,b,c,d\n"
    "1,2,3,foo\n"
    "4,5,6,bar\n"
    "7,8,9,baz"
)
pd.read_csv(
    StringIO(data),
)

Unnamed: 0,a,b,c,d
0,1,2,3,foo
1,4,5,6,bar
2,7,8,9,baz


In [21]:
# Keep only the columns whose header labels are listed.
pd.read_csv(
    StringIO(data),
    usecols=["b", "d"],
)

Unnamed: 0,b,d
0,2,foo
1,5,bar
2,8,baz


In [22]:
# Columns can also be selected by 0-based position instead of by label.
pd.read_csv(
    StringIO(data),
    usecols=[0, 2, 3],
)

Unnamed: 0,a,c,d
0,1,3,foo
1,4,6,bar
2,7,9,baz


In [23]:
# Dealing with Unicode data
# The encoding argument should be used for encoded unicode data; the byte
# strings are then decoded to unicode in the result.
from io import BytesIO

# Start from UTF-8 bytes and transcode them to Latin-1, so the buffer handed
# to read_csv is Latin-1 encoded.
data = (
    b"word,length\n"
    b"Tr\xc3\xa4umen,7\n"
    b"Gr\xc3\xbc\xc3\x9fe,5"
).decode("utf8").encode("latin-1")

# Tell the parser which encoding the bytes are in.
df = pd.read_csv(
    BytesIO(data),
    encoding="latin-1",
)
df

Unnamed: 0,word,length
0,Träumen,7
1,Grüße,5


In [24]:
# Fetch the "word" value at index label 1 with a single label-based lookup.
df.loc[1, "word"]

'Grüße'