In [1]:
import pandas as pd

from io import StringIO

data = "col1,col2,col3\na,b,1\na,b,2\nc,d,3"

pd.read_csv(StringIO(data))

Unnamed: 0,col1,col2,col3
0,a,b,1
1,a,b,2
2,c,d,3


In [2]:
#Returning a subset 
pd.read_csv(StringIO(data), usecols=lambda x: x.upper() in ["COL1", "COL3"])

Unnamed: 0,col1,col3
0,a,1
1,a,2
2,c,3


In [3]:
data = "col1,col2,col3\na,b,1\na,b,2\nc,d,3"

pd.read_csv(StringIO(data))

Unnamed: 0,col1,col2,col3
0,a,b,1
1,a,b,2
2,c,d,3


In [4]:
#skip_blank_linesboolean, default True. If True, skip over blank lines rather than interpreting as NaN values.
pd.read_csv(StringIO(data), skiprows=lambda x: x % 2 != 0)

Unnamed: 0,col1,col2,col3
0,a,b,2


In [5]:
#Specifying column data types. You can indicate the data type for the whole DataFrame or individual columns:
import numpy as np

data = "a,b,c,d\n1,2,3,4\n5,6,7,8\n9,10,11"

print(data)

a,b,c,d
1,2,3,4
5,6,7,8
9,10,11


In [6]:
df = pd.read_csv(StringIO(data), dtype=object)
df

Unnamed: 0,a,b,c,d
0,1,2,3,4.0
1,5,6,7,8.0
2,9,10,11,


In [7]:
df = pd.read_csv(StringIO(data), dtype={"b": object, "c": np.float64, "d": "Int64"})
df.dtypes

a      int64
b     object
c    float64
d      Int64
dtype: object

In [8]:
data = "col_1\n1\n2\n'A'\n4.22"
df = pd.read_csv(StringIO(data), converters={"col_1": str})
df["col_1"].apply(type).value_counts()

col_1
<class 'str'>    4
Name: count, dtype: int64

In [9]:
data = "col1,col2,col3\na,b,1\na,b,2\nc,d,3"
pd.read_csv(StringIO(data))

Unnamed: 0,col1,col2,col3
0,a,b,1
1,a,b,2
2,c,d,3


In [10]:
pd.read_csv(StringIO(data)).dtypes

col1    object
col2    object
col3     int64
dtype: object

In [11]:
pd.read_csv(StringIO(data), dtype="category").dtypes

col1    category
col2    category
col3    category
dtype: object

In [12]:
#Individual columns can be parsed as a Categorical using a dict specification:
pd.read_csv(StringIO(data), dtype={"col1": "category"}).dtypes

col1    category
col2      object
col3       int64
dtype: object

In [13]:
#Naming and using columns
#A file may or may not have a header row. pandas assumes the first row should be used as the column names:
data = "a,b,c\n1,2,3\n4,5,6\n7,8,9"

print(data)

a,b,c
1,2,3
4,5,6
7,8,9


In [14]:
pd.read_csv(StringIO(data))

Unnamed: 0,a,b,c
0,1,2,3
1,4,5,6
2,7,8,9


In [15]:
pd.read_csv(StringIO(data), names=["foo", "bar", "baz"], header=0)

Unnamed: 0,foo,bar,baz
0,1,2,3
1,4,5,6
2,7,8,9


In [16]:
pd.read_csv(StringIO(data), names=["foo", "bar", "baz"], header=None)

Unnamed: 0,foo,bar,baz
0,a,b,c
1,1,2,3
2,4,5,6
3,7,8,9


In [17]:
#Filtering columns (usecols)
data = "a,b,c,d\n1,2,3,foo\n4,5,6,bar\n7,8,9,baz"
pd.read_csv(StringIO(data))

Unnamed: 0,a,b,c,d
0,1,2,3,foo
1,4,5,6,bar
2,7,8,9,baz


In [18]:
pd.read_csv(StringIO(data), usecols=["b", "d"])

Unnamed: 0,b,d
0,2,foo
1,5,bar
2,8,baz


In [19]:
pd.read_csv(StringIO(data), usecols=[0, 2, 3])

Unnamed: 0,a,c,d
0,1,3,foo
1,4,6,bar
2,7,9,baz


In [20]:
'''
Dealing with Unicode data
The encoding argument should be used for encoded unicode data, which will result in byte strings being decoded to unicode in the result
'''
from io import BytesIO
data = b"word,length\n" b"Tr\xc3\xa4umen,7\n" b"Gr\xc3\xbc\xc3\x9fe,5"
data = data.decode("utf8").encode("latin-1")
df = pd.read_csv(BytesIO(data), encoding="latin-1")
df

Unnamed: 0,word,length
0,Träumen,7
1,Grüße,5


In [21]:
df["word"][1]

'Grüße'

In [22]:
#Booleans handling
data = "a,b,c\n1,Yes,2\n3,No,4"
pd.read_csv(StringIO(data))

Unnamed: 0,a,b,c
0,1,Yes,2
1,3,No,4


In [23]:
pd.read_csv(StringIO(data), true_values=["Yes"], false_values=["No"])

Unnamed: 0,a,b,c
0,1,True,2
1,3,False,4


In [24]:
#Handling bad lines
data = "a,b,c\n1,2,3\n4,5,6,7\n8,9,10"
pd.read_csv(StringIO(data), on_bad_lines="skip")

Unnamed: 0,a,b,c
0,1,2,3
1,8,9,10


In [25]:
#Quoting and escpe characters
data = 'a,b\n"hello, \\"Bob\\", nice to see you",5'
print(data)

a,b
"hello, \"Bob\", nice to see you",5


In [26]:
pd.read_csv(StringIO(data), escapechar="\\")

Unnamed: 0,a,b
0,"hello, ""Bob"", nice to see you",5


In [29]:
# Column specifications are a list of half-intervals
colspecs = [(0, 6), (8, 20), (21, 33), (34, 43)]

df = pd.read_fwf(r"C:\Users\Lenovo\Downloads\Compressed\archive\resume_data.csv", colspecs=colspecs, header=None, index_col=0)

df

Unnamed: 0_level_0,1,2,3
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
addres,career_objec,"ive,skills,e",ucational
",Big d",a analytics,orking and d,tabase wa
Troubl,hooting,,
Collab,ation,,
Docume,ation,,
...,...,...,...
Compli,ce,,
Site V,"its"",""AutoCA",,
ETABS,,,
Micros,t Office Sui,e,


In [32]:
dfj = pd.DataFrame(np.random.randn(5, 2), columns=list("AB"))
json = dfj.to_json()
json

'{"A":{"0":-1.4259500671,"1":1.2239581744,"2":-0.8799191929,"3":0.7785973935,"4":1.0647614076},"B":{"0":1.2063177999,"1":-0.7412305071,"2":0.701068966,"3":-0.3941271842,"4":0.0131758349}}'

In [34]:
#Date handling
dfd = pd.DataFrame(np.random.randn(5, 2), columns=list("AB"))

dfd["date"] = pd.Timestamp("20130101")

dfd = dfd.sort_index(axis=1, ascending=False)

json = dfd.to_json(date_format="iso")

json

'{"date":{"0":"2013-01-01T00:00:00.000","1":"2013-01-01T00:00:00.000","2":"2013-01-01T00:00:00.000","3":"2013-01-01T00:00:00.000","4":"2013-01-01T00:00:00.000"},"B":{"0":1.4646943982,"1":-1.0591096762,"2":-0.3769307384,"3":0.0759773716,"4":-0.5894944786},"A":{"0":0.0096280034,"1":0.9523950318,"2":0.0456622965,"3":-0.8727695464,"4":-0.2614709754}}'

In [35]:
dfj2 = dfj.copy()

dfj2["date"] = pd.Timestamp("20130101")

dfj2["ints"] = list(range(5))

dfj2["bools"] = True

dfj2.index = pd.date_range("20130101", periods=5)

dfj2.to_json("test.json")

with open("test.json") as fh:
    print(fh.read())

{"A":{"1356998400000":-1.4259500671,"1357084800000":1.2239581744,"1357171200000":-0.8799191929,"1357257600000":0.7785973935,"1357344000000":1.0647614076},"B":{"1356998400000":1.2063177999,"1357084800000":-0.7412305071,"1357171200000":0.701068966,"1357257600000":-0.3941271842,"1357344000000":0.0131758349},"date":{"1356998400000":1356,"1357084800000":1356,"1357171200000":1356,"1357257600000":1356,"1357344000000":1356},"ints":{"1356998400000":0,"1357084800000":1,"1357171200000":2,"1357257600000":3,"1357344000000":4},"bools":{"1356998400000":true,"1357084800000":true,"1357171200000":true,"1357257600000":true,"1357344000000":true}}
