In [1]:
import numpy as np
import pandas as pd

##### data alignment is intrinsic

#### Series is one-dimensional labeled array capable of holding any data type.
#### s = pd.Series(data, index=index)
- a python dict
- an ndarray
- a scalar value



##### From ndarray

In [6]:
s = pd.Series(np.random.randn(5), index=["a","b","c","d","e"])
s

a   -0.256920
b   -1.849074
c   -1.047448
d    0.893396
e   -0.456790
dtype: float64

In [7]:
s.index

Index(['a', 'b', 'c', 'd', 'e'], dtype='object')

In [8]:
pd.Series(np.random.randn(5))

0   -0.755346
1   -0.935909
2    0.590450
3    1.611297
4    1.404863
dtype: float64

##### From dict

In [9]:
d = {"b":1, "a":0, "c":2}
pd.Series(d)

b    1
a    0
c    2
dtype: int64

In [12]:
d = {"a": 0.0, "b": 1.0, "c": 2.0}
pd.Series(d)

a    0.0
b    1.0
c    2.0
dtype: float64

In [14]:
pd.Series(d, index=["b","c","d","a"])
# NaN (not a number) is the standard missing data marker used in pandas

b    1.0
c    2.0
d    NaN
a    0.0
dtype: float64

##### From scalar value

In [15]:
# if data is a scalar value, an index must be provided, The value will be repeated to match the lenght of index
pd.Series(5.0,  index=["a","b","c","d","e"])

a    5.0
b    5.0
c    5.0
d    5.0
e    5.0
dtype: float64

##### Series is ndarray-like

In [16]:
# Series acts very similarly to a ndarray and is a valid argument to most NumPy functions
s[0]

-0.2569201735521316

In [17]:
s[:3]

a   -0.256920
b   -1.849074
c   -1.047448
dtype: float64

In [18]:
s[s>s.median()]

a   -0.256920
d    0.893396
dtype: float64

In [19]:
s[[4,3,1]]

e   -0.456790
d    0.893396
b   -1.849074
dtype: float64

In [20]:
np.exp(s)

a    0.773430
b    0.157383
c    0.350832
d    2.443412
e    0.633313
dtype: float64

In [21]:
# for actual array backing a series
s.array

<PandasArray>
[-0.2569201735521316, -1.8490743361708586, -1.0474476859328117,
  0.8933955577029516, -0.4567903997203833]
Length: 5, dtype: float64

In [22]:
# for actual ndarrray
s.to_numpy()

array([-0.25692017, -1.84907434, -1.04744769,  0.89339556, -0.4567904 ])

##### Series is dict-like

In [24]:
#A series is like a fixed-size dict in that you can get and set value by index label:

s["a"]

-0.2569201735521316

In [26]:
s["e"]=12.0
s

a    -0.256920
b    -1.849074
c    -1.047448
d     0.893396
e    12.000000
dtype: float64

In [27]:
"e" in s

True

In [28]:
"f" in s

False

In [33]:
# if a label is not contained, an exception is raised: eg:s["f"]
s.get("f")
s.get("f", np.nan)

nan

#### Vectorized operations and label alignment with Series

In [34]:
s+s

a    -0.513840
b    -3.698149
c    -2.094895
d     1.786791
e    24.000000
dtype: float64

In [39]:
s*2

a    -0.513840
b    -3.698149
c    -2.094895
d     1.786791
e    24.000000
dtype: float64

In [40]:
np.exp(s)

a         0.773430
b         0.157383
c         0.350832
d         2.443412
e    162754.791419
dtype: float64

In [43]:
# the result of an operation between unaligned Series will have the union of the indexes involved
# if the label is not found in one Series or the other, the result will be marked as missing NaN

s[1:] + s[:-1]

# you can drop labels with missing data via the dropna function

a         NaN
b   -3.698149
c   -2.094895
d    1.786791
e         NaN
dtype: float64

#### Name attribute

In [44]:
# Series can also have a name attribute
s = pd.Series(np.random.randn(5), name="something")
s

0    0.382373
1   -1.159646
2   -1.537768
3   -0.198091
4    0.679587
Name: something, dtype: float64

In [45]:
s.name

'something'

In [46]:
s2=s.rename("different")
s2.name

'different'

In [47]:
# note that s and s2 refers to different objects

## DataFrame

**DataFrame** is a 2-dimensional labeled data structure with columns of potentially different types.
DataFrame accepts many different kinds of input:
- Dict of 1D ndarrays, list, dict, or Series
- 2-D numpy.ndarray
- Structured or record ndarray
- A Series
- Another DataFrame

#### From dict of Series or dicts

In [51]:
d = {
    "one": pd.Series([1.0, 2.0, 3.0], index=["a", "b", "c"]),
    "two": pd.Series([1.0, 2.0, 3.0, 4.0], index=["a","b","c","d"])
}
df = pd.DataFrame(d)
df

Unnamed: 0,one,two
a,1.0,1.0
b,2.0,2.0
c,3.0,3.0
d,,4.0


In [52]:
pd.DataFrame(d, index=["d","b","a"])

Unnamed: 0,one,two
d,,4.0
b,2.0,2.0
a,1.0,1.0


In [53]:
pd.DataFrame(d, index=["d","b","a"], columns=["two", "three"])

Unnamed: 0,two,three
d,4.0,
b,2.0,
a,1.0,


In [54]:
df.index

Index(['a', 'b', 'c', 'd'], dtype='object')

In [55]:
df.columns

Index(['one', 'two'], dtype='object')

#### From dict of ndarrays/lists

In [56]:
# The ndarrays must all be the same length. If an index is passed, it must clearly also be the same length as the arrays.
# If no index is passed, the result will be range(n), where n is the array length.

d={"one": [1.0, 2.0, 3.0, 4.0], "two": [4.0, 3.0, 2.0, 1.0]}

In [57]:
pd.DataFrame(d)

Unnamed: 0,one,two
0,1.0,4.0
1,2.0,3.0
2,3.0,2.0
3,4.0,1.0


In [58]:
pd.DataFrame(d, index=["a","b","c","d"])

Unnamed: 0,one,two
a,1.0,4.0
b,2.0,3.0
c,3.0,2.0
d,4.0,1.0


#### From structured or record array

In [65]:
data = np.zeros((2,), dtype=[("A", "i4"), ("B", "f4"), ("C", "a10")])
data[:] = [(1, 2.0, "Hello"), (2, 3.0, "World")]
data

array([(1, 2., b'Hello'), (2, 3., b'World')],
      dtype=[('A', '<i4'), ('B', '<f4'), ('C', 'S10')])

In [66]:
pd.DataFrame(data)

Unnamed: 0,A,B,C
0,1,2.0,b'Hello'
1,2,3.0,b'World'


In [67]:
pd.DataFrame(data, index=["first", "second"])

Unnamed: 0,A,B,C
first,1,2.0,b'Hello'
second,2,3.0,b'World'


In [68]:
pd.DataFrame(data, columns =["C","A","B"])

Unnamed: 0,C,A,B
0,b'Hello',1,2.0
1,b'World',2,3.0


#### From a list of dicts

In [71]:
data2= [{"a":1, "b":2}, {"a":5, "b":10, "c":20}]
pd.DataFrame(data2)

Unnamed: 0,a,b,c
0,1,2,
1,5,10,20.0


In [72]:
pd.DataFrame(data2, index=["first", "second"])

Unnamed: 0,a,b,c
first,1,2,
second,5,10,20.0


In [73]:
pd.DataFrame(data2, columns=["a","b"])

Unnamed: 0,a,b
0,1,2
1,5,10


#### From a dict of tuples

In [79]:
pd.DataFrame(
{
    ("a","b"): {("A","B"):1, ("A","C"): 2},
    ("a","a"): {("A","C"):3 ,("A","B"): 4},
    ("a","c"): {("A","B"):5 ,("A","C"): 6},
    ("b","a"): {("A","C"):7 ,("A","B"): 8},
    ("b","b"): {("A","D"):9, ("A","B"):10}
    
}
)

Unnamed: 0_level_0,Unnamed: 1_level_0,a,a,a,b,b
Unnamed: 0_level_1,Unnamed: 1_level_1,b,a,c,a,b
A,B,1.0,4.0,5.0,8.0,10.0
A,C,2.0,3.0,6.0,7.0,
A,D,,,,,9.0


#### From a list of namedtuples

In [81]:
from collections import namedtuple

Point = namedtuple("Point", "x y")
pd.DataFrame([Point(0, 0), Point(0, 3), (2, 3)])

Unnamed: 0,x,y
0,0,0
1,0,3
2,2,3


In [82]:
Point3D = namedtuple("Point3D", "x y z")
pd.DataFrame([Point3D(0, 0, 0), Point3D(0, 3, 5), Point(2, 3)])

Unnamed: 0,x,y,z
0,0,0,0.0
1,0,3,5.0
2,2,3,


#### Alternate constructors
##### DataFrame.from_dict

In [91]:
# DataFrame.from_dict takes a dict of dicts or dit of array-like sequences and returns a DataFrame.

pd.DataFrame.from_dict(dict([("A", [1, 2, 3]), ("B", [4, 5, 6])]))

Unnamed: 0,A,B
0,1,4
1,2,5
2,3,6


In [95]:
# if you pass orient='index', the key will be the row labels.
pd.DataFrame.from_dict( 
    dict([("A", [1, 2, 3]), ("B", [4, 5, 6])]),
    orient="index",
    columns=["one", "two", "three"]
)

Unnamed: 0,one,two,three
A,1,2,3
B,4,5,6


##### DataFrame.from_records

In [96]:
# DataFrame.from_records takes a list of tuples or an ndarray with structured dtype.
data

array([(1, 2., b'Hello'), (2, 3., b'World')],
      dtype=[('A', '<i4'), ('B', '<f4'), ('C', 'S10')])

In [97]:
pd.DataFrame.from_records(data, index="C")

Unnamed: 0_level_0,A,B
C,Unnamed: 1_level_1,Unnamed: 2_level_1
b'Hello',1,2.0
b'World',2,3.0


### Column selection, addition, deletion

In [98]:
df["one"]

a    1.0
b    2.0
c    3.0
d    NaN
Name: one, dtype: float64

In [101]:
df["three"] = df["one"] * df["two"]
df["flag"] = df["one"] > 2
df

Unnamed: 0,one,two,three,flag
a,1.0,1.0,1.0,False
b,2.0,2.0,4.0,False
c,3.0,3.0,9.0,True
d,,4.0,,False


In [105]:
# columns can be deleted or popped like with a dict
del df["two"]
three = df.pop("three")
df

Unnamed: 0,one,flag
a,1.0,False
b,2.0,False
c,3.0,True
d,,False


In [108]:
df["foo"]="bar"
df

Unnamed: 0,one,flag,foo
a,1.0,False,bar
b,2.0,False,bar
c,3.0,True,bar
d,,False,bar


In [110]:
# inserting
df["one_trunc"] = df["one"][:2]
df

Unnamed: 0,one,flag,foo,one_trunc
a,1.0,False,bar,1.0
b,2.0,False,bar,2.0
c,3.0,True,bar,
d,,False,bar,


In [113]:
df.insert(1, "bar", df["one"])
df

Unnamed: 0,one,bar,flag,foo,one_trunc
a,1.0,1.0,False,bar,1.0
b,2.0,2.0,False,bar,2.0
c,3.0,3.0,True,bar,
d,,,False,bar,


#### Asssigning new columns in method chains

In [123]:
iris = pd.read_csv("iris.data", names=["SepalLength","SepalWidth","PetalLength","PetalWidth","Name"])
iris.head()

Unnamed: 0,SepalLength,SepalWidth,PetalLength,PetalWidth,Name
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [124]:
# assign always returns a copy of the data, leaving the original DataFrame untouched
iris.assign(sepal_ration=lambda x: (x["SepalWidth"]/x["SepalLength"])).head()

Unnamed: 0,SepalLength,SepalWidth,PetalLength,PetalWidth,Name,sepal_ration
0,5.1,3.5,1.4,0.2,Iris-setosa,0.686275
1,4.9,3.0,1.4,0.2,Iris-setosa,0.612245
2,4.7,3.2,1.3,0.2,Iris-setosa,0.680851
3,4.6,3.1,1.5,0.2,Iris-setosa,0.673913
4,5.0,3.6,1.4,0.2,Iris-setosa,0.72


In [126]:
dfa = pd.DataFrame({"A":[1, 2, 3], "B":[4, 5, 6]})
dfa.assign(C=lambda x: x["A"] + x["B"], D=lambda x: x["A"] + x["C"])

Unnamed: 0,A,B,C,D
0,1,4,5,6
1,2,5,7,9
2,3,6,9,12


#### Indexing/ selection

| Operation      | Syntax | Result|
| ----------- | ----------- | ------|
| Select column | df[col] | Series|
| Select row by label | df.loc[label]| Series|
| Select row by integer location | df.iloc[loc] | Series |
| Slice rows | df[5:10] | DataFrame |
| Select rows by boolean vector | df[bool_vec] | DataFrame |

In [129]:
df

Unnamed: 0,one,bar,flag,foo,one_trunc
a,1.0,1.0,False,bar,1.0
b,2.0,2.0,False,bar,2.0
c,3.0,3.0,True,bar,
d,,,False,bar,


In [130]:
df.loc["b"]

one              2
bar              2
flag         False
foo            bar
one_trunc        2
Name: b, dtype: object

In [132]:
df.iloc[2]

one             3
bar             3
flag         True
foo           bar
one_trunc     NaN
Name: c, dtype: object

#### Data alignment and arithmetic

In [133]:
# Data alignment between DataFrame objects automatically align on both the columns and the index (row lables)
df=pd.DataFrame(np.random.randn(10, 4), columns=["A", "B", "C", "D"])
df2=pd.DataFrame(np.random.randn(7,3), columns=["A","B", "C"])

In [134]:
df+df2

Unnamed: 0,A,B,C,D
0,-1.608739,1.725448,-0.81034,
1,-1.307813,0.752213,1.151402,
2,3.136026,-2.003914,0.454505,
3,0.992711,2.032895,-0.737327,
4,1.61038,-1.558648,0.604081,
5,-1.482753,0.240306,0.80046,
6,0.450548,-0.033669,-0.615715,
7,,,,
8,,,,
9,,,,


In [135]:
df- df.iloc[0]

Unnamed: 0,A,B,C,D
0,0.0,0.0,0.0,0.0
1,0.049823,1.519406,-0.419006,0.311296
2,2.615737,0.202184,0.544685,1.656988
3,-0.633865,1.521333,-0.40169,1.718047
4,1.882928,-0.944155,-0.591492,-0.736519
5,-0.587629,0.476882,-0.954578,-0.07122
6,0.705107,1.321501,-0.321836,1.402506
7,-0.033107,-0.359232,-0.189351,-0.279653
8,0.565297,-0.543807,-0.41559,-0.473101
9,1.178019,1.767407,0.124514,0.711145


In [137]:
# Boolean operators work as well
df1 = pd.DataFrame({"a": [1, 0, 1], "b": [0, 1, 1]}, dtype=bool)
df2 = pd.DataFrame({"a": [0, 1, 1], "b": [1, 1, 0]}, dtype=bool)
df1 & df2

Unnamed: 0,a,b
0,False,False
1,False,True
2,True,False


In [138]:
df1 | df2

Unnamed: 0,a,b
0,True,True
1,True,True
2,True,True


In [139]:
df1 ^ df2

Unnamed: 0,a,b
0,True,True
1,True,False
2,False,True


In [140]:
-df1

Unnamed: 0,a,b
0,False,True
1,True,False
2,False,False


#### Transposing

In [144]:
# To transpose, access T attribute (also the transpose function)
df[:5].T

Unnamed: 0,0,1,2,3,4
A,-0.546575,-0.496752,2.069162,-1.18044,1.336353
B,-0.369044,1.150362,-0.166861,1.152289,-1.3132
C,0.300832,-0.118174,0.845518,-0.100857,-0.290659
D,-0.610926,-0.29963,1.046061,1.107121,-1.347445


#### Console Display

In [146]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   A       10 non-null     float64
 1   B       10 non-null     float64
 2   C       10 non-null     float64
 3   D       10 non-null     float64
dtypes: float64(4)
memory usage: 448.0 bytes


In [147]:
df.iloc[1:4, 1:5].to_string()

'          B         C         D\n1  1.150362 -0.118174 -0.299630\n2 -0.166861  0.845518  1.046061\n3  1.152289 -0.100857  1.107121'