In [1]:
import pandas as pd
import numpy as np

In [2]:
series = pd.Series([1,3, 5, 6, 8,  np.nan])
series

0    1.0
1    3.0
2    5.0
3    6.0
4    8.0
5    NaN
dtype: float64

In [3]:

df = pd.DataFrame({'A': [1,2,3,4,5,6,7,8,9],
                   'B': [11,12,13,14,15,16,17,18,19] 
})
df

Unnamed: 0,A,B
0,1,11
1,2,12
2,3,13
3,4,14
4,5,15
5,6,16
6,7,17
7,8,18
8,9,19


# Creating a DataFrame by passing a NumPy array with a datetime index using date_range() and labeled columns:

In [4]:
dates = pd.date_range("20130101", periods=6)
dates

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [5]:
dates.shape

(6,)

In [6]:
df = pd.DataFrame(np.random.randn(6, 5), index=dates, columns=list("ABCDF"))
df

Unnamed: 0,A,B,C,D,F
2013-01-01,1.403103,0.805152,-0.273527,0.801805,-2.131466
2013-01-02,-1.016615,1.690678,0.330468,-0.250786,-0.320263
2013-01-03,0.060715,2.849912,-1.200424,-0.38449,1.198775
2013-01-04,1.572195,-1.235836,0.554144,0.632769,0.699379
2013-01-05,-2.028985,1.236605,-0.628177,2.016544,-0.03216
2013-01-06,0.369782,-0.719802,0.583105,-0.918988,-0.083442


# Creating a DataFrame by passing a dictionary of objects where the keys are the column labels and the values are the column values.

In [7]:
df2 = pd.DataFrame(
    {
        "A": 2.0,
        "B": pd.Timestamp("20130102"),
        "C": pd.Series(1, index=list(range(4)), dtype="float32"),
        "D": np.array([3] * 4, dtype="int32"),
        "E": pd.Categorical(["test", "train", "test", "train"]),
        "F": "foo",
    }
)
df2

Unnamed: 0,A,B,C,D,E,F
0,2.0,2013-01-02,1.0,3,test,foo
1,2.0,2013-01-02,1.0,3,train,foo
2,2.0,2013-01-02,1.0,3,test,foo
3,2.0,2013-01-02,1.0,3,train,foo


In [8]:
df2.dtypes

A          float64
B    datetime64[s]
C          float32
D            int32
E         category
F           object
dtype: object

# Viewing data

In [9]:
df.head()

Unnamed: 0,A,B,C,D,F
2013-01-01,1.403103,0.805152,-0.273527,0.801805,-2.131466
2013-01-02,-1.016615,1.690678,0.330468,-0.250786,-0.320263
2013-01-03,0.060715,2.849912,-1.200424,-0.38449,1.198775
2013-01-04,1.572195,-1.235836,0.554144,0.632769,0.699379
2013-01-05,-2.028985,1.236605,-0.628177,2.016544,-0.03216


In [10]:
df.tail()

Unnamed: 0,A,B,C,D,F
2013-01-02,-1.016615,1.690678,0.330468,-0.250786,-0.320263
2013-01-03,0.060715,2.849912,-1.200424,-0.38449,1.198775
2013-01-04,1.572195,-1.235836,0.554144,0.632769,0.699379
2013-01-05,-2.028985,1.236605,-0.628177,2.016544,-0.03216
2013-01-06,0.369782,-0.719802,0.583105,-0.918988,-0.083442


In [11]:
df.index

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [12]:
df.columns

Index(['A', 'B', 'C', 'D', 'F'], dtype='object')

In [13]:
df.describe

<bound method NDFrame.describe of                    A         B         C         D         F
2013-01-01  1.403103  0.805152 -0.273527  0.801805 -2.131466
2013-01-02 -1.016615  1.690678  0.330468 -0.250786 -0.320263
2013-01-03  0.060715  2.849912 -1.200424 -0.384490  1.198775
2013-01-04  1.572195 -1.235836  0.554144  0.632769  0.699379
2013-01-05 -2.028985  1.236605 -0.628177  2.016544 -0.032160
2013-01-06  0.369782 -0.719802  0.583105 -0.918988 -0.083442>

Return a NumPy representation of the underlying data with DataFrame.to_numpy() without the index or column labels: Numpy Array have one dtype for entire array while Dataframe have onedtype per column

In [14]:
df.to_numpy()

array([[ 1.40310307,  0.80515155, -0.27352706,  0.80180543, -2.13146617],
       [-1.0166146 ,  1.69067777,  0.33046791, -0.25078626, -0.32026307],
       [ 0.06071518,  2.84991237, -1.20042444, -0.38448955,  1.19877546],
       [ 1.57219476, -1.23583594,  0.55414442,  0.63276925,  0.69937874],
       [-2.0289846 ,  1.23660467, -0.62817714,  2.0165443 , -0.0321602 ],
       [ 0.36978167, -0.71980201,  0.58310539, -0.91898798, -0.08344229]])

In [15]:
df2.dtypes

A          float64
B    datetime64[s]
C          float32
D            int32
E         category
F           object
dtype: object

In [16]:
df.dtypes

A    float64
B    float64
C    float64
D    float64
F    float64
dtype: object

In [17]:
df2.to_numpy()

array([[2.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'test', 'foo'],
       [2.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'train', 'foo'],
       [2.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'test', 'foo'],
       [2.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'train', 'foo']],
      dtype=object)

In [18]:
df2.dtypes

A          float64
B    datetime64[s]
C          float32
D            int32
E         category
F           object
dtype: object

In [19]:
df.describe()

Unnamed: 0,A,B,C,D,F
count,6.0,6.0,6.0,6.0,6.0
mean,0.060033,0.771118,-0.105735,0.316143,-0.11153
std,1.393526,1.525485,0.721063,1.055019,1.140638
min,-2.028985,-1.235836,-1.200424,-0.918988,-2.131466
25%,-0.747282,-0.338564,-0.539515,-0.351064,-0.261058
50%,0.215248,1.020878,0.02847,0.190991,-0.057801
75%,1.144773,1.577159,0.498225,0.759546,0.516494
max,1.572195,2.849912,0.583105,2.016544,1.198775


In [20]:
df.head()

Unnamed: 0,A,B,C,D,F
2013-01-01,1.403103,0.805152,-0.273527,0.801805,-2.131466
2013-01-02,-1.016615,1.690678,0.330468,-0.250786,-0.320263
2013-01-03,0.060715,2.849912,-1.200424,-0.38449,1.198775
2013-01-04,1.572195,-1.235836,0.554144,0.632769,0.699379
2013-01-05,-2.028985,1.236605,-0.628177,2.016544,-0.03216


## Transposing your data:

In [21]:
df.T

Unnamed: 0,2013-01-01,2013-01-02,2013-01-03,2013-01-04,2013-01-05,2013-01-06
A,1.403103,-1.016615,0.060715,1.572195,-2.028985,0.369782
B,0.805152,1.690678,2.849912,-1.235836,1.236605,-0.719802
C,-0.273527,0.330468,-1.200424,0.554144,-0.628177,0.583105
D,0.801805,-0.250786,-0.38449,0.632769,2.016544,-0.918988
F,-2.131466,-0.320263,1.198775,0.699379,-0.03216,-0.083442


In [22]:
df.head()

Unnamed: 0,A,B,C,D,F
2013-01-01,1.403103,0.805152,-0.273527,0.801805,-2.131466
2013-01-02,-1.016615,1.690678,0.330468,-0.250786,-0.320263
2013-01-03,0.060715,2.849912,-1.200424,-0.38449,1.198775
2013-01-04,1.572195,-1.235836,0.554144,0.632769,0.699379
2013-01-05,-2.028985,1.236605,-0.628177,2.016544,-0.03216


DataFrame.sort_index() sorts by an axis:

In [23]:
df.sort_index(axis=1, ascending=False)

Unnamed: 0,F,D,C,B,A
2013-01-01,-2.131466,0.801805,-0.273527,0.805152,1.403103
2013-01-02,-0.320263,-0.250786,0.330468,1.690678,-1.016615
2013-01-03,1.198775,-0.38449,-1.200424,2.849912,0.060715
2013-01-04,0.699379,0.632769,0.554144,-1.235836,1.572195
2013-01-05,-0.03216,2.016544,-0.628177,1.236605,-2.028985
2013-01-06,-0.083442,-0.918988,0.583105,-0.719802,0.369782


In [24]:
df.sort_index(axis=0, ascending=True)

Unnamed: 0,A,B,C,D,F
2013-01-01,1.403103,0.805152,-0.273527,0.801805,-2.131466
2013-01-02,-1.016615,1.690678,0.330468,-0.250786,-0.320263
2013-01-03,0.060715,2.849912,-1.200424,-0.38449,1.198775
2013-01-04,1.572195,-1.235836,0.554144,0.632769,0.699379
2013-01-05,-2.028985,1.236605,-0.628177,2.016544,-0.03216
2013-01-06,0.369782,-0.719802,0.583105,-0.918988,-0.083442


DataFrame.sort_values() sorts by values:

In [25]:
df.sort_values(by="B")

Unnamed: 0,A,B,C,D,F
2013-01-04,1.572195,-1.235836,0.554144,0.632769,0.699379
2013-01-06,0.369782,-0.719802,0.583105,-0.918988,-0.083442
2013-01-01,1.403103,0.805152,-0.273527,0.801805,-2.131466
2013-01-05,-2.028985,1.236605,-0.628177,2.016544,-0.03216
2013-01-02,-1.016615,1.690678,0.330468,-0.250786,-0.320263
2013-01-03,0.060715,2.849912,-1.200424,-0.38449,1.198775


# Getitem ([])

In [26]:
df[["A", "B", "C"]]

Unnamed: 0,A,B,C
2013-01-01,1.403103,0.805152,-0.273527
2013-01-02,-1.016615,1.690678,0.330468
2013-01-03,0.060715,2.849912,-1.200424
2013-01-04,1.572195,-1.235836,0.554144
2013-01-05,-2.028985,1.236605,-0.628177
2013-01-06,0.369782,-0.719802,0.583105


In [27]:
df["A"]

2013-01-01    1.403103
2013-01-02   -1.016615
2013-01-03    0.060715
2013-01-04    1.572195
2013-01-05   -2.028985
2013-01-06    0.369782
Freq: D, Name: A, dtype: float64

DataFrame Slice : selects matching rows:

In [28]:
df[0:4]

Unnamed: 0,A,B,C,D,F
2013-01-01,1.403103,0.805152,-0.273527,0.801805,-2.131466
2013-01-02,-1.016615,1.690678,0.330468,-0.250786,-0.320263
2013-01-03,0.060715,2.849912,-1.200424,-0.38449,1.198775
2013-01-04,1.572195,-1.235836,0.554144,0.632769,0.699379


In [29]:
df["20130102":"20130104"]

Unnamed: 0,A,B,C,D,F
2013-01-02,-1.016615,1.690678,0.330468,-0.250786,-0.320263
2013-01-03,0.060715,2.849912,-1.200424,-0.38449,1.198775
2013-01-04,1.572195,-1.235836,0.554144,0.632769,0.699379


Selection by label. Selection by Label using DataFrame.loc() or DataFrame.at().

Selecting a row matching a label:

In [30]:
df.loc[dates[0]]

A    1.403103
B    0.805152
C   -0.273527
D    0.801805
F   -2.131466
Name: 2013-01-01 00:00:00, dtype: float64

Selecting all rows (:) with a select column labels:

In [31]:
df.loc[:, ['A', 'B']]

Unnamed: 0,A,B
2013-01-01,1.403103,0.805152
2013-01-02,-1.016615,1.690678
2013-01-03,0.060715,2.849912
2013-01-04,1.572195,-1.235836
2013-01-05,-2.028985,1.236605
2013-01-06,0.369782,-0.719802


For label slicing, both endpoints are included:

In [32]:
df.loc["20130102":"20130104", ["A", "B"]]

Unnamed: 0,A,B
2013-01-02,-1.016615,1.690678
2013-01-03,0.060715,2.849912
2013-01-04,1.572195,-1.235836


Selecting a single row and column label returns a scalar:

In [33]:
df.loc[dates[0], "A"]

1.4031030706457839

Selecting a single row and column label returns a scalar:

In [34]:
df.at[dates[1], "A"]

-1.016614602928114

# Selection by position

In [35]:
df.iloc[0:4]

Unnamed: 0,A,B,C,D,F
2013-01-01,1.403103,0.805152,-0.273527,0.801805,-2.131466
2013-01-02,-1.016615,1.690678,0.330468,-0.250786,-0.320263
2013-01-03,0.060715,2.849912,-1.200424,-0.38449,1.198775
2013-01-04,1.572195,-1.235836,0.554144,0.632769,0.699379


In [36]:
df.iloc[3]

A    1.572195
B   -1.235836
C    0.554144
D    0.632769
F    0.699379
Name: 2013-01-04 00:00:00, dtype: float64

In [37]:
df.iloc[3:5, 0:2]

Unnamed: 0,A,B
2013-01-04,1.572195,-1.235836
2013-01-05,-2.028985,1.236605


Lists of integer position locations:

In [38]:
df.iloc[[1, 2, 4], [0, 2]]

Unnamed: 0,A,C
2013-01-02,-1.016615,0.330468
2013-01-03,0.060715,-1.200424
2013-01-05,-2.028985,-0.628177


For slicing rows explicitly:

In [39]:
df.iloc[1:3, :]

Unnamed: 0,A,B,C,D,F
2013-01-02,-1.016615,1.690678,0.330468,-0.250786,-0.320263
2013-01-03,0.060715,2.849912,-1.200424,-0.38449,1.198775


For slicing columns explicitly:

In [40]:
df.iloc[:, 1:3]

Unnamed: 0,B,C
2013-01-01,0.805152,-0.273527
2013-01-02,1.690678,0.330468
2013-01-03,2.849912,-1.200424
2013-01-04,-1.235836,0.554144
2013-01-05,1.236605,-0.628177
2013-01-06,-0.719802,0.583105


For getting a value explicitly:

In [41]:
df.iat[1, 1]

1.6906777705792295

In [42]:
df.iat[1, 2]

0.3304679132091879

# Boolean indexing
Select rows where df.A is greater than 0

In [43]:
df[df["A"] > 0]

Unnamed: 0,A,B,C,D,F
2013-01-01,1.403103,0.805152,-0.273527,0.801805,-2.131466
2013-01-03,0.060715,2.849912,-1.200424,-0.38449,1.198775
2013-01-04,1.572195,-1.235836,0.554144,0.632769,0.699379
2013-01-06,0.369782,-0.719802,0.583105,-0.918988,-0.083442


Selecting values from a DataFrame where a boolean condition is met:

In [44]:
df[df > 0.5]

Unnamed: 0,A,B,C,D,F
2013-01-01,1.403103,0.805152,,0.801805,
2013-01-02,,1.690678,,,
2013-01-03,,2.849912,,,1.198775
2013-01-04,1.572195,,0.554144,0.632769,0.699379
2013-01-05,,1.236605,,2.016544,
2013-01-06,,,0.583105,,


Using isin() method for filtering:

In [45]:
dfcopy = df.copy()
dfcopy["E"] = ["one", "one", "two", "three", "four", "three"]
dfcopy

Unnamed: 0,A,B,C,D,F,E
2013-01-01,1.403103,0.805152,-0.273527,0.801805,-2.131466,one
2013-01-02,-1.016615,1.690678,0.330468,-0.250786,-0.320263,one
2013-01-03,0.060715,2.849912,-1.200424,-0.38449,1.198775,two
2013-01-04,1.572195,-1.235836,0.554144,0.632769,0.699379,three
2013-01-05,-2.028985,1.236605,-0.628177,2.016544,-0.03216,four
2013-01-06,0.369782,-0.719802,0.583105,-0.918988,-0.083442,three


In [46]:
dfcopy[dfcopy["E"].isin(["one", "two", "four"])]


Unnamed: 0,A,B,C,D,F,E
2013-01-01,1.403103,0.805152,-0.273527,0.801805,-2.131466,one
2013-01-02,-1.016615,1.690678,0.330468,-0.250786,-0.320263,one
2013-01-03,0.060715,2.849912,-1.200424,-0.38449,1.198775,two
2013-01-05,-2.028985,1.236605,-0.628177,2.016544,-0.03216,four


Setting a new column automatically aligns the data by the indexes:

In [47]:
s1 = pd.Series(
    [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
    index=pd.date_range("20130101", periods=10)
)
s1

2013-01-01     1
2013-01-02     2
2013-01-03     3
2013-01-04     4
2013-01-05     5
2013-01-06     6
2013-01-07     7
2013-01-08     8
2013-01-09     9
2013-01-10    10
Freq: D, dtype: int64

Setting values by label:

In [48]:
df.at[dates[0], "A"] = 0
df

Unnamed: 0,A,B,C,D,F
2013-01-01,0.0,0.805152,-0.273527,0.801805,-2.131466
2013-01-02,-1.016615,1.690678,0.330468,-0.250786,-0.320263
2013-01-03,0.060715,2.849912,-1.200424,-0.38449,1.198775
2013-01-04,1.572195,-1.235836,0.554144,0.632769,0.699379
2013-01-05,-2.028985,1.236605,-0.628177,2.016544,-0.03216
2013-01-06,0.369782,-0.719802,0.583105,-0.918988,-0.083442


Setting values by position:

In [49]:
df.iat[2, 1] = 0
df

Unnamed: 0,A,B,C,D,F
2013-01-01,0.0,0.805152,-0.273527,0.801805,-2.131466
2013-01-02,-1.016615,1.690678,0.330468,-0.250786,-0.320263
2013-01-03,0.060715,0.0,-1.200424,-0.38449,1.198775
2013-01-04,1.572195,-1.235836,0.554144,0.632769,0.699379
2013-01-05,-2.028985,1.236605,-0.628177,2.016544,-0.03216
2013-01-06,0.369782,-0.719802,0.583105,-0.918988,-0.083442


Setting by assigning with a NumPy array:

In [50]:
df.loc[:, "D"] = np.array([6]  * len(df))
df

Unnamed: 0,A,B,C,D,F
2013-01-01,0.0,0.805152,-0.273527,6.0,-2.131466
2013-01-02,-1.016615,1.690678,0.330468,6.0,-0.320263
2013-01-03,0.060715,0.0,-1.200424,6.0,1.198775
2013-01-04,1.572195,-1.235836,0.554144,6.0,0.699379
2013-01-05,-2.028985,1.236605,-0.628177,6.0,-0.03216
2013-01-06,0.369782,-0.719802,0.583105,6.0,-0.083442


A where operation with setting:

In [51]:
df2 = df.copy()
df2[df2 > 0] = -df2
df2

Unnamed: 0,A,B,C,D,F
2013-01-01,0.0,-0.805152,-0.273527,-6.0,-2.131466
2013-01-02,-1.016615,-1.690678,-0.330468,-6.0,-0.320263
2013-01-03,-0.060715,0.0,-1.200424,-6.0,-1.198775
2013-01-04,-1.572195,-1.235836,-0.554144,-6.0,-0.699379
2013-01-05,-2.028985,-1.236605,-0.628177,-6.0,-0.03216
2013-01-06,-0.369782,-0.719802,-0.583105,-6.0,-0.083442


# Missing data

In [52]:
df1 =  df.reindex(index=dates[0:5], columns=list(df.columns) + ["E"])
df1

Unnamed: 0,A,B,C,D,F,E
2013-01-01,0.0,0.805152,-0.273527,6.0,-2.131466,
2013-01-02,-1.016615,1.690678,0.330468,6.0,-0.320263,
2013-01-03,0.060715,0.0,-1.200424,6.0,1.198775,
2013-01-04,1.572195,-1.235836,0.554144,6.0,0.699379,
2013-01-05,-2.028985,1.236605,-0.628177,6.0,-0.03216,


In [53]:
df1.loc[dates[0] : dates[1], "E"] = 1
df1

Unnamed: 0,A,B,C,D,F,E
2013-01-01,0.0,0.805152,-0.273527,6.0,-2.131466,1.0
2013-01-02,-1.016615,1.690678,0.330468,6.0,-0.320263,1.0
2013-01-03,0.060715,0.0,-1.200424,6.0,1.198775,
2013-01-04,1.572195,-1.235836,0.554144,6.0,0.699379,
2013-01-05,-2.028985,1.236605,-0.628177,6.0,-0.03216,


DataFrame.dropna() drops any rows that have missing data:

In [54]:
df1.dropna(how="any")
df1

Unnamed: 0,A,B,C,D,F,E
2013-01-01,0.0,0.805152,-0.273527,6.0,-2.131466,1.0
2013-01-02,-1.016615,1.690678,0.330468,6.0,-0.320263,1.0
2013-01-03,0.060715,0.0,-1.200424,6.0,1.198775,
2013-01-04,1.572195,-1.235836,0.554144,6.0,0.699379,
2013-01-05,-2.028985,1.236605,-0.628177,6.0,-0.03216,


In [55]:
pd.isna(df1)

Unnamed: 0,A,B,C,D,F,E
2013-01-01,False,False,False,False,False,False
2013-01-02,False,False,False,False,False,False
2013-01-03,False,False,False,False,False,True
2013-01-04,False,False,False,False,False,True
2013-01-05,False,False,False,False,False,True


DataFrame.fillna() fills missing data:

In [56]:
df1.fillna(value=5)

Unnamed: 0,A,B,C,D,F,E
2013-01-01,0.0,0.805152,-0.273527,6.0,-2.131466,1.0
2013-01-02,-1.016615,1.690678,0.330468,6.0,-0.320263,1.0
2013-01-03,0.060715,0.0,-1.200424,6.0,1.198775,5.0
2013-01-04,1.572195,-1.235836,0.554144,6.0,0.699379,5.0
2013-01-05,-2.028985,1.236605,-0.628177,6.0,-0.03216,5.0


isna() gets the boolean mask where values are nan:

In [57]:
pd.isna(df1)

Unnamed: 0,A,B,C,D,F,E
2013-01-01,False,False,False,False,False,False
2013-01-02,False,False,False,False,False,False
2013-01-03,False,False,False,False,False,True
2013-01-04,False,False,False,False,False,True
2013-01-05,False,False,False,False,False,True


# Operations

Stats

In [58]:
df.mean()

A   -0.173818
B    0.296133
C   -0.105735
D    6.000000
F   -0.111530
dtype: float64

In [59]:
df1.mean()

A   -0.282538
B    0.499320
C   -0.243503
D    6.000000
F   -0.117147
E    1.000000
dtype: float64

In [60]:
df2.mean()

A   -0.841382
B   -0.948012
C   -0.594974
D   -6.000000
F   -0.744248
dtype: float64

Calculate the mean value for each row:

In [61]:
df.mean(axis=1)

2013-01-01    0.880032
2013-01-02    1.336854
2013-01-03    1.211813
2013-01-04    1.517976
2013-01-05    0.909457
2013-01-06    1.229929
Freq: D, dtype: float64

Calculate the mean value for each column:

In [62]:
df.mean(axis=0)

A   -0.173818
B    0.296133
C   -0.105735
D    6.000000
F   -0.111530
dtype: float64

In [63]:
s = pd.Series([1, 3, 5, np.nan, 6, 8], index=dates).shift(2)
s

2013-01-01    NaN
2013-01-02    NaN
2013-01-03    1.0
2013-01-04    3.0
2013-01-05    5.0
2013-01-06    NaN
Freq: D, dtype: float64

In [64]:
df.sub(s, axis="index")

Unnamed: 0,A,B,C,D,F
2013-01-01,,,,,
2013-01-02,,,,,
2013-01-03,-0.939285,-1.0,-2.200424,5.0,0.198775
2013-01-04,-1.427805,-4.235836,-2.445856,3.0,-2.300621
2013-01-05,-7.028985,-3.763395,-5.628177,1.0,-5.03216
2013-01-06,,,,,


In [70]:
df

Unnamed: 0,A,B,C,D,F
2013-01-01,0.0,0.805152,-0.273527,6.0,-2.131466
2013-01-02,-1.016615,1.690678,0.330468,6.0,-0.320263
2013-01-03,0.060715,0.0,-1.200424,6.0,1.198775
2013-01-04,1.572195,-1.235836,0.554144,6.0,0.699379
2013-01-05,-2.028985,1.236605,-0.628177,6.0,-0.03216
2013-01-06,0.369782,-0.719802,0.583105,6.0,-0.083442


# User defined functions

In [68]:
df.agg(lambda x: np.mean(x) * 5.6)

A    -0.973380
B     1.658343
C    -0.592117
D    33.600000
F    -0.624566
dtype: float64

In [69]:
df.transform(lambda x: x * 101.2)

Unnamed: 0,A,B,C,D,F
2013-01-01,0.0,81.481337,-27.680939,607.2,-215.704376
2013-01-02,-102.881398,171.09659,33.443353,607.2,-32.410623
2013-01-03,6.144376,0.0,-121.482953,607.2,121.316076
2013-01-04,159.10611,-125.066597,56.079416,607.2,70.777129
2013-01-05,-205.333242,125.144393,-63.571527,607.2,-3.254612
2013-01-06,37.421905,-72.843964,59.010265,607.2,-8.444359


Value Counts

In [71]:
s = pd.Series(np.random.randint(0, 7, size=10))
s

0    3
1    3
2    0
3    5
4    0
5    5
6    4
7    3
8    6
9    4
dtype: int64

In [72]:
s.value_counts()

3    3
0    2
5    2
4    2
6    1
Name: count, dtype: int64

String Methods

In [74]:
s = pd.Series(["A", "B", "C", "Aaba", np.nan, "CABA", "dog", "cat"])
s

0       A
1       B
2       C
3    Aaba
4     NaN
5    CABA
6     dog
7     cat
dtype: object

In [77]:
s.str.lower()

0       a
1       b
2       c
3    aaba
4     NaN
5    caba
6     dog
7     cat
dtype: object

Merge and Concat

Concatenating pandas objects together row-wise with concat():

In [78]:
df =pd.DataFrame(np.random.randn(10, 4))
df

Unnamed: 0,0,1,2,3
0,0.664658,-0.905743,-0.150523,0.062308
1,0.752049,0.989725,-0.931056,2.750118
2,-0.215457,-0.354555,-1.644645,-0.15872
3,1.216876,1.039777,0.453409,0.141892
4,0.633863,-0.51049,0.917145,0.265762
5,-0.279847,-1.211219,1.716968,1.108839
6,-0.114164,-0.118729,0.547752,1.39932
7,0.707926,0.267899,0.011049,0.22721
8,-1.640322,-0.609495,-0.20936,0.554267
9,0.330763,-0.836026,1.722202,-0.474782


In [83]:
pieces = [df[:3], df[3:7], df[7:]]
pieces

[          0         1         2         3
 0  0.664658 -0.905743 -0.150523  0.062308
 1  0.752049  0.989725 -0.931056  2.750118
 2 -0.215457 -0.354555 -1.644645 -0.158720,
           0         1         2         3
 3  1.216876  1.039777  0.453409  0.141892
 4  0.633863 -0.510490  0.917145  0.265762
 5 -0.279847 -1.211219  1.716968  1.108839
 6 -0.114164 -0.118729  0.547752  1.399320,
           0         1         2         3
 7  0.707926  0.267899  0.011049  0.227210
 8 -1.640322 -0.609495 -0.209360  0.554267
 9  0.330763 -0.836026  1.722202 -0.474782]

In [84]:
pd.concat(pieces)

Unnamed: 0,0,1,2,3
0,0.664658,-0.905743,-0.150523,0.062308
1,0.752049,0.989725,-0.931056,2.750118
2,-0.215457,-0.354555,-1.644645,-0.15872
3,1.216876,1.039777,0.453409,0.141892
4,0.633863,-0.51049,0.917145,0.265762
5,-0.279847,-1.211219,1.716968,1.108839
6,-0.114164,-0.118729,0.547752,1.39932
7,0.707926,0.267899,0.011049,0.22721
8,-1.640322,-0.609495,-0.20936,0.554267
9,0.330763,-0.836026,1.722202,-0.474782
