#### Head and Tail

In [6]:
import pandas as pd
import numpy as np

# Shared fixtures for the examples below: eight consecutive dates, a
# five-element labelled Series, and an 8x3 DataFrame indexed by those dates.
index = pd.date_range(start="1/1/2000", periods=8)

# The Series is drawn before the frame so the global RNG is consumed in the same order.
s = pd.Series(data=np.random.randn(5), index=list("abcde"))

df = pd.DataFrame(
    data=np.random.randn(8, 3),
    index=index,
    columns=list("ABC"),
)

In [3]:
# Build a 100-element Series of random normal draws and preview its first five rows.
long_series = pd.Series(data=np.random.randn(100))
long_series.head(n=5)

0    1.276531
1   -0.056231
2    0.050646
3   -0.476657
4    0.234708
dtype: float64

In [4]:
# Show only the last three rows of the Series.
long_series.tail(3)

97   -1.062238
98    0.610129
99    0.572050
dtype: float64

#### Attributes and underlying data

- **Shape** gives the axis dimensions of the object, consistent with ndarray
- **Axis labels**
    - **Series** index (only axis)
    - **DataFrame** index (rows) and columns

In [8]:
# Note: these attributes can be safely assigned to!
# Preview the first two rows before the column labels are reassigned.
df[:2]

Unnamed: 0,A,B,C
2000-01-01,0.756011,0.888648,0.229164
2000-01-02,-0.696717,-1.264951,0.683883


In [10]:
# Replace the column labels with their lower-case equivalents.
df.columns = list(map(str.lower, df.columns))

In [11]:
# Display the whole frame; the column labels are now lower-case.
df

Unnamed: 0,a,b,c
2000-01-01,0.756011,0.888648,0.229164
2000-01-02,-0.696717,-1.264951,0.683883
2000-01-03,-0.209919,0.388207,-1.59165
2000-01-04,0.525379,-0.469444,0.841007
2000-01-05,-0.396997,0.211629,-1.058407
2000-01-06,0.375643,-0.179265,0.128292
2000-01-07,0.804397,-0.682044,1.801211
2000-01-08,0.268061,0.758027,-0.822169


In [12]:
# .array returns the values backing the Series (shown below as a PandasArray).
s.array

<PandasArray>
[ -0.7887186777971024, -0.38873801668116653,   0.2967024791019257,
  -1.5706427483021568,  -0.9583127272518366]
Length: 5, dtype: float64

In [13]:
# The index exposes its labels through the same .array attribute.
s.index.array

<PandasArray>
['a', 'b', 'c', 'd', 'e']
Length: 5, dtype: object

In [14]:
# to_numpy() returns the values as a NumPy ndarray.
s.to_numpy()

array([-0.78871868, -0.38873802,  0.29670248, -1.57064275, -0.95831273])

In [15]:
# np.asarray() on the Series gives the same values as s.to_numpy().
np.asarray(s)

array([-0.78871868, -0.38873802,  0.29670248, -1.57064275, -0.95831273])

#### Accelerated Operations

Pandas has support for accelerating certain types of binary numerical and boolean operations using the **numexpr** and **bottleneck** libraries.

- **numexpr** uses smart chunking, caching and multiple cores
- **bottleneck** is a set of specialized Cython routines that are especially fast when dealing with arrays that have NaNs

#### Missing data/ operations with fill values

In [25]:
# Blank out every column of the first row so the following cells have missing data to work with.
df.iloc[0:1, :] = np.nan

In [30]:
# Fill missing entries with the number 0 (the result is a new frame; df itself
# is not modified). Passing the string '0' instead would put text into
# otherwise numeric columns, so they would no longer hold plain floats.
df.fillna(0)

Unnamed: 0,a,b,c
2000-01-01,0.0,0.0,0.0
2000-01-02,-0.696717,-1.26495,0.683883
2000-01-03,-0.209919,0.388207,-1.59165
2000-01-04,0.525379,-0.469444,0.841007
2000-01-05,-0.396997,0.211629,-1.05841
2000-01-06,0.375643,-0.179265,0.128292
2000-01-07,0.804397,-0.682044,1.80121
2000-01-08,0.268061,0.758027,-0.822169


#### Flexible comparisons

Series and DataFrame have the binary comparison methods eq, ne, lt, gt, le, and ge

#### Boolean reductions
You can apply the reductions `empty`, `any()`, `all()` and `bool()`.

In [38]:
# Per column: is at least one value positive?
(df>0).any()

a    True
b    True
c    True
dtype: bool

In [39]:
# Reduce the per-column result once more: is any value in the whole frame positive?
(df>0).any().any()

True

In [40]:
# empty is an attribute, not a method call; the frame holds data, so it is False.
df.empty

False

In [41]:
# bool() gives the truth value of a single-element boolean Series.
# NOTE(review): Series.bool() appears to be deprecated in newer pandas releases — confirm against the version in use.
pd.Series([True]).bool()

True

In [42]:
# The same reduction applied to a single False element.
pd.Series([False]).bool()

False

In [44]:
# Compare two frames element by element. The first row was set to NaN earlier,
# and NaN never compares equal to NaN, so that row is False throughout.
df + df == df *2

Unnamed: 0,a,b,c
2000-01-01,False,False,False
2000-01-02,True,True,True
2000-01-03,True,True,True
2000-01-04,True,True,True
2000-01-05,True,True,True
2000-01-06,True,True,True
2000-01-07,True,True,True
2000-01-08,True,True,True


In [45]:
# NaN does not compare equal to itself.
np.nan == np.nan

False

In [46]:
# equals() tests whole-object equality and counts NaNs in matching positions as equal.
(df + df).equals(df *2)

True

In [47]:
# Two single-column frames holding the same label/value pairs; df2 lists its
# rows in the opposite label order.
df1 = pd.DataFrame(data={"col": ["foo", 0, np.nan]})
df2 = pd.DataFrame(
    data={"col": [np.nan, 0, "foo"]},
    index=[2, 1, 0],
)
# The stored row order differs, so the two frames are not considered equal here.
df1.equals(df2)

False

In [49]:
# Once df2 is sorted by index its rows line up with df1 and the frames compare equal.
df1.equals(df2.sort_index())

True

In [50]:
# Comparing against a scalar checks every element of the Series against it.
pd.Series(["foo", "bar", "bax"]) == "foo"

0     True
1    False
2    False
dtype: bool

In [51]:
# The same scalar comparison on an Index returns a plain boolean ndarray.
pd.Index(["foo", "bar", "baz"]) == "foo"

array([ True, False, False])

In [52]:
# Element-wise comparison between a Series and an Index of the same length.
pd.Series(["foo", "bar", "baz"]) == pd.Index(["foo", "bar", "qux"])

0     True
1     True
2    False
dtype: bool

In [53]:
# Element-wise comparison between a Series and a NumPy array of the same length.
pd.Series(["foo", "bar", "baz"]) == np.array(["foo", "bar", "qux"])

0     True
1     True
2    False
dtype: bool

In [56]:
# pd.Series(['foo', 'bar', 'baz']) == pd.Series(['foo', 'bar'])
# ValueError: Series lengths must match to compare

# Note that this is different from the NumPy behavior, where a comparison can be broadcast:
np.array([1, 2, 3]) == np.array([2])


array([False,  True, False])

In [58]:
# These two shapes cannot be broadcast against each other; the recorded result
# is a single False rather than an element-wise array.
# NOTE(review): newer NumPy releases may raise here instead — confirm against the version in use.
np.array([1, 2, 3]) == np.array([1, 2])

  np.array([1, 2, 3]) == np.array([1, 2])


False

### Descriptive Statistics

- **Series** no axis argument needed
- **DataFrame** "index" (axis=0, default), "columns"(axis=1)

In [59]:
# Current state of df: the first row is NaN after the earlier iloc assignment.
df

Unnamed: 0,a,b,c
2000-01-01,,,
2000-01-02,-0.696717,-1.264951,0.683883
2000-01-03,-0.209919,0.388207,-1.59165
2000-01-04,0.525379,-0.469444,0.841007
2000-01-05,-0.396997,0.211629,-1.058407
2000-01-06,0.375643,-0.179265,0.128292
2000-01-07,0.804397,-0.682044,1.801211
2000-01-08,0.268061,0.758027,-0.822169


In [60]:
# Cumulative sum down each column; the NaN first row stays NaN in the result.
df.cumsum()

Unnamed: 0,a,b,c
2000-01-01,,,
2000-01-02,-0.696717,-1.264951,0.683883
2000-01-03,-0.906636,-0.876744,-0.907766
2000-01-04,-0.381257,-1.346188,-0.06676
2000-01-05,-0.778254,-1.13456,-1.125167
2000-01-06,-0.402611,-1.313824,-0.996874
2000-01-07,0.401786,-1.995869,0.804337
2000-01-08,0.669847,-1.237841,-0.017833


| Functions        | Description |  
| :------------- |:------------- |
| count     | Number of non-NA observations |
| sum | Sum of values |
| mean | Mean of values |
| mad | Mean absolute deviation |
| median | Arithmetic median of values |
| min | Minimum |
| max | Maximum |
| mode | Mode |
| abs | Absolute Value |
| prod | Product of values |
| std | Bessel-corrected sample standard deviation |
| var | Unbiased variance |
| sem | Standard error of the mean |
| skew | Sample skewness (3rd moment)|
| kurt | Sample kurtosis (4th moment)|
| quantile | Sample quantile (value at %) |
| cumsum | Cumulative sum |
| cumprod | Cumulative Product |
| cummax| Cumulative maximum |
| cummin | Cumulative minimum |

    

**Series.nunique()** will return the number of unique non-NA values in a Series

#### Summarizing data: describe

In [62]:
# 1000 random normal draws, with every second entry (positions 0, 2, 4, ...) replaced by NaN.
series = pd.Series(data=np.random.randn(1000))
series.iloc[::2] = np.nan
# describe() summarises only the non-missing values, hence the count of 500.
series.describe()

count    500.000000
mean       0.056724
std        1.022762
min       -3.074200
25%       -0.659944
50%        0.077540
75%        0.744743
max        3.009486
dtype: float64

In [65]:
# A 1000x5 frame of random normal draws with every second row replaced by NaN.
frame = pd.DataFrame(
    data=np.random.randn(1000, 5),
    columns=list("abcde"),
)
frame.iloc[::2, :] = np.nan
# Each column is summarised independently over its 500 remaining values.
frame.describe()

Unnamed: 0,a,b,c,d,e
count,500.0,500.0,500.0,500.0,500.0
mean,-0.010313,0.043081,-0.112117,0.027214,0.026839
std,1.067429,1.016523,0.959228,1.036923,1.03313
min,-2.828518,-3.606573,-3.596823,-3.529434,-3.179617
25%,-0.734314,-0.679049,-0.805747,-0.602474,-0.680054
50%,-0.016794,0.05688,-0.091124,0.050236,0.005555
75%,0.702736,0.74707,0.556082,0.679891,0.704266
max,3.572936,2.799907,3.111354,3.435531,2.899252


In [66]:
# Select which percentiles to report; the median (50%) still appears in the output.
series.describe(percentiles=[0.05, 0.25, 0.75, 0.95])

count    500.000000
mean       0.056724
std        1.022762
min       -3.074200
5%        -1.621950
25%       -0.659944
50%        0.077540
75%        0.744743
95%        1.756154
max        3.009486
dtype: float64

In [68]:
# For non-numeric data describe() reports count, unique, top and freq.
# Note: this rebinds `s`, replacing the numeric Series used earlier.
s = pd.Series(
    data=["a", "a", "b", "b", "a", "a", np.nan, "c", "d", "a"],
)
s.describe()

count     9
unique    4
top       a
freq      5
dtype: object

In [72]:
# A small mixed-type frame: one string column and one integer column.
frame = pd.DataFrame(
    data={
        "a": ["Yes", "Yes", "No", "No"],
        "b": range(4),
    }
)
# With no arguments, describe() on this frame summarises only the numeric column.
frame.describe()

Unnamed: 0,b
count,4.0
mean,1.5
std,1.290994
min,0.0
25%,0.75
50%,1.5
75%,2.25
max,3.0


In [70]:
# Restrict the summary to object (string) columns.
frame.describe(include=["object"])

Unnamed: 0,a
count,4
unique,2
top,Yes
freq,2


In [73]:
# Restrict the summary to numeric columns.
frame.describe(include=["number"])

Unnamed: 0,b
count,4.0
mean,1.5
std,1.290994
min,0.0
25%,0.75
50%,1.5
75%,2.25
max,3.0


In [74]:
# Summarise every column regardless of type; statistics that do not apply to a column are left empty.
frame.describe(include="all")

Unnamed: 0,a,b
count,4,4.0
unique,2,
top,Yes,
freq,2,
mean,,1.5
std,,1.290994
min,,0.0
25%,,0.75
50%,,1.5
75%,,2.25


#### Index of min/max values

In [81]:
# idxmin() and idxmax() on Series and DataFrame return the index labels of the
# minimum and maximum values respectively.

s1 = pd.Series(data=np.random.randn(5))
(s1.idxmin(), s1.idxmax())

(4, 3)

In [82]:
# A 5x3 frame of random normal draws used for the idxmin/idxmax examples (rebinds df1).
df1 = pd.DataFrame(
    data=np.random.randn(5, 3),
    columns=list("ABC"),
)
df1

Unnamed: 0,A,B,C
0,-0.217985,-0.509143,-1.155588
1,-0.484041,1.751487,-1.879526
2,1.154208,-1.64298,1.152199
3,1.923343,-1.090456,-0.242794
4,-1.45049,-1.502287,-0.361095


In [83]:
# For each column, the row label of its smallest value.
df1.idxmin(axis=0)

A    4
B    2
C    1
dtype: int64

In [84]:
# For each row, the column label of its largest value.
df1.idxmax(axis=1)

0    A
1    B
2    A
3    A
4    C
dtype: object

- idxmin and idxmax are called argmin and argmax in NumPy

#### Value counts (histogramming)/ mode

In [86]:
# Fifty random integers from 0 through 6 (the upper bound of 7 is exclusive).
data = np.random.randint(low=0, high=7, size=50)
data

array([1, 4, 1, 2, 3, 3, 2, 5, 5, 0, 5, 3, 5, 4, 0, 0, 2, 1, 3, 4, 4, 2,
       4, 2, 6, 2, 1, 6, 6, 3, 1, 0, 0, 0, 5, 3, 1, 5, 1, 0, 6, 1, 0, 4,
       0, 3, 6, 4, 1, 5])

In [87]:
# Wrap the integer array in a Series (this rebinds `s` once more).
s=pd.Series(data)

In [88]:
# Count how often each distinct value occurs, most frequent first.
s.value_counts()

1    9
0    9
5    7
4    7
3    7
2    6
6    5
dtype: int64

In [89]:
# The top-level function produces the same counts directly from the ndarray.
# NOTE(review): pd.value_counts appears to be deprecated in newer pandas releases — confirm against the version in use.
pd.value_counts(data)

1    9
0    9
5    7
4    7
3    7
2    6
6    5
dtype: int64

In [99]:
# The value_counts method can be used to count combinations across multiple columns.

# Note: `data` is rebound here from the integer array to a dict of columns,
# and `frame` is rebound to the frame built from it.
data={"a": [1, 2, 3, 4], "b": ["x","x","y","y"]}
frame = pd.DataFrame(data)
# DataFrame.value_counts() is new in pandas 1.1.0, so the call is left commented out:
# frame.value_counts()

In [100]:
# mode() returns every value tied for most frequent; here 3 and 7 each occur three times.
s5 = pd.Series(data=[1, 1, 3, 3, 3, 5, 5, 7, 7, 7])
s5.mode()

0    3
1    7
dtype: int64

#### Discretization and quantiling

In [101]:
# Continuous values can be discretized using the cut() (bins based on values)
# and qcut() (bins based on sample quantiles) functions:
arr = np.random.randn(20)
arr

array([-0.04787572,  0.19976485, -1.86348071,  1.35890545,  1.45568526,
       -0.37550015, -0.89017827,  1.09192389,  0.83077009, -1.93937186,
        1.61628835,  0.98398171,  1.23894922, -0.44915946,  3.34038984,
        0.8770831 ,  0.54767982, -1.46789201,  0.01565421,  0.38534173])

In [104]:
# An integer `bins` asks cut() to split the observed value range into that many intervals.
factor = pd.cut(arr, bins=4)
factor

[(-0.619, 0.701], (-0.619, 0.701], (-1.945, -0.619], (0.701, 2.02], (0.701, 2.02], ..., (0.701, 2.02], (-0.619, 0.701], (-1.945, -0.619], (-0.619, 0.701], (-0.619, 0.701]]
Length: 20
Categories (4, interval[float64]): [(-1.945, -0.619] < (-0.619, 0.701] < (0.701, 2.02] < (2.02, 3.34]]

In [106]:
# A list of edges gives explicit bin boundaries instead of a bin count.
factor = pd.cut(arr, bins=[-5, -1, 0, 1, 5])
factor

[(-1, 0], (0, 1], (-5, -1], (1, 5], (1, 5], ..., (0, 1], (0, 1], (-5, -1], (0, 1], (0, 1]]
Length: 20
Categories (4, interval[int64]): [(-5, -1] < (-1, 0] < (0, 1] < (1, 5]]

In [114]:
# qcut() computes sample quantiles.
# For example, we could slice up some normally distributed data into
# equal-size quartiles:
arr = np.random.randn(30)

In [117]:
# Quantile cut points at 0, 25, 50, 75 and 100 percent produce four quartile bins.
factor = pd.qcut(arr, q=[0, 0.25, 0.5, 0.75, 1])
factor

[(-0.212, 0.695], (0.695, 2.544], (0.695, 2.544], (0.695, 2.544], (-0.863, -0.212], ..., (-0.212, 0.695], (-0.863, -0.212], (-1.708, -0.863], (-0.212, 0.695], (-0.212, 0.695]]
Length: 30
Categories (4, interval[float64]): [(-1.708, -0.863] < (-0.863, -0.212] < (-0.212, 0.695] < (0.695, 2.544]]