In [1]:
import pandas as pd
import numpy as np

In [2]:
# Five standard-normal draws, labelled "a" through "e".
s = pd.Series(
    data=np.random.randn(5),
    index=["a", "b", "c", "d", "e"],
)

In [3]:
# Boolean-mask selection: keep only the entries greater than 0.5.
s.loc[s.gt(0.5)]

a    0.686408
c    0.701709
dtype: float64

In [4]:
# Pick the 3rd and 5th elements by position.
# .iloc is the explicit positional indexer; plain s[[2, 4]] on a
# string-labelled Series only works through a deprecated fallback in
# Series.__getitem__ that treats integer keys as positions.
s.iloc[[2, 4]]

c    0.701709
e   -0.665657
dtype: float64

In [5]:
# NumPy ufuncs accept a Series directly; the result keeps the a..e labels.
np.exp(s)

a    1.986568
b    0.228883
c    2.017197
d    0.984710
e    0.513936
dtype: float64

In [6]:
# Label-based lookup of a single element returns a scalar.
s.loc["a"]

0.6864083920883969

In [7]:
# Looking up a missing label with s["f"] raises KeyError.
# Series.get returns the supplied default instead of raising.
s.get("f", default=np.nan)

nan

In [22]:
# A 5x2 frame of uniform random samples with default integer row/column labels.
s2 = pd.DataFrame(data=np.random.rand(5, 2))
s2

Unnamed: 0,0,1
0,0.218118,0.415615
1,0.046772,0.743838
2,0.163203,0.208114
3,0.557837,0.519554
4,0.214126,0.081832


# From dict of Series or dicts

The resulting **index** will be the **union** of the indexes of the various Series. If there are any nested dicts, these will first be converted to Series. If no columns are passed, the columns will be the ordered list of dict keys.

In [54]:
# Two Series with overlapping but unequal indexes. The frame's row index is
# their union, so column "one" has no value for row "d" (shown as NaN).
d = dict(
    one=pd.Series(np.random.randn(3), index=["a", "b", "c"]),
    two=pd.Series(np.random.randn(4), index=["a", "b", "c", "d"]),
)

df = pd.DataFrame(data=d)
df

Unnamed: 0,one,two
a,-0.732758,0.72716
b,0.827219,0.63847
c,0.223872,0.028091
d,,-0.972016


In [16]:
# An explicit index selects and orders the rows taken from the Series in d.
pd.DataFrame(data=d, index=["d", "b", "a"])

Unnamed: 0,one,two
d,,0.6656
b,0.618542,-1.215648
a,0.731354,-0.434867


In [19]:
# "three" is not a key of d, so that column comes back entirely empty (NaN).
pd.DataFrame(
    data=d,
    index=["d", "a"],
    columns=["two", "three"],
)

Unnamed: 0,two,three
d,0.6656,
a,-0.434867,


In [20]:
# Row labels of df.
df.index

Index(['a', 'b', 'c', 'd'], dtype='object')

In [21]:
# Column labels of df.
df.columns

Index(['one', 'two'], dtype='object')

### Note: When passing two or more lists or ndarrays, make sure they all have the same length; otherwise pandas raises a `ValueError`.

In [28]:
# Dict of equal-length ndarrays: one column per key, default integer row index.
pd.DataFrame(
    data={
        "one": np.random.rand(3),
        "two": np.random.rand(3),
    }
)

Unnamed: 0,one,two
0,0.23792,0.036702
1,0.994622,0.635829
2,0.300992,0.114898


In [53]:
# List of row dicts: a key missing from a row ("c" in the first) becomes NaN.
data2 = [
    {"a": 1, "b": 2},
    {"a": 5, "b": 10, "c": 20},
]
pd.DataFrame(data=data2, index=["first", "second"])

Unnamed: 0,a,b,c
first,1,2,
second,5,10,20.0


In [55]:
# Add a boolean column flagging negative values of "one", placed at position 1.
# DataFrame.insert raises ValueError when the column already exists, so a
# plain insert fails if this cell is run a second time. Refresh the column in
# place on re-runs and only insert on the first run.
if "negative" in df.columns:
    df["negative"] = df["one"] < 0
else:
    df.insert(1, "negative", df["one"] < 0)

In [56]:
# Tip: [fn for fn in dir(df) if not fn.startswith("_")] lists df's public attributes.

# Derived column: ten times column "one" (NaN stays NaN).
df["three"] = df["one"].mul(10)
df

Unnamed: 0,one,negative,two,three
a,-0.732758,True,0.72716,-7.327579
b,0.827219,False,0.63847,8.272193
c,0.223872,False,0.028091,2.238715
d,,False,-0.972016,


In [61]:
# DataFrame.assign returns a copy of df with the extra column added.
df.assign(ratio=lambda frame: frame["two"].div(frame["one"]).sub(5))

Unnamed: 0,one,negative,two,three,ratio
a,-0.732758,True,0.72716,-7.327579,-5.99236
b,0.827219,False,0.63847,8.272193,-4.228173
c,0.223872,False,0.028091,2.238715,-4.874524
d,,False,-0.972016,,


In [62]:
# Print the built-in help text for DataFrame.query.
help(df.query)

Help on method query in module pandas.core.frame:

query(expr: 'str', inplace: 'bool' = False, **kwargs) method of pandas.core.frame.DataFrame instance
    Query the columns of a DataFrame with a boolean expression.
    
    Parameters
    ----------
    expr : str
        The query string to evaluate.
    
        You can refer to variables
        in the environment by prefixing them with an '@' character like
        ``@a + b``.
    
        You can refer to column names that are not valid Python variable names
        by surrounding them in backticks. Thus, column names containing spaces
        or punctuations (besides underscores) or starting with digits must be
        surrounded by backticks. (For example, a column named "Area (cm^2)" would
        be referenced as ```Area (cm^2)```). Column names which are Python keywords
        (like "list", "for", "import", etc) cannot be used.
    
        For example, if one of your columns is called ``a a`` and you want
        to sum it

In [65]:
# Keep the rows where the boolean column "negative" is True, then add two/three.
(
    df
    .query("negative")
    .assign(cool=lambda frame: frame["two"].div(frame["three"]))
)

Unnamed: 0,one,negative,two,three,cool
a,-0.732758,True,0.72716,-7.327579,-0.099236


In [86]:
# Label slices include both endpoints: rows "b" through "d" and every column
# from "one" through "two" (which also covers "negative" in between).
df.loc[slice("b", "d"), slice("one", "two")]

Unnamed: 0,one,negative,two
b,0.827219,False,0.63847
c,0.223872,False,0.028091
d,,False,-0.972016


In [88]:
# Swap rows and columns.
df.transpose()

Unnamed: 0,a,b,c,d
one,-0.732758,0.827219,0.223872,
negative,True,False,False,False
two,0.72716,0.63847,0.028091,-0.972016
three,-7.327579,8.272193,2.238715,


In [89]:
# With no buffer given, to_string returns the rendered table as a str,
# which the notebook displays as a repr with "\n" escapes.
df.to_string(buf=None)

'        one  negative       two     three\na -0.732758      True  0.727160 -7.327579\nb  0.827219     False  0.638470  8.272193\nc  0.223872     False  0.028091  2.238715\nd       NaN     False -0.972016       NaN'

In [107]:
# Five consecutive calendar days starting 2022-01-01, used as the row index
# of a 5x3 frame of uniform random samples with columns a, b, c.
dates = pd.date_range(start="20220101", periods=5)
date_df = pd.DataFrame(
    data=np.random.rand(5, 3),
    index=dates,
    columns=list("abc"),
)
date_df

Unnamed: 0,a,b,c
2022-01-01,0.283531,0.562453,0.470664
2022-01-02,0.545342,0.335055,0.234532
2022-01-03,0.989477,0.600014,0.370401
2022-01-04,0.897044,0.487149,0.336388
2022-01-05,0.258644,0.322419,0.197952


In [108]:
# Print a summary of the frame: index type and range, per-column non-null
# counts and dtypes, and memory usage.
date_df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 5 entries, 2022-01-01 to 2022-01-05
Freq: D
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   a       5 non-null      float64
 1   b       5 non-null      float64
 2   c       5 non-null      float64
dtypes: float64(3)
memory usage: 160.0 bytes


In [109]:
# Last two rows of the frame.
date_df.tail(n=2)

Unnamed: 0,a,b,c
2022-01-04,0.897044,0.487149,0.336388
2022-01-05,0.258644,0.322419,0.197952


In [110]:
# Order the columns by label, descending (c, b, a); rows are left as they are.
date_df.sort_index(axis="columns", ascending=False)

Unnamed: 0,c,b,a
2022-01-01,0.470664,0.562453,0.283531
2022-01-02,0.234532,0.335055,0.545342
2022-01-03,0.370401,0.600014,0.989477
2022-01-04,0.336388,0.487149,0.897044
2022-01-05,0.197952,0.322419,0.258644


In [114]:
# Keep only the rows whose "c" value exceeds 0.25.
date_df.loc[date_df["c"].gt(0.25)]

Unnamed: 0,a,b,c
2022-01-01,0.283531,0.562453,0.470664
2022-01-03,0.989477,0.600014,0.370401
2022-01-04,0.897044,0.487149,0.336388


In [125]:
# Missing-data helpers: dropna, fillna, isna.
#   df.dropna(axis=0)  -> drop the rows that contain NaN
#   df.fillna(0.5)     -> replace NaN with 0.5
# isna flags every missing cell with True.
pd.isna(df)

Unnamed: 0,one,negative,two,three
a,False,False,False,False
b,False,False,False,False
c,False,False,False,False
d,True,False,False,True


In [130]:
# apply: call a function on each column (axis=0, the default).
date_df.apply(np.sum, axis=0)

a    2.974038
b    2.307090
c    1.609936
dtype: float64

In [132]:
# Per-column range: largest value minus smallest value.
date_df.apply(lambda col: col.max() - col.min(), axis=0)

a    0.730833
b    0.277595
c    0.272712
dtype: float64