< [Online Version Python Data Science Handbook](https://github.com/jakevdp/PythonDataScienceHandbook)| [Menu](https://)>
# 3. Data Manipulation with Pandas 
* Pandas is a newer package built on top of NumPy, and provides an efficient implementation of a DataFrame.
* In this chapter, we will focus on the mechanics of using **Series**, **DataFrame**, and related structures effectively
* More detailed documentation, along with tutorials and other resources, can be found at http://pandas.pydata.org/.

## Data Indexing and Selection
* Data Selection in Series
* Data Selection in DataFrame

### Data Selection in Series

In [2]:
# Series as dictionary
import pandas as pd

In [5]:
data = pd.Series([0.25, 0.5, 0.75, 1.0],
    index=['a', 'b', 'c', 'd'])
data

a    0.25
b    0.50
c    0.75
d    1.00
dtype: float64

In [6]:
data['b']

0.5

In [7]:
'a' in data

True

In [8]:
data.index
data.keys()

Index(['a', 'b', 'c', 'd'], dtype='object')

In [9]:
list(data.items())

[('a', 0.25), ('b', 0.5), ('c', 0.75), ('d', 1.0)]

In [10]:
data['e'] = 1.25
data

a    0.25
b    0.50
c    0.75
d    1.00
e    1.25
dtype: float64

In [11]:
# Select all values as array
data.values

array([0.25, 0.5 , 0.75, 1.  , 1.25])

In [12]:
# Select rows by python index or explicit index from-to
data[0] 
data[0:3] 
data.loc['a':'c']
data.iloc[0:3]

a    0.25
b    0.50
c    0.75
dtype: float64

In [13]:
# Select rows by explicit index
data[['a', 'e']]

a    0.25
e    1.25
dtype: float64

In [14]:
# Select rows by value
data[(data > 0.3) & (data < 0.8)] 

b    0.50
c    0.75
dtype: float64

In [15]:
# Select row value by python index or explicit index
data['c']
data.loc['c']
data.iloc[2]

0.75

### Data Selection in DataFrame

In [3]:
population_dict = {'California': 38332521,
    'Texas': 26448193,
    'New York': 19651127,
    'Florida': 19552860,
    'Illinois': 12882135}
population = pd.Series(population_dict)
area_dict = {'California': 423967, 
    'Texas': 695662, 
    'New York': 141297,
    'Florida': 170312, 
    'Illinois': 149995}
area = pd.Series(area_dict)
states = pd.DataFrame({'population': population,
    'area': area})
states

Unnamed: 0,population,area
California,38332521,423967
Texas,26448193,695662
New York,19651127,141297
Florida,19552860,170312
Illinois,12882135,149995


In [17]:
# Select all values as array
states.values

array([[38332521,   423967],
       [26448193,   695662],
       [19651127,   141297],
       [19552860,   170312],
       [12882135,   149995]])

In [18]:
# Select rows by python index from-to > array
states.values[0]
states.values[0:3]

array([[38332521,   423967],
       [26448193,   695662],
       [19651127,   141297]])

In [6]:
# SELECT BY ROWS(by python index)
#      SQL > SELECT * FROM states WHERE index IN (1,2,3) > pandas.core.frame.DataFrame
states[0:3]
states.iloc[0:3]
states.iloc[:3]
states.iloc[[1,2,3]]
# SELECT BY ROWS(by python index)
#      SQL > SELECT * FROM states WHERE index IN ('California','Texas','New York') > pandas.core.frame.DataFrame
states['California':'New York']
states.loc[:'New York']
states.loc['California':'New York']
states.loc[['California','Texas','New York']]

Unnamed: 0,population,area
California,38332521,423967
Texas,26448193,695662
New York,19651127,141297


In [20]:
# SELECT BY COLUMN(by python index)
#      SQL > SELECT area > pandas.core.series.Series
states.area 
states['area']
# SELECT BY COLUMN(by python index)
#      SQL > SELECT area > pandas.core.frame.DataFrame
states[['area']] 
# SELECT BY COLUMNS(by python index)
#      SQL > SELECT 1,2 > pandas.core.frame.DataFrame
states.iloc[:, 1:2]
states.iloc[:, :2]
states.iloc[:, [0,1]]
# SELECT BY COLUMNS(by column name)
#      SQL > SELECT population, area > pandas.core.frame.DataFrame
states[['population','area']]
states.loc[:, :'area']
states.loc[:, 'population':'area']
states.loc[:, ['population', 'area']]

Unnamed: 0,population,area
California,38332521,423967
Texas,26448193,695662
New York,19651127,141297
Florida,19552860,170312
Illinois,12882135,149995


In [14]:
# SELECT BY ROWS & COLUMNS(by python index) 
#      SQL > SELECT 1,2 FROM states WHERE index IN (1,2,3) > pandas.core.frame.DataFrame
states.iloc[0:3, 0:2]
states.iloc[:3, :2]
states.iloc[[1,2,3], [0,1]]
# ROWS & COLUMNS(by column name) > SELECT population, area WHERE index IN ('California','Texas','New York') > pandas.core.frame.DataFrame
states.loc[:'New York', :'area']
states.loc['California':'New York', 'population':'area']
states.loc[['California', 'Texas', 'New York'], ['population', 'area']]

Unnamed: 0,population,area
California,38332521,423967
Texas,26448193,695662
New York,19651127,141297


## Columns

### Columns > cast

In [27]:
#https://pandas.pydata.org/docs/reference/api/pandas.to_datetime.html
#https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.astype.html
# by date > 
#     df = df.astype({'date': 'datetime64', 'to_date': 'datetime64'})
#     df = df.astype({'date': 'datetime64[ns]'})
#     df['date'] = df['date'].astype('datetime64[ns]')
#     df['date'] = pd.to_datetime(df["date"])
#     df['date'] = pd.to_datetime(df['date']).dt.strftime('%Y-%m-%d %H:%M:%S') > str
#     df['date'] = pd.to_datetime(pd.to_datetime(df['date']).dt.strftime('%Y-%m-%d %H:%M:%S')) > datetime
states[states.columns[0]].dtypes
states.dtypes 

population      int64
area            int64
density       float64
dtype: object

### Columns > add / modify 

In [22]:
# ADD COLUMN > SELECT (Expression) AS <column> ...
# MODIFY COLUMN > UPDATE <column> = (Expression) ...
# by date > 
#     df['date'] = df['date'].dt.weekday
#     df['date'] = df['date'].dt.strftime('%Y-%m-%d') > str
#     df['date'] = df['date'].dt.total_seconds()
# by string
#    df['str'] = df['str'].str.upper()
# by lambda
#    df['lambda'] = df['lambda'].apply(lambda x: str(x).upper()) 
states['density'] = states['population'] / states['area']
states

Unnamed: 0,population,area,density
California,38332521,423967,90.413926
Texas,26448193,695662,38.01874
New York,19651127,141297,139.076746
Florida,19552860,170312,114.806121
Illinois,12882135,149995,85.883763


### Columns > filter

In [23]:
# FILTER COLUMN > SELECT ... WHERE <column> = <condition>
# by conditional
#    & (AND), | (OR), >= (MAJOR OR EQUAL), >= (MINOR OR EQUAL), == (EQUAL), != (NOT EQUAL)
#    df[(df["field"] == "XXX") | (df["field"] == "YYY")] 
# by negative conditional
#    df[~df["field"].str.endswith('XXX')]
# by date > 
#    df.loc[df['field'] >= '2021-03-05'] 
#    df.loc[df['field'].dt.weekday == 2] 
#    df[df['field'].dt.strftime('%Y-%m-%d') = '2021-03-05'] 
#    df[df['field'].dt.total_seconds() >= 12345] 
# by string
#    df[df["field"].str.startswith("XXX")] 
#    df[df["field"].str.endswith('XXX')]
#    df[df["field"].str.contains("XXX|YYY")] 
#    df[df["field"].str.len() != 10] 
#    df[df["field"].astype(str).str.len() != 10] 
#    df[df["field"].isin(["XXX", "YYY"])]
# by lambda
#    df[df["field"].apply(lambda x: len(str(x)) != 10] 
states[(states.density > 90) & (states.density < 140)]
states.loc[(states.density > 90) & (states.density < 140)]
states.loc[(states.density > 90) & (states.density < 140), :]
states.loc[(states.density > 90) & (states.density < 140), :'density']
# https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.query.html
states.query('density > 90 and density < 140')

Unnamed: 0,population,area,density
California,38332521,423967,90.413926
New York,19651127,141297,139.076746
Florida,19552860,170312,114.806121


In [162]:
# MODIFY BY ROWS & COLUMNS
#      SQL > UPDATE density = 90 FROM states WHERE <condition> > pandas.core.frame.DataFrame
states.iloc[0, 2] = 90
states.at['California','density'] = 90
states.loc['California', 'density'] = 90
states.loc[states["density"] == 90, 'density'] = 90
states

Unnamed: 0,population,area,density
California,38332521,423967,90.0
Texas,26448193,695662,38.01874
New York,19651127,141297,139.076746
Florida,19552860,170312,114.806121
Illinois,12882135,149995,85.883763


In [89]:
# CONVERT COLUMN(by python index) > SELECT CAST(area) > pandas.core.series.Series
# TIMESTAMP
# https://pandas.pydata.org/docs/reference/api/pandas.Timestamp.html
from datetime import datetime

# states['date'] = pd.date_range(start='01/01/2021', periods=len(states), freq='D') # > datetime64[ns]
# states['dates'] = pd.to_datetime("01/01/2021") # > datetime64[ns]
# states['dates'] = pd.Timestamp("01/01/2021") # > datetime64[ns]

# df['timestamp'] = pd.to_datetime(df['timestamp'], format='%Y-%m-%d')
#states['timestamp'] = dt.datetime.now()
dt.datetime.now()
pd.to_datetime('now')
datetime.datetime.utcnow().replace(microsecond=0)
datetime.datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%SZ")
df['date_of_birth'] = pd.to_datetime(df['date_of_birth'],format='%d/%m/%Y')

# Convert the date to datetime64 
# df['timestamp'] = pd.to_datetime(df['timestamp'], format='%Y-%m-%d')
# How to manipulate textual data
# df[df["process_timestamp"].dt.strftime('%Y-%m-%d') == '2021-03-16']

states['dates'] = datetime.date(2030, 2, 2)
states
states.dtypes

TypeError: descriptor 'date' for 'datetime.datetime' objects doesn't apply to a 'int' object

In [16]:
# Pivot table
states.T

Unnamed: 0,California,Texas,New York,Florida,Illinois
population,38332520.0,26448190.0,19651130.0,19552860.0,12882140.0
area,423967.0,695662.0,141297.0,170312.0,149995.0
density,90.41393,38.01874,139.0767,114.8061,85.88376


## GROUP BY

In [28]:
# https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.core.groupby.DataFrameGroupBy.aggregate.html

In [None]:
a = df.groupby(["action","status"])
df.groupby(["action","status"]).groups
df.groupby(["action","status"]).groups.keys()
df.groupby(["action","status"]).count()
df.groupby(["action"])['status'].count()
df.groupby(["action"])['status'].value_counts()
df.groupby(["action","status"])['jobId'].agg(['count'])
df.groupby(["action"])['status'].agg(['count'])
df.groupby(["action","status"]).agg({'jobId': 'count'})
df.groupby(["action","status"]).agg({'diffDateSeconds': ['min', 'max', 'mean']})

d = df.groupby(["action","status"]).agg({'jobId': 'count', 'diffDateSeconds': ['min', 'max', 'mean']})

## MERGING

In [13]:
# https://pandas.pydata.org/pandas-docs/stable/user_guide/merging.html
df1 = pd.DataFrame(
    {
        "A": ["A0", "A1", "A2"],
        "B": ["B0", "B1", "B2"],
        "C": ["C0", "C1", "C2"]
    },
    index=[0, 1, 2],
)
df2 = pd.DataFrame(
    {
        "A": ["A3", "A4", "A5"],
        "B": ["B3", "B4", "B5"],
        "C": ["C3", "C4", "C5"]
    },
    index=[0, 1, 2],
)

In [14]:
pd.concat([df1, df2])

Unnamed: 0,A,B,C
0,A0,B0,C0
1,A1,B1,C1
2,A2,B2,C2
0,A3,B3,C3
1,A4,B4,C4
2,A5,B5,C5


In [18]:
# Ignoring the index
pd.concat([df1, df2], ignore_index=True)

Unnamed: 0,A,B,C
0,A0,B0,C0
1,A1,B1,C1
2,A2,B2,C2
3,A3,B3,C3
4,A4,B4,C4
5,A5,B5,C5


In [25]:
# Adding MultiIndex keys
df = pd.concat([df1, df2], keys=["df1", "df2"]); df

Unnamed: 0,Unnamed: 1,A,B,C
df1,0,A0,B0,C0
df1,1,A1,B1,C1
df1,2,A2,B2,C2
df2,0,A3,B3,C3
df2,1,A4,B4,C4
df2,2,A5,B5,C5


In [26]:
# Select by Key
df.loc["df2"]

Unnamed: 0,A,B,C
0,A3,B3,C3
1,A4,B4,C4
2,A5,B5,C5


## ORDER BY

In [1]:
# https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.sort_values.html
#     result.sort_values(by=['col1', 'col2']) > multiple columns
#     result.sort_values(by='col1', ascending=False) > Sort Descending

In [4]:
df = pd.DataFrame({
    'col1': ['A', 'A', 'B', np.nan, 'D', 'C'],
    'col2': [2, 1, 9, 8, 7, 4]
})
df

NameError: name 'np' is not defined