In [1]:
import numpy as np

# Pandas Basics

In [2]:
import pandas as pd

> Pandas allows for fast analysis, data cleaning and preparation.
> Pandas excels in performance and productivity
* conda install pandas
_OR_
* pip install pandas 

## Series

In [3]:
# A Series is a one-dimensional labelled array built on top of a NumPy array.
# Here the values come from a plain list and the labels are given explicitly.

labels = ["a", "b", "c"]
my_data = [1, 2, 3]

pd.Series(my_data, index=labels)

a    1
b    2
c    3
dtype: int64

In [4]:
# We can also pass a dictionary as data to a Series.
# In that case, pandas automatically uses each 'Key' as an index label and
# each 'Value' as the corresponding Series value.

dic = { "First Name": "Rakshith", "Last Name" : "Ramagiri"}

In [5]:
dic

{'First Name': 'Rakshith', 'Last Name': 'Ramagiri'}

In [6]:
pd.Series(data=dic) # NOTE: pass only the dictionary -- its keys supply the index labels, so no separate 'index' argument is needed

First Name    Rakshith
Last Name     Ramagiri
dtype: object

In [7]:
# A Series can hold references to arbitrary Python objects, e.g. built-in functions
# (the resulting dtype is 'object')

pd.Series(data=[sum, max, min])

0    <built-in function sum>
1    <built-in function max>
2    <built-in function min>
dtype: object

In [8]:
# Vectorized operations between two Series align their elements by index label before operating on them

In [9]:
ser1 = pd.Series(data=[5,10,16], index=['a','b','c'])

In [10]:
ser2 = pd.Series(data=[3,7,9], index=['b','c','d'])

In [11]:
ser1 + ser2 # will add all matching index_label values, 
            # if no matching index_label is found, then it returns 'NaN'

a     NaN
b    13.0
c    23.0
d     NaN
dtype: float64

In [12]:
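# NOTE: the result is 'float64' because NaN is a floating-point value and cannot be stored in an integer Series, so the integers are upcast whenever alignment introduces NaN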

## DataFrames

In [14]:
from numpy.random import randn

In [15]:
# Seed NumPy's global generator so the random values (and every output derived
# from them below) are identical on each "Restart & Run All". Previously saved
# outputs were produced without a seed and will change once on re-execution.
np.random.seed(101)

# 5 rows x 4 columns of standard-normal samples with explicit row/column labels.
df = pd.DataFrame(
    data=randn(5, 4),
    index=['Row A', 'Row B', 'Row C', 'Row D', 'Row E'],
    columns=['Col 1', 'Col 2', 'Col 3', 'Col 4'],
)

In [16]:
df

Unnamed: 0,Col 1,Col 2,Col 3,Col 4
Row A,-0.324043,0.188473,-1.049823,-0.791464
Row B,-0.769422,-0.738268,-2.106194,-0.633131
Row C,-0.451459,-0.832554,1.12696,0.505772
Row D,1.008664,0.487098,0.041811,-0.107324
Row E,1.304048,0.18824,2.033199,-0.514339


In [17]:
# A DataFrame is simply a collection of Series (one per column) that all share the same index

In [18]:
df['Col 1']

Row A   -0.324043
Row B   -0.769422
Row C   -0.451459
Row D    1.008664
Row E    1.304048
Name: Col 1, dtype: float64

In [19]:
type(df['Col 1']) # we can confirm the above statement using 'type()'

pandas.core.series.Series

In [20]:
type(df)

pandas.core.frame.DataFrame

In [22]:
# Selecting only a few columns
df[['Col 2', 'Col 4']]

Unnamed: 0,Col 2,Col 4
Row A,0.188473,-0.791464
Row B,-0.738268,-0.633131
Row C,-0.832554,0.505772
Row D,0.487098,-0.107324
Row E,0.18824,-0.514339


In [24]:
# Creating new columns inside an existing DataFrame.

df['new'] = df['Col 3'] + df['Col 4'] # similar to creating a new 'Key-Value' pair in dictionary

In [25]:
df

Unnamed: 0,Col 1,Col 2,Col 3,Col 4,new
Row A,-0.324043,0.188473,-1.049823,-0.791464,-1.841287
Row B,-0.769422,-0.738268,-2.106194,-0.633131,-2.739326
Row C,-0.451459,-0.832554,1.12696,0.505772,1.632732
Row D,1.008664,0.487098,0.041811,-0.107324,-0.065513
Row E,1.304048,0.18824,2.033199,-0.514339,1.51886


In [27]:
# We can also DROP existing columns from a DataFrame.
#
# By default 'drop' looks for the label along axis=0 (the ROWS), so asking for a
# column label without 'axis=1' fails. The error is the point of this cell, so it
# is caught and printed rather than left uncaught -- an uncaught exception would
# stop "Restart & Run All" here.
# Older pandas raises ValueError for a missing label; current pandas raises KeyError.

try:
    df.drop('new')  # axis=0 by default, and there is no ROW labelled 'new'
except (KeyError, ValueError) as err:
    print("{}: {}".format(type(err).__name__, err))

ValueError: labels ['new'] not contained in axis

In [28]:
df.drop('new', axis=1) # this doesn't actually affect the DataFrame. Because 'drop' is not 'inplace' by default.

Unnamed: 0,Col 1,Col 2,Col 3,Col 4
Row A,-0.324043,0.188473,-1.049823,-0.791464
Row B,-0.769422,-0.738268,-2.106194,-0.633131
Row C,-0.451459,-0.832554,1.12696,0.505772
Row D,1.008664,0.487098,0.041811,-0.107324
Row E,1.304048,0.18824,2.033199,-0.514339


In [29]:
df.drop('new', axis=1, inplace=True) # this permanently modifies the DataFrame; re-running this cell fails because 'new' is already gone

In [31]:
df # changes permanent

Unnamed: 0,Col 1,Col 2,Col 3,Col 4
Row A,-0.324043,0.188473,-1.049823,-0.791464
Row B,-0.769422,-0.738268,-2.106194,-0.633131
Row C,-0.451459,-0.832554,1.12696,0.505772
Row D,1.008664,0.487098,0.041811,-0.107324
Row E,1.304048,0.18824,2.033199,-0.514339


In [32]:
df.shape # to show DataFrame dimensions

(5, 4)

In [33]:
# 'shape' is reported as (rows, columns): position 0 counts ROWS and position 1 counts COLUMNS, which is why 'axis=0' refers to ROWS and 'axis=1' refers to COLUMNS

### Selecting ROWS in DataFrame.

In [35]:
# Label-based ROW selection

df.loc['Row A'] # 'loc' is an indexer attribute of the DataFrame, not a method, so it is subscripted with '[]' rather than called with '()'

Col 1   -0.324043
Col 2    0.188473
Col 3   -1.049823
Col 4   -0.791464
Name: Row A, dtype: float64

In [36]:
# as we can observe, the returned item is also a pd.Series object

In [37]:
# Integer-position-based ROW selection (position 0 is the first row, whatever its label)

df.iloc[0]

Col 1   -0.324043
Col 2    0.188473
Col 3   -1.049823
Col 4   -0.791464
Name: Row A, dtype: float64

### Selecting Single Values in DataFrame

In [39]:
# Obtaining a single value.

df.loc['Row A', 'Col 1']

-0.3240428992191905

In [40]:
# Obtaining sub-DataFrame from a DataFrame.

df.loc[ ['Row A', 'Row D'] , ['Col 2', 'Col 3']] # loc[ [ROW_SELECTION] , [COLUMN_SELECTION] ]

Unnamed: 0,Col 2,Col 3
Row A,0.188473,-1.049823
Row D,0.487098,0.041811
