In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [3]:
# Size of the attribute listing for the pandas module and its two core classes
tuple(len(dir(obj)) for obj in (pd, pd.DataFrame, pd.Series))

(140, 427, 425)

## IO
https://pandas.pydata.org/pandas-docs/stable/user_guide/io.html

In [6]:
# Read the iris CSV (path is relative to the current working directory) into a DataFrame
iris = pd.read_csv(filepath_or_buffer='data/iris.csv')
# Preview the first five rows; 5 is also what head() uses when n is omitted
iris.head(n=5)

Unnamed: 0,SepalLength,SepalWidth,PetalLength,PetalWidth,Name
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [9]:
# Metadata: column labels, row index and per-column dtypes, displayed together as one tuple
iris.columns, iris.index, iris.dtypes

(Index(['SepalLength', 'SepalWidth', 'PetalLength', 'PetalWidth', 'Name'], dtype='object'),
 RangeIndex(start=0, stop=150, step=1),
 SepalLength    float64
 SepalWidth     float64
 PetalLength    float64
 PetalWidth     float64
 Name            object
 dtype: object)

In [10]:
# len() of a DataFrame is its number of rows
len(iris)

150

## Datatypes
https://pandas.pydata.org/pandas-docs/stable/user_guide/basics.html#basics-dtypes


https://pandas.pydata.org/pandas-docs/stable/user_guide/text.html#text-types

In [11]:
## Access
# Three ways to pick columns; only the last expression in the cell is displayed.
# Single column by attribute name
iris.SepalLength
# OR the same column by label
iris['SepalLength']  # -> Series
# Multiple columns: pass a list of labels
iris[['SepalLength','SepalWidth']]  # -> DataFrame

Unnamed: 0,SepalLength,SepalWidth
0,5.1,3.5
1,4.9,3.0
2,4.7,3.2
3,4.6,3.1
4,5.0,3.6
...,...,...
145,6.7,3.0
146,6.3,2.5
147,6.5,3.0
148,6.2,3.4


In [13]:
# Slicing: .loc selects by row/column labels, .iloc selects by integer position
iris.loc[0:3, ['SepalLength', 'SepalWidth']]  # 0:3 are row labels here, so the end label 3 is included

Unnamed: 0,SepalLength,SepalWidth
0,5.1,3.5
1,4.9,3.0
2,4.7,3.2
3,4.6,3.1


In [14]:
# Positional selection: row positions 0-3 and the first two columns
iris.iloc[0:4, [0,1]]  # 0:4 follows Python slice rules, so the end position 4 is excluded

Unnamed: 0,SepalLength,SepalWidth
0,5.1,3.5
1,4.9,3.0
2,4.7,3.2
3,4.6,3.1


In [15]:
# A bare ':' on both axes selects every row and every column
iris.iloc[:,:]  # full DataFrame

Unnamed: 0,SepalLength,SepalWidth,PetalLength,PetalWidth,Name
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,Iris-virginica
146,6.3,2.5,5.0,1.9,Iris-virginica
147,6.5,3.0,5.2,2.0,Iris-virginica
148,6.2,3.4,5.4,2.3,Iris-virginica


In [18]:
# Boolean selection through .loc: build a True/False mask per row and combine
# masks with & (and), | (or), ~ (not).
# Each comparison is wrapped in parentheses because & and | bind tighter than ==.
# Only the last expression in the cell is displayed.
iris.loc[iris.Name == 'Iris-virginica', :]  # one species, all columns
iris.loc[iris.Name == 'Iris-virginica', ['SepalLength', 'SepalWidth']]  # one species, two columns
iris.loc[(iris.Name == 'Iris-virginica') | (iris.Name == 'Iris-setosa'), ['SepalLength', 'SepalWidth']]  # either species

Unnamed: 0,SepalLength,SepalWidth
0,5.1,3.5
1,4.9,3.0
2,4.7,3.2
3,4.6,3.1
4,5.0,3.6
...,...,...
145,6.7,3.0
146,6.3,2.5
147,6.5,3.0
148,6.2,3.4


In [24]:
# Create a new column from an element-wise NumPy expression over existing columns
iris['dummy'] = np.abs(iris.SepalLength - 2 * iris.SepalWidth + 1)
iris.dummy
# Delete it again. drop() returns a new frame, so rebind the name instead of
# using inplace=True (which returns None and cannot be chained).
iris = iris.drop(columns=['dummy'])
# Confirm that the original five columns are back
iris.columns

Index(['SepalLength', 'SepalWidth', 'PetalLength', 'PetalWidth', 'Name'], dtype='object')

In [28]:
# Series methods (only the last expression in the cell is displayed)
iris.Name.unique()  # distinct values of the column
# String columns expose many string-related methods through the .str accessor
dir(iris.Name.str)  # list what the accessor offers
iris.Name.str.lower()  # example: lower-case every value
# Datetime columns: to be covered later
iris.Name.value_counts()  # number of rows per distinct value

Iris-setosa        50
Iris-versicolor    50
Iris-virginica     50
Name: Name, dtype: int64

In [30]:
# DataFrame methods are similar to the Series ones
iris.sort_values(by=['Name', 'SepalLength'])  # order by Name, then by SepalLength within each Name

Unnamed: 0,SepalLength,SepalWidth,PetalLength,PetalWidth,Name
13,4.3,3.0,1.1,0.1,Iris-setosa
8,4.4,2.9,1.4,0.2,Iris-setosa
38,4.4,3.0,1.3,0.2,Iris-setosa
42,4.4,3.2,1.3,0.2,Iris-setosa
41,4.5,2.3,1.3,0.3,Iris-setosa
...,...,...,...,...,...
117,7.7,3.8,6.7,2.2,Iris-virginica
118,7.7,2.6,6.9,2.3,Iris-virginica
122,7.7,2.8,6.7,2.0,Iris-virginica
135,7.7,3.0,6.1,2.3,Iris-virginica


In [32]:
# Axis: axis=0 reduces down each column and is the default, so both calls give the same result.
# Note in the output that the string column Name is "summed" by concatenating its values.
iris.sum(axis=0) , iris.sum() #default by col

(SepalLength                                                876.5
 SepalWidth                                                 458.1
 PetalLength                                                563.8
 PetalWidth                                                 179.8
 Name           Iris-setosaIris-setosaIris-setosaIris-setosaIr...
 dtype: object,
 SepalLength                                                876.5
 SepalWidth                                                 458.1
 PetalLength                                                563.8
 PetalWidth                                                 179.8
 Name           Iris-setosaIris-setosaIris-setosaIris-setosaIr...
 dtype: object)

In [33]:
# Row-wise total of the numeric measurements.
# numeric_only=True skips the string column Name explicitly; without it,
# pandas >= 2.0 raises TypeError (float + str) instead of silently dropping
# the non-numeric column as older versions did.
iris.sum(axis=1, numeric_only=True)

0      10.2
1       9.5
2       9.4
3       9.4
4      10.2
       ... 
145    17.2
146    15.7
147    16.7
148    17.3
149    15.8
Length: 150, dtype: float64

In [None]:
# Aggregation
grp = iris.groupby('Name')