In [1]:
#Introduction to Pandas
import pandas as pd

# Creating Series Data
- A series can be created using various inputs like 
- Array
- Dict
- Scalar value or constant

##### Example: A basic series, which can be created is an Empty Series

In [3]:
# An empty Series has no values to infer a dtype from, so state it explicitly.
# Without dtype, recent pandas versions fall back to object (older ones warn),
# which would not match the float64 result this example is meant to show.
s = pd.Series(dtype='float64')
print(s)

Series([], dtype: float64)


  s = pd.Series()


# Create a Series from ndarray
- If data is an ndarray, then index passed must be of the same length. 
- If no index is passed, then by default the index will be range(n), where n is the array length, i.e., [0, 1, 2, …, len(array) - 1]

In [19]:
import numpy as np

In [10]:
# Wrap a NumPy array of letters in a Series; no index is given,
# so the labels default to 0..3.
data1 = np.array(list('abcd'))
s1 = pd.Series(data=data1)
print(s1)

0    a
1    b
2    c
3    d
dtype: object


##### Example: Create a pandas series with indexes from 100 to 103 containing data [a,b,c,d] using pandas.Series()

In [9]:
# Same letters as before, but labelled explicitly with 100..103.
data2 = np.array(list('abcd'))
s2 = pd.Series(data=data2, index=[100, 101, 102, 103])
print(s2)

100    a
101    b
102    c
103    d
dtype: object


# Create a Series from dict
- A dict can be passed as input and if no index is specified, then the dictionary keys are taken in insertion order to construct the index (versions of pandas before 0.23 sorted the keys instead)
- If index is passed, the values in data corresponding to the labels in the index will be pulled out

In [11]:
# The dict keys become the index labels and the dict values become the data.
data3 = dict(a=0.0, b=1.0, c=2.0)
s3 = pd.Series(data=data3)
print(s3)

a    0.0
b    1.0
c    2.0
dtype: float64


##### Index order is persisted and the missing element is filled with NaN (Not a Number)

In [12]:
# The explicit index decides both the order and the set of labels;
# 'd' has no entry in the dict, so its value comes out as NaN.
data4 = dict(a=0.0, b=1.0, c=2.0)
s4 = pd.Series(data=data4, index=list('bcda'))
print(s4)

b    1.0
c    2.0
d    NaN
a    0.0
dtype: float64


# Create a Series from scalar
- If data is a scalar value, an index must be provided. The value will be repeated to match the length of index

In [17]:
# A scalar is broadcast to every label of the supplied index.
s5 = pd.Series(data=6, index=[0, 1, 2, 3])
print(s5)

0    6
1    6
2    6
3    6
dtype: int64


# Accessing Data from Series with Position
- Data in the series can be accessed similar to that in an ndarray

##### Example: Retrieve the first 3 elements from a series

In [19]:
s6 = pd.Series([1, 2, 3, 4, 5], index=['a', 'b', 'c', 'd', 'e'])
# Retrieve the first 3 elements by position. .iloc states the positional
# intent explicitly, so the slice can never be mistaken for a label lookup
# (plain [] switches meaning depending on the index type).
first_three = s6.iloc[:3]
print(first_three)

a    1
b    2
c    3
dtype: int64


##### Exercise: Retrieve the last three elements

In [26]:
s7 = pd.Series([1, 2, 3, 4, 5], index=['a', 'b', 'c', 'd', 'e'])
# Retrieve the last three elements by position. A negative start counts from
# the end; .iloc keeps the access explicitly positional regardless of the
# index labels.
last_three = s7.iloc[-3:]
print(last_three)

c    3
d    4
e    5
dtype: int64


# Accessing Data from Series 
- Retrieve Data Using Label (Index)
- A Series is like a fixed-size dict in that you can get and set values by index label

##### Example :Retrieve a single element using index label value

In [21]:
s8 = pd.Series(data=[1, 2, 3, 4, 5], index=list('abcde'))
# Look up one value through its index label.
print(s8['a'])

1


##### Example :Retrieve multiple elements using a list of index label values

In [22]:
s9 = pd.Series(data=[1, 2, 3, 4, 5], index=list('abcde'))
# Passing a list of labels returns a sub-Series in the order requested.
print(s9[['a', 'c', 'd']])

a    1
c    3
d    4
dtype: int64


# Create an Empty DataFrame
- A basic DataFrame, which can be created is an Empty Dataframe

In [24]:
# With no data at all the frame has neither rows nor columns.
df1 = pd.DataFrame(data=None)
print(df1)

Empty DataFrame
Columns: []
Index: []


#### Example: Creating DataFrame from Lists

In [25]:
# Each inner list is one row; the column names are supplied separately.
data5 = [
    ['Alex', 10],
    ['Bob', 12],
    ['Clarke', 13],
]
df2 = pd.DataFrame(data=data5, columns=['Name', 'Age'])
print(df2)

     Name  Age
0    Alex   10
1     Bob   12
2  Clarke   13


##### Example : Change the datatype of age to the float

In [27]:
data6 = [['Alex', 10], ['Bob', 12], ['Clarke', 13]]
# Cast only the numeric column. Passing dtype=float to the constructor applies
# to every column, and recent pandas versions raise when the 'Name' strings
# cannot be converted instead of silently skipping them.
df3 = pd.DataFrame(data6, columns=['Name', 'Age']).astype({'Age': float})
print(df3)

     Name   Age
0    Alex  10.0
1     Bob  12.0
2  Clarke  13.0


# Creating DataFrame from Dict of ndarrays / Lists
- All the ndarrays must be of same length.
- If index is passed, then the length of the index should equal to the length of the arrays
- If no index is passed, then by default, index will be range(n), where n is the array length

In [28]:
# One equal-length list per column; rows get the default 0..n-1 index.
data7 = {
    'Name': ['Tom', 'Jack', 'Steve', 'Ricky'],
    'Age': [28, 34, 29, 42],
}
df4 = pd.DataFrame(data=data7)
print(df4)

    Name  Age
0    Tom   28
1   Jack   34
2  Steve   29
3  Ricky   42


##### Example: Creating an indexed DataFrame from Dict of ndarrays / Lists

In [30]:
# Same column data as before, but the rows are labelled rank1..rank4.
data8 = {
    'Name': ['Tom', 'Jack', 'Steve', 'Ricky'],
    'Age': [28, 34, 29, 42],
}
df5 = pd.DataFrame(data=data8, index=['rank1', 'rank2', 'rank3', 'rank4'])
print(df5)

        Name  Age
rank1    Tom   28
rank2   Jack   34
rank3  Steve   29
rank4  Ricky   42


# Creating a DataFrame from List of Dicts
- List of Dictionaries can be passed as input data to create a DataFrame. 
- The dictionary keys are by default taken as column names 

In [35]:
# Each dict is one row; a key missing from a row ('c' in the first) becomes NaN.
data9 = [
    {'a': 1, 'b': 2},
    {'a': 5, 'b': 10, 'c': 20},
]
df6 = pd.DataFrame(data=data9)
print(df6)

   a   b     c
0  1   2   NaN
1  5  10  20.0


##### Exercise: Create a DataFrame by passing a list of dictionaries 

In [36]:
# List of row dicts again, this time with explicit row labels.
data10 = [
    {'a': 1, 'b': 2},
    {'a': 5, 'b': 10, 'c': 20},
]
df7 = pd.DataFrame(data=data10, index=['first', 'second'])
print(df7)

        a   b     c
first   1   2   NaN
second  5  10  20.0


##### Creating a DataFrame with a list of dictionaries, row indices, and column indices

In [38]:
data11 = [
    {'a': 1, 'b': 2},
    {'a': 5, 'b': 10, 'c': 20},
]
# Both requested columns exist as dict keys, so 'c' is simply left out.
df8 = pd.DataFrame(
    data=data11,
    index=['first', 'second'],
    columns=['a', 'b'],
)
print(df8)

        a   b
first   1   2
second  5  10


##### Creating a DataFrame with a list of dictionaries, row indices, and column indices

In [40]:
data12 = [
    {'a': 1, 'b': 2},
    {'a': 5, 'b': 10, 'c': 20},
]
# 'b1' is not a key in any row dict, so that column is filled with NaN.
df9 = pd.DataFrame(
    data=data12,
    index=['first', 'second'],
    columns=['a', 'b1'],
)
print(df9)

        a  b1
first   1 NaN
second  5 NaN


# Creating a DataFrame from Dict of Series
- Dictionary of Series can be passed to form a DataFrame 
- The resultant index is the union of all the series indexes passed

In [42]:
# Column 'one' has no label 'd'; the frame's index is the union of both
# Series indexes, so that cell becomes NaN.
d1 = {
    'one': pd.Series([1, 2, 3], index=list('abc')),
    'two': pd.Series([1, 2, 3, 4], index=list('abcd')),
}
df10 = pd.DataFrame(data=d1)
print(df10)

   one  two
a  1.0    1
b  2.0    2
c  3.0    3
d  NaN    4


# Column Selection, addition and Deletion

##### Column Selection

In [43]:
d2 = {
    'one': pd.Series([1, 2, 3], index=list('abc')),
    'two': pd.Series([1, 2, 3, 4], index=list('abcd')),
}
df11 = pd.DataFrame(data=d2)
# Indexing the frame with a column name returns that column as a Series.
print(df11['one'])

a    1.0
b    2.0
c    3.0
d    NaN
Name: one, dtype: float64


##### Column Addition
- Adding a new column to an existing data frame

In [6]:
d3 = {
    'one': pd.Series([1, 2, 3], index=list('abc')),
    'two': pd.Series([1, 2, 3, 4], index=list('abcd')),
}
df12 = pd.DataFrame(data=d3)
# Assigning a Series under a new column label aligns it on the row index;
# row 'd' has no matching label and therefore receives NaN.
print("Adding a new column by passing as Series:")
df12['three'] = pd.Series([10, 20, 30], index=list('abc'))
print(df12)

Adding a new column by passing as Series:
   one  two  three
a  1.0    1   10.0
b  2.0    2   20.0
c  3.0    3   30.0
d  NaN    4    NaN


##### Exercise : Add a fourth column in the previous dataframe which is the sum of column “one” and column “three”

In [8]:
d4 = {
    'one': pd.Series([1, 2, 3], index=list('abc')),
    'two': pd.Series([1, 2, 3, 4], index=list('abcd')),
}
df13 = pd.DataFrame(data=d4)

# Step 1: attach column 'three' from a Series (aligned on the row labels).
print("Adding a new column by passing as Series:")
df13['three'] = pd.Series([10, 20, 30], index=list('abc'))
print(df13)

# Step 2: derive column 'four' as the element-wise sum of two existing columns;
# NaN in either operand propagates to the result.
print("Adding a new column using the existing columns in DataFrame:")
df13['four'] = df13['one'].add(df13['three'])
print(df13)

Adding a new column by passing as Series:
   one  two  three
a  1.0    1   10.0
b  2.0    2   20.0
c  3.0    3   30.0
d  NaN    4    NaN
Adding a new column using the existing columns in DataFrame:
   one  two  three  four
a  1.0    1   10.0  11.0
b  2.0    2   20.0  22.0
c  3.0    3   30.0  33.0
d  NaN    4    NaN   NaN


##### Column deletion
- Columns can be deleted or popped by del  or pop

In [10]:
d5 = {
    'one': pd.Series([1, 2, 3], index=list('abc')),
    'two': pd.Series([1, 2, 3, 4], index=list('abcd')),
    'three': pd.Series([10, 20, 30], index=list('abc')),
}
df13 = pd.DataFrame(data=d5)
# `del` removes the named column from the frame in place.
print("Deleting the first column using DEL function:")
del df13['one']
print(df13)

Deleting the first column using DEL function:
   two  three
a    1   10.0
b    2   20.0
c    3   30.0
d    4    NaN


##### Selection by Label
- Rows can be selected by passing row label to a loc function

In [11]:
d6 = {
    'one': pd.Series([1, 2, 3], index=list('abc')),
    'two': pd.Series([1, 2, 3, 4], index=list('abcd')),
}
df13 = pd.DataFrame(data=d6)
print(df13)
# .loc with a row label returns that row as a Series keyed by column name.
print(df13.loc['b'])

   one  two
a  1.0    1
b  2.0    2
c  3.0    3
d  NaN    4
one    2.0
two    2.0
Name: b, dtype: float64


# Row Selection, Addition, and Deletion
- Selection by integer location

In [12]:
d7 = {
    'one': pd.Series([1, 2, 3], index=list('abc')),
    'two': pd.Series([1, 2, 3, 4], index=list('abcd')),
}
df14 = pd.DataFrame(data=d7)
# .iloc selects by integer position: position 2 is the third row ('c').
print(df14.iloc[2])

one    3.0
two    3.0
Name: c, dtype: float64


##### Slice row
- By operator ‘:’

In [13]:
d8 = {
    'one': pd.Series([1, 2, 3], index=list('abc')),
    'two': pd.Series([1, 2, 3, 4], index=list('abcd')),
}
df15 = pd.DataFrame(data=d8)
# An integer slice on the frame picks rows by position (end exclusive):
# here the rows at positions 2 and 3.
print(df15[2:4])

   one  two
c  3.0    3
d  NaN    4


##### Addition of Rows
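- Add new rows to a DataFrame using the append function (note: `DataFrame.append` was deprecated in pandas 1.4 and removed in pandas 2.0; `pd.concat([df_a, df_b])` is the replacement)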
- This function will append the rows at the end

In [16]:
df16 = pd.DataFrame([[1, 2], [3, 4]], columns=['a', 'b'])
df17 = pd.DataFrame([[5, 6], [7, 8]], columns=['a', 'b'])
# DataFrame.append was removed in pandas 2.0. pd.concat stacks the rows of
# df17 under df16 the same way, keeping each frame's own row labels, so the
# result's index is 0, 1, 0, 1.
df16 = pd.concat([df16, df17])
print(df16)

   a  b
0  1  2
1  3  4
0  5  6
1  7  8


##### Deletion of Rows
- Use index label to delete or drop rows from a DataFrame by drop()

In [17]:
df18 = pd.DataFrame([[1, 2], [3, 4]], columns=['a', 'b'])
df19 = pd.DataFrame([[5, 6], [7, 8]], columns=['a', 'b'])
# DataFrame.append was removed in pandas 2.0; pd.concat gives the same stacked
# frame, in which the labels 0 and 1 each occur twice.
df18 = pd.concat([df18, df19])
# Drop rows with label 0. Because the label is duplicated, both rows carrying
# it are removed.
df18 = df18.drop(0)
print(df18)

   a  b
1  3  4
1  7  8


# Series Basic Functionality
- Create a Series and see all the attributes operation

In [20]:
# Four draws from the standard normal distribution, default 0..3 index.
s1 = pd.Series(data=np.random.randn(4))
print(s1)

0   -1.042435
1   -0.352726
2   -0.214650
3   -0.833307
dtype: float64


##### Axes
- Returns the list of the labels of the series

In [21]:
s2 = pd.Series(data=np.random.randn(4))
# .axes is a one-element list holding the Series' row index.
print("The axes are:")
print(s2.axes)

The axes are:
[RangeIndex(start=0, stop=4, step=1)]


##### empty
- Returns the Boolean value saying whether the Object is empty or not 
- True indicates that the object is empty

In [22]:
s3 = pd.Series(data=np.random.randn(4))
# .empty is True only when the object holds no elements.
print("Is the Object empty?")
print(s3.empty)

Is the Object empty?
False


##### ndim
- Returns the number of dimensions of the object. By definition, a Series is a 1D data structure, so it returns 1

In [23]:
s4 = pd.Series(data=np.random.randn(4))
print(s4)
# .ndim reports the number of dimensions of the object.
print("The dimensions of the object:")
print(s4.ndim)

0   -1.737973
1   -0.651273
2   -0.526529
3    0.381810
dtype: float64
The dimensions of the object:
1


##### size
- Returns the size(length) of the series

In [24]:
s5 = pd.Series(data=np.random.randn(2))
print(s5)
# .size is the number of elements held by the Series.
print("The size of the object:")
print(s5.size)

0    0.208365
1   -0.124919
dtype: float64
The size of the object:
2


##### values
- Returns the actual data in the series as an array

In [26]:
s6 = pd.Series(data=np.random.randn(4))
print(s6)
# .values exposes the underlying data as a NumPy array (no index labels).
print("The actual data series is:")
print(s6.values)

0   -1.085023
1    0.236476
2    1.742298
3   -0.970595
dtype: float64
The actual data series is:
[-1.08502295  0.2364759   1.7422985  -0.97059547]


##### head and tail
- To view a small sample of a Series or the DataFrame object, use the head() and the tail() methods. head() returns the first n rows(observe the index values). The default number of elements to display is five, but you may pass a custom number

In [28]:
s7 = pd.Series(data=np.random.randn(4))
print("The original series is:")
print(s7)
# head(n) keeps the first n entries along with their index labels.
print("The first two rows of the data series:")
print(s7.head(2))

The original series is:
0    2.288835
1    1.388239
2    0.398697
3    0.473895
dtype: float64
The first two rows of the data series:
0    2.288835
1    1.388239
dtype: float64


##### tail() 
- returns the last n rows(observe the index values). The default number of elements to display is five, but you may pass a custom number

In [29]:
s8 = pd.Series(data=np.random.randn(4))
print("The original series is:")
print(s8)
# tail(n) keeps the last n entries along with their index labels.
print("The last two rows of the data series:")
print(s8.tail(2))

The original series is:
0   -0.013827
1    1.352597
2    0.214221
3   -0.668230
dtype: float64
The last two rows of the data series:
2    0.214221
3   -0.668230
dtype: float64


# Data Frame Basic Functionality
- Create a DataFrame and explore its basic attributes and operations

In [44]:
# One Series per column: seven people with their age and rating.
d9 = {
    'Name': pd.Series(['Tom', 'James', 'Ricky', 'Vin', 'Steve', 'Smith', 'Jack']),
    'Age': pd.Series([25, 26, 25, 23, 30, 29, 23]),
    'Rating': pd.Series([4.23, 3.24, 3.98, 2.56, 3.20, 4.6, 3.8]),
}
df20 = pd.DataFrame(data=d9)
print("Our data series is:")
print(df20)

Our data series is:
    Name  Age  Rating
0    Tom   25    4.23
1  James   26    3.24
2  Ricky   25    3.98
3    Vin   23    2.56
4  Steve   30    3.20
5  Smith   29    4.60
6   Jack   23    3.80


##### Transpose
- Returns the transpose of the DataFrame. The rows and columns will interchange

In [31]:
# Column data: one Series each for name, age and rating.
d10 = {
    'Name': pd.Series(['Tom', 'James', 'Ricky', 'Vin', 'Steve', 'Smith', 'Jack']),
    'Age': pd.Series([25, 26, 25, 23, 30, 29, 23]),
    'Rating': pd.Series([4.23, 3.24, 3.98, 2.56, 3.20, 4.6, 3.8]),
}
# Build the frame, then show it with rows and columns swapped via .T.
df21 = pd.DataFrame(data=d10)
print("The transpose of the data series is:")
print(df21.T)

The transpose of the data series is:
           0      1      2     3      4      5     6
Name     Tom  James  Ricky   Vin  Steve  Smith  Jack
Age       25     26     25    23     30     29    23
Rating  4.23   3.24   3.98  2.56    3.2    4.6   3.8


##### axes
- Returns the list of row axis labels and column axis labels

In [32]:
# Create a dictionary of Series, one per column
d11 = {
    'Name': pd.Series(['Tom', 'James', 'Ricky', 'Vin', 'Steve', 'Smith', 'Jack']),
    'Age': pd.Series([25, 26, 25, 23, 30, 29, 23]),
    'Rating': pd.Series([4.23, 3.24, 3.98, 2.56, 3.20, 4.6, 3.8]),
}
# Create the DataFrame in this cell. Previously d11 was never used and the
# print below relied on a df21 left over from the Transpose cell, so this cell
# failed when run on its own in a fresh kernel.
df21 = pd.DataFrame(d11)
# .axes is [row index, column index].
print("Row axis labels and column axis labels are:")
print(df21.axes)

Row axis labels and column axis labels are:
[RangeIndex(start=0, stop=7, step=1), Index(['Name', 'Age', 'Rating'], dtype='object')]


##### dtypes
- Returns the data type of each column

In [33]:
# Three columns of different kinds: text, integers and floats.
d12 = {
    'Name': pd.Series(['Tom', 'James', 'Ricky', 'Vin', 'Steve', 'Smith', 'Jack']),
    'Age': pd.Series([25, 26, 25, 23, 30, 29, 23]),
    'Rating': pd.Series([4.23, 3.24, 3.98, 2.56, 3.20, 4.6, 3.8]),
}
df22 = pd.DataFrame(data=d12)
# .dtypes lists the dtype of every column.
print("The data types of each column are:")
print(df22.dtypes)

The data types of each column are:
Name       object
Age         int64
Rating    float64
dtype: object


##### empty
- Returns the Boolean value saying whether the Object is empty or not; True indicates that the object is empty

In [34]:
# Column data for the frame.
d13 = {
    'Name': pd.Series(['Tom', 'James', 'Ricky', 'Vin', 'Steve', 'Smith', 'Jack']),
    'Age': pd.Series([25, 26, 25, 23, 30, 29, 23]),
    'Rating': pd.Series([4.23, 3.24, 3.98, 2.56, 3.20, 4.6, 3.8]),
}
df23 = pd.DataFrame(data=d13)
# .empty is False because the frame holds data.
print("Is the object empty?")
print(df23.empty)

Is the object empty?
False


##### ndim
- Returns the number of dimensions of the object. By definition, DataFrame is a 2D object

In [35]:
# Column data for the frame.
d14 = {
    'Name': pd.Series(['Tom', 'James', 'Ricky', 'Vin', 'Steve', 'Smith', 'Jack']),
    'Age': pd.Series([25, 26, 25, 23, 30, 29, 23]),
    'Rating': pd.Series([4.23, 3.24, 3.98, 2.56, 3.20, 4.6, 3.8]),
}
df24 = pd.DataFrame(data=d14)
# .ndim counts the axes of the object (rows and columns for a frame).
print("The dimension of the object is:")
print(df24.ndim)

The dimension of the object is:
2


##### shape
- Returns a tuple representing the dimensionality of the DataFrame. Tuple (a,b), where a represents the number of rows and b represents the number of columns

In [37]:
# Column data for the frame.
d15 = {
    'Name': pd.Series(['Tom', 'James', 'Ricky', 'Vin', 'Steve', 'Smith', 'Jack']),
    'Age': pd.Series([25, 26, 25, 23, 30, 29, 23]),
    'Rating': pd.Series([4.23, 3.24, 3.98, 2.56, 3.20, 4.6, 3.8]),
}
df25 = pd.DataFrame(data=d15)
# .shape is the (rows, columns) tuple.
print("The shape of the object is:")
print(df25.shape)

The shape of the object is:
(7, 3)


##### size
- Returns the number of elements in the DataFrame 

In [38]:
# Column data for the frame.
d16 = {
    'Name': pd.Series(['Tom', 'James', 'Ricky', 'Vin', 'Steve', 'Smith', 'Jack']),
    'Age': pd.Series([25, 26, 25, 23, 30, 29, 23]),
    'Rating': pd.Series([4.23, 3.24, 3.98, 2.56, 3.20, 4.6, 3.8]),
}
df26 = pd.DataFrame(data=d16)
# .size is rows multiplied by columns.
print("The total number of elements in our object is:")
print(df26.size)

The total number of elements in our object is:
21


##### values
- Returns the actual data in the DataFrame as a NumPy ndarray

In [39]:
d17 = {
    'Name': pd.Series(['Tom', 'James', 'Ricky', 'Vin', 'Steve', 'Smith', 'Jack']),
    'Age': pd.Series([25, 26, 25, 23, 30, 29, 23]),
    'Rating': pd.Series([4.23, 3.24, 3.98, 2.56, 3.20, 4.6, 3.8]),
}
df27 = pd.DataFrame(data=d17)
# .values returns the cell contents as a 2-D array, one inner row per frame row.
print("The actual data in our data frame is:")
print(df27.values)

The actual data in our data frame is:
[['Tom' 25 4.23]
 ['James' 26 3.24]
 ['Ricky' 25 3.98]
 ['Vin' 23 2.56]
 ['Steve' 30 3.2]
 ['Smith' 29 4.6]
 ['Jack' 23 3.8]]


##### Head and tail
- To view a small sample of a DataFrame object, use the head() and tail() methods. head() returns the first n rows (observe the index values). The default number of elements to display is five, but you may pass a custom number. 

In [42]:
d18 = {
    'Name': pd.Series(['Tom', 'James', 'Ricky', 'Vin', 'Steve', 'Smith', 'Jack']),
    'Age': pd.Series([25, 26, 25, 23, 30, 29, 23]),
    'Rating': pd.Series([4.23, 3.24, 3.98, 2.56, 3.20, 4.6, 3.8]),
}
df28 = pd.DataFrame(data=d18)
# head(2) keeps only the first two rows.
print("The first two rows of the data frame is:")
print(df28.head(2))

The first two rows of the data frame is:
    Name  Age  Rating
0    Tom   25    4.23
1  James   26    3.24


##### tail() 
- returns the last n rows (observe the index values). The default number of elements to display is five, but you may pass a custom number. 

In [43]:
d19 = {
    'Name': pd.Series(['Tom', 'James', 'Ricky', 'Vin', 'Steve', 'Smith', 'Jack']),
    'Age': pd.Series([25, 26, 25, 23, 30, 29, 23]),
    'Rating': pd.Series([4.23, 3.24, 3.98, 2.56, 3.20, 4.6, 3.8]),
}
df29 = pd.DataFrame(data=d19)
# tail(2) keeps only the last two rows.
print("The last two rows of the data frame is:")
print(df29.tail(2))

The last two rows of the data frame is:
    Name  Age  Rating
5  Smith   29     4.6
6   Jack   23     3.8


##### Sum() 

In [2]:
import pandas as pd
# Twelve people, each with a name, an age and a rating.
d = {
    'Name': pd.Series([
        'Tom', 'James', 'Ricky', 'Vin', 'Steve', 'Smith', 'Jack',
        'Lee', 'David', 'Gasper', 'Betina', 'Andres',
    ]),
    'Age': pd.Series([25, 26, 25, 23, 30, 29, 23, 34, 40, 30, 51, 46]),
    'Rating': pd.Series([4.23, 3.24, 3.98, 2.56, 3.20, 4.6, 3.8, 3.78, 2.98, 4.80, 4.10, 3.65]),
}
df = pd.DataFrame(data=d)
# sum() reduces every column; for the text column the names are joined together.
print(df.sum())

Name      TomJamesRickyVinSteveSmithJackLeeDavidGasperBe...
Age                                                     382
Rating                                                44.92
dtype: object


## Iterating Data Frame
- Iterating a DataFrame gives column names

In [25]:
# Twelve people, each with a name, an age and a rating.
d = {
    'Name': pd.Series([
        'Tom', 'James', 'Ricky', 'Vin', 'Steve', 'Smith', 'Jack',
        'Lee', 'David', 'Gasper', 'Betina', 'Andres',
    ]),
    'Age': pd.Series([25, 26, 25, 23, 30, 29, 23, 34, 40, 30, 51, 46]),
    'Rating': pd.Series([4.23, 3.24, 3.98, 2.56, 3.20, 4.6, 3.8, 3.78, 2.98, 4.80, 4.10, 3.65]),
}
df = pd.DataFrame(data=d)
# Looping over the frame itself yields the column labels, not the rows.
for col in df:
    print(col)

Name
Age
Rating


### To iterate over the contents of the DataFrame, we can use the following functions 
- items() (named iteritems() before pandas 2.0, where the old name was removed) − to iterate over the columns as (column name, Series) pairs
- iterrows() − iterate over the rows as (index,series) pairs
- itertuples() − iterate over the rows as namedtuples

In [26]:
import pandas as pd
import numpy as np 
df = pd.DataFrame(np.random.randn(4, 3), columns=['col1', 'col2', 'col3'])
# DataFrame.iteritems() was removed in pandas 2.0; items() is the same
# iterator under its current name. Each step yields a column label and that
# column's data as a Series.
for key, value in df.items():
    print(key, value)

col1 0    1.586541
1   -1.194401
2    0.253239
3    0.994181
Name: col1, dtype: float64
col2 0    0.209431
1    0.116332
2    0.508826
3   -0.063417
Name: col2, dtype: float64
col3 0    0.386132
1    0.593167
2   -2.004189
3   -0.478599
Name: col3, dtype: float64


# Python Pandas Sorting
- There are two kinds of sorting available in Pandas. 
  - By label
  - By Actual Value

In [24]:
# Random values with deliberately shuffled row labels and reversed column
# names, so there is something to sort.
unsorted_df = pd.DataFrame(
    data=np.random.randn(10, 2),
    index=[1, 4, 6, 2, 3, 5, 9, 8, 0, 7],
    columns=['col2', 'col1'],
)
print(unsorted_df)

       col2      col1
1 -0.736000  0.855097
4  0.983161  1.618269
6  0.047322 -0.477611
2  0.236567  0.089014
3  0.407390  0.041386
5 -0.883855  1.118687
9 -0.545148 -1.291019
8 -0.531060  1.352921
0  1.930911  0.233044
7  0.396162  0.129276


##### By Label
- Using the sort_index() method, by passing the axis arguments and the order of sorting, DataFrame can be sorted. By default, sorting is done on row labels in ascending order

In [23]:
unsorted_df = pd.DataFrame(
    data=np.random.randn(10, 2),
    index=[1, 4, 6, 2, 3, 5, 9, 8, 0, 7],
    columns=['col2', 'col1'],
)
# With no arguments sort_index orders the rows by label, ascending.
sorted_df = unsorted_df.sort_index()
print(sorted_df)

       col2      col1
0  0.444448 -0.985576
1 -0.020081  1.531466
2  0.462751 -1.948864
3 -0.331574  0.977221
4 -0.154143  0.196000
5 -1.176624 -0.606337
6  1.765999  0.519817
7 -0.488500  0.210895
8  2.051403 -1.829335
9  0.210651 -1.159827


##### By Label-order of sorting-axis=1
- By passing the axis argument with a value 0 or 1, the sorting can be done on the column labels. By default, axis=0, sort by row

In [22]:
unsorted_df = pd.DataFrame(
    data=np.random.randn(10, 2),
    index=[1, 4, 6, 2, 3, 5, 9, 8, 0, 7],
    columns=['col2', 'col1'],
)
# Sorting along the column axis reorders the column labels and leaves the
# row order as it was.
sorted_df = unsorted_df.sort_index(axis='columns')
print(sorted_df)

       col1      col2
1 -0.958587  1.835991
4 -0.757220 -1.680717
6 -1.053352  0.217282
2 -1.010567 -0.576121
3 -1.006260 -1.047206
5 -0.314375 -1.999209
9 -0.389242 -0.555087
8 -0.465241  0.985339
0  0.533712 -0.175930
7  1.890017  1.463072


##### By Value
- sort_values() is the method for sorting by values. 
- It accepts a 'by' argument which will use the column name of the DataFrame with which the values are to be sorted

In [21]:
unsorted_df = pd.DataFrame(
    data={
        'col1': [2, 1, 1, 1],
        'col2': [1, 3, 2, 4],
    }
)
# Order the rows by the values held in col1.
sorted_df = unsorted_df.sort_values(by='col1')
print(sorted_df)

   col1  col2
1     1     3
2     1     2
3     1     4
0     2     1


# Python Pandas Ranking
- Ranking Rows of Pandas DataFrame
- To rank the rows of pandas DataFrame we can use the DataFrame.rank()

In [16]:
# Define the dictionary for converting to a DataFrame
movies = {
    'Name': ['The Godfather', 'Bird Box', 'Fight Club'],
    'Year': ['1972', '2018', '1999'],
    'Rating': ['9.2', '6.8', '8.8'],
}
df = pd.DataFrame(movies)
# Create a column Rating_Rank which contains the rank of each movie based on
# rating. The ratings are stored as strings, so rank their numeric values:
# ranking the strings directly compares them character by character (a rating
# of '10.0' would rank below '9.2').
df['Rating_Rank'] = pd.to_numeric(df['Rating']).rank(ascending=True)
print(df)

            Name  Year Rating  Rating_Rank
0  The Godfather  1972    9.2          3.0
1       Bird Box  2018    6.8          1.0
2     Fight Club  1999    8.8          2.0


In [17]:
movies = {
    'Name': ['The Godfather', 'Bird Box', 'Fight Club'],
    'Year': ['1972', '2018', '1999'],
    'Rating': ['9.2', '6.8', '8.8'],
}
df = pd.DataFrame(movies)
print(df)
# Create a column Rating_Rank which contains the rank of each movie based on
# rating. The ratings are strings, so convert them to numbers first; ranking
# the raw strings would order them character by character.
df['Rating_Rank'] = pd.to_numeric(df['Rating']).rank(ascending=True)
# Set the index to the newly created column, Rating_Rank
df = df.set_index('Rating_Rank')
print(df)

            Name  Year Rating
0  The Godfather  1972    9.2
1       Bird Box  2018    6.8
2     Fight Club  1999    8.8
                      Name  Year Rating
Rating_Rank                            
3.0          The Godfather  1972    9.2
1.0               Bird Box  2018    6.8
2.0             Fight Club  1999    8.8


In [19]:
# Define the dictionary for converting to a DataFrame
movies = {
    'Name': ['The Godfather', 'Bird Box', 'Fight Club'],
    'Year': ['1972', '2018', '1999'],
    'Rating': ['9.2', '6.8', '8.8'],
}
df = pd.DataFrame(movies)
print(df)
# Create a column Rating_Rank which contains the rank of each movie based on
# rating. The ratings are strings, so convert them to numbers first; ranking
# the raw strings would order them character by character.
df['Rating_Rank'] = pd.to_numeric(df['Rating']).rank(ascending=True)
print(df)
# Set the index to the newly created column, Rating_Rank
df = df.set_index('Rating_Rank')
# Sort the DataFrame based on the index, i.e. from lowest to highest rank
df = df.sort_index()
print(df)

            Name  Year Rating
0  The Godfather  1972    9.2
1       Bird Box  2018    6.8
2     Fight Club  1999    8.8
            Name  Year Rating  Rating_Rank
0  The Godfather  1972    9.2          3.0
1       Bird Box  2018    6.8          1.0
2     Fight Club  1999    8.8          2.0
                      Name  Year Rating
Rating_Rank                            
1.0               Bird Box  2018    6.8
2.0             Fight Club  1999    8.8
3.0          The Godfather  1972    9.2


##### Exercise: Define the following data frame. Index the data frame according to the rank of marks i-e 100 has lowest rank and 40 has the maximum rank. And finally sort the data frame.

In [20]:
# Student details: four students, each with marks in the same three subjects.
student_details = {
    'Name': [student for student in ('Raj', 'Aravind', 'John', 'Arjun') for _ in range(3)],
    'Subject': ['Maths', 'Physics', 'Chemistry'] * 4,
    'Marks': [80, 90, 75, 60, 40, 60, 80, 55, 100, 90, 75, 70],
}
# Turn the dictionary into a DataFrame
df = pd.DataFrame(student_details)
print(df)
# Rank the marks in descending order (highest mark gets rank 1; equal marks
# share the average of their ranks) and store the result as a new column
df['Mark_Rank'] = df['Marks'].rank(ascending=False)
# Use the rank column as the row index
df = df.set_index('Mark_Rank')
print(df)
# Reorder the rows by that index, best rank first
df = df.sort_index()
print(df)

       Name    Subject  Marks
0       Raj      Maths     80
1       Raj    Physics     90
2       Raj  Chemistry     75
3   Aravind      Maths     60
4   Aravind    Physics     40
5   Aravind  Chemistry     60
6      John      Maths     80
7      John    Physics     55
8      John  Chemistry    100
9     Arjun      Maths     90
10    Arjun    Physics     75
11    Arjun  Chemistry     70
              Name    Subject  Marks
Mark_Rank                           
4.5            Raj      Maths     80
2.5            Raj    Physics     90
6.5            Raj  Chemistry     75
9.5        Aravind      Maths     60
12.0       Aravind    Physics     40
9.5        Aravind  Chemistry     60
4.5           John      Maths     80
11.0          John    Physics     55
1.0           John  Chemistry    100
2.5          Arjun      Maths     90
6.5          Arjun    Physics     75
8.0          Arjun  Chemistry     70
              Name    Subject  Marks
Mark_Rank                           
1.0           John

## Pandas Missing Values
- Missing values are represented as NA or NaN (Not a Number)
- Exercise: Creating dataframe with missing value

In [14]:
# Start with five labelled rows of random numbers ...
df = pd.DataFrame(
    data=np.random.randn(5, 3),
    index=list('acefh'),
    columns=['one', 'two', 'three'],
)
# ... then reindex onto a..h: the labels b, d and g are new, so their rows
# are filled with NaN.
df = df.reindex(list('abcdefgh'))
print(df)

        one       two     three
a  0.678094  0.951852  0.096427
b       NaN       NaN       NaN
c -1.161153  1.413574  0.324438
d       NaN       NaN       NaN
e  0.893882 -0.835790 -0.671464
f  0.548645  0.066722  0.491822
g       NaN       NaN       NaN
h -0.526833 -0.375976 -0.423834


## Checking for the Missing values 
- To make detecting missing values easier (and across different array dtypes), Pandas provides the isnull() and notnull() functions, which are also methods on Series and DataFrame objects 

In [13]:
# Exercise: flag the missing entries of one column.
df = pd.DataFrame(
    data=np.random.randn(5, 3),
    index=list('acefh'),
    columns=['one', 'two', 'three'],
)
df = df.reindex(list('abcdefgh'))
# isnull() gives True exactly where the value is missing.
print(df['one'].isnull())

a    False
b     True
c    False
d     True
e    False
f    False
g     True
h    False
Name: one, dtype: bool


## Calculation with Missing values
- When summing data, NA will be treated as Zero
- If the data are all NA, then the result will be 0 as well (since pandas 0.22); pass `min_count=1` to `sum()` to get NaN instead

In [12]:
df = pd.DataFrame(
    data=np.random.randn(5, 3),
    index=list('acefh'),
    columns=['one', 'two', 'three'],
)
df = df.reindex(list('abcdefgh'))
print(df)
# The NaN rows introduced by the reindex are skipped when summing.
print("The sum is")
print(df['one'].sum())

        one       two     three
a  2.976245  1.129418 -0.432520
b       NaN       NaN       NaN
c -0.555140 -0.038899 -0.247172
d       NaN       NaN       NaN
e -0.105131  2.043697  0.257180
f -0.667089 -0.478844 -1.647694
g       NaN       NaN       NaN
h -1.144641  1.404455  0.301047
The sum is
0.5042441382230931


# Cleaning / Filling Missing Data
- The fillna function can “fill in” NA values with non-null data in a couple of ways

In [10]:
# Example: substitute a scalar for every NaN.
df = pd.DataFrame(
    data=np.random.randn(3, 3),
    index=list('ace'),
    columns=['one', 'two', 'three'],
)
# Reindexing onto a, b, c drops row 'e' and introduces an all-NaN row 'b'.
df = df.reindex(list('abc'))
print(df)
print("NaN replaced with '0':")
print(df.fillna(value=0))

        one       two     three
a -0.009463  0.436881 -0.469927
b       NaN       NaN       NaN
c  0.287046 -1.038566 -2.252776
NaN replaced with '0':
        one       two     three
a -0.009463  0.436881 -0.469927
b  0.000000  0.000000  0.000000
c  0.287046 -1.038566 -2.252776


##### Fill NA Forward and Backward pad/fill
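- Forward fill: `ffill()` (older pandas: `fillna(method='pad')` or `fillna(method='ffill')`); backward fill: `bfill()` (older pandas: `fillna(method='backfill')` or `fillna(method='bfill')`)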

In [11]:
# Example: fill forward
df = pd.DataFrame(np.random.randn(5, 3), index=['a', 'c', 'e', 'f', 'h'], columns=['one', 'two', 'three'])
df = df.reindex(['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h'])
# ffill() copies the last valid value downwards into each NaN. It replaces
# fillna(method='pad'), whose `method` argument is deprecated in recent pandas.
df_ffilled = df.ffill()
print(df_ffilled)

        one       two     three
a  0.062043  1.899485  1.284389
b  0.062043  1.899485  1.284389
c  1.326770  1.162287 -0.917297
d  1.326770  1.162287 -0.917297
e  0.685369  0.628844 -1.761440
f -0.098265 -1.280880 -0.169476
g -0.098265 -1.280880 -0.169476
h -0.648039  0.317792 -0.271578


# Search how to fill Backward!
##### dataframe.dropna()

In [6]:
import numpy as np
df = pd.DataFrame(np.random.randn(5, 3), index=['a', 'c', 'e', 'f', 'h'], columns=['one', 'two', 'three'])
df = df.reindex(['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h'])
# bfill() copies the next valid value upwards into each NaN. It replaces
# fillna(method='backfill'), whose `method` argument is deprecated in recent
# pandas.
df_bfilled = df.bfill()
print(df_bfilled)

        one       two     three
a  0.312007 -0.190491  2.097528
b  0.546525 -0.435939 -2.090485
c  0.546525 -0.435939 -2.090485
d  0.924040 -1.542240  1.481296
e  0.924040 -1.542240  1.481296
f -0.995932  0.372001 -0.079996
g  0.376159 -1.091507  1.001129
h  0.376159 -1.091507  1.001129


In [7]:
df = pd.DataFrame(
    data=np.random.randn(5, 3),
    index=list('acefh'),
    columns=['one', 'two', 'three'],
)
df = df.reindex(list('abcdefgh'))
print(df)
# Dropping along the row axis removes every row that contains a NaN.
print(df.dropna(axis='index'))

        one       two     three
a -1.350882  0.222241  0.266239
b       NaN       NaN       NaN
c  0.077234  1.625944 -0.399533
d       NaN       NaN       NaN
e  0.915667  0.461914 -1.403072
f -0.405432  1.233473  0.941461
g       NaN       NaN       NaN
h -1.405862 -0.590075 -0.372723
        one       two     three
a -1.350882  0.222241  0.266239
c  0.077234  1.625944 -0.399533
e  0.915667  0.461914 -1.403072
f -0.405432  1.233473  0.941461
h -1.405862 -0.590075 -0.372723


In [8]:
df = pd.DataFrame(
    data=np.random.randn(5, 3),
    index=list('acefh'),
    columns=['one', 'two', 'three'],
)
df = df.reindex(list('abcdefgh'))
print(df)
# Dropping along the column axis removes every column that contains a NaN;
# here all three columns do, so no columns remain.
print(df.dropna(axis='columns'))

        one       two     three
a -0.096074  0.880745 -0.902994
b       NaN       NaN       NaN
c -0.071152  0.489423 -0.095122
d       NaN       NaN       NaN
e -0.627478 -0.317749 -0.375654
f  0.791859  0.288963 -0.567589
g       NaN       NaN       NaN
h -0.459622 -0.956242 -0.057263
Empty DataFrame
Columns: []
Index: [a, b, c, d, e, f, g, h]


##### dataframe.replace()
- Replace a generic value with some specific value dataframe.replace()

In [3]:
import numpy as np
df = pd.DataFrame(
    data={
        'one': [10, 20, 30, 40, 50, 2000],
        'two': [1000, 0, 30, 40, 50, 60],
    }
)
# Each key of the mapping is a value to look for anywhere in the frame and
# each mapped value is its replacement; the result is a new frame.
print(df.replace(to_replace={1000: 10, 2000: 60}))

   one  two
0   10   10
1   20    0
2   30   30
3   40   40
4   50   50
5   60   60
