In [123]:
# A single record: one scalar value per field.
person = dict(
    first="Nathan",
    last="Schafer",
    email="madmax@abc.com",
)


In [124]:
# Column-oriented layout: every key maps to a list holding one value per row.
# NOTE: this one-row version is reassigned by the three-row `people` in the next cell.
people = {
    "first": ["Corey"],
    "last": ["Schafer"],
    "email": ["nathanbabu@baooo.com"]
}

In [125]:
# Three people stored column-wise: position i of every list belongs to row i.
people = dict(
    first=["Nathan", "Babo", "Simon"],
    last=["Schafer", "Namboori", "postman"],
    email=["nathan@baboo.com", "baboo@postman.com", "namboori@yes.com"],
)

In [126]:
import pandas as pd

In [127]:

people_df = pd.DataFrame(people)

In [128]:
people_df

Unnamed: 0,first,last,email
0,Nathan,Schafer,nathan@baboo.com
1,Babo,Namboori,baboo@postman.com
2,Simon,postman,namboori@yes.com


In [129]:
people_df['email']

0     nathan@baboo.com
1    baboo@postman.com
2     namboori@yes.com
Name: email, dtype: object

In [130]:
email = people_df['email']

In [131]:
type(email)

pandas.core.series.Series

A Series is similar to a list of data: a one-dimensional array holding the rows of a single column.  
A DataFrame is a container for multiple Series objects.  
We can use either square brackets (`people_df['email']`) or dot notation (`people_df.email`) to select an individual column.


In [132]:
people_df[['last', 'email']]

Unnamed: 0,last,email
0,Schafer,nathan@baboo.com
1,Namboori,baboo@postman.com
2,postman,namboori@yes.com


In [133]:
type(people_df[['last', 'email']])

pandas.core.frame.DataFrame

In [134]:
# to get a list of all the columns
people_df.columns

Index(['first', 'last', 'email'], dtype='object')

In [135]:
# to get a particular row, we use the loc and iloc indexers
# iloc allows us to access rows by integer location
people_df.iloc[0]

first              Nathan
last              Schafer
email    nathan@baboo.com
Name: 0, dtype: object

In [136]:
people_df.iloc[0]["first"] =="Nathan"

True

In [137]:
# multiple rows
people_df.iloc[[0, 1]]

Unnamed: 0,first,last,email
0,Nathan,Schafer,nathan@baboo.com
1,Babo,Namboori,baboo@postman.com


In [138]:
# to also pick specific columns, pass a second argument; iloc takes the
# column's integer position (position 2 is the 'email' column here)
people_df.iloc[[0, 1], 2]

0     nathan@baboo.com
1    baboo@postman.com
Name: email, dtype: object

In [139]:
# when we use loc, we are  searching by the label
people_df.loc[0]

first              Nathan
last              Schafer
email    nathan@baboo.com
Name: 0, dtype: object

In [140]:
#multiple rows
people_df.loc[[0, 1]]

Unnamed: 0,first,last,email
0,Nathan,Schafer,nathan@baboo.com
1,Babo,Namboori,baboo@postman.com


In [141]:
# specified column using loc
people_df.loc[[0, 1], 'email']

0     nathan@baboo.com
1    baboo@postman.com
Name: email, dtype: object

In [142]:
# pass list of columns with loc
people_df.loc[[0, 1], ['email', 'last']]

Unnamed: 0,email,last
0,nathan@baboo.com,Schafer
1,baboo@postman.com,Namboori


In [143]:
people_df


Unnamed: 0,first,last,email
0,Nathan,Schafer,nathan@baboo.com
1,Babo,Namboori,baboo@postman.com
2,Simon,postman,namboori@yes.com


In [144]:
# the unnamed column on the left is the index; by default it is an integer range starting at 0
# index values are usually unique
# inplace=True modifies people_df itself: 'email' becomes the index and replaces the default integer index
people_df.set_index('email', inplace = True)

In [145]:
people_df


Unnamed: 0_level_0,first,last
email,Unnamed: 1_level_1,Unnamed: 2_level_1
nathan@baboo.com,Nathan,Schafer
baboo@postman.com,Babo,Namboori
namboori@yes.com,Simon,postman


In [146]:
people_df.index

Index(['nathan@baboo.com', 'baboo@postman.com', 'namboori@yes.com'], dtype='object', name='email')

In [147]:
# now that email is the index, we can pass an email address to df.loc as the row label
people_df.loc['nathan@baboo.com']


first     Nathan
last     Schafer
Name: nathan@baboo.com, dtype: object

In [148]:
# to get value of a particular column
people_df.loc['nathan@baboo.com', 'first']

'Nathan'

In [149]:
# to get the integer location as given below, we can still use the iloc method
people_df.iloc[0]

first     Nathan
last     Schafer
Name: nathan@baboo.com, dtype: object

In [150]:
# to reset the index
people_df.reset_index(inplace = True)
people_df

Unnamed: 0,email,first,last
0,nathan@baboo.com,Nathan,Schafer
1,baboo@postman.com,Babo,Namboori
2,namboori@yes.com,Simon,postman


In [151]:
people_df.columns


Index(['email', 'first', 'last'], dtype='object')

In [152]:
people_df['last'] == 'postman'
# The comparison above produces a filter mask: a boolean Series that is True for the rows meeting the criterion
# (it is not displayed here because it is not the last expression in the cell)
# assign the mask to a variable so it can be reused
filt = (people_df['last'] == 'postman')


In [153]:
# apply filter to dataframe
people_df[filt]

Unnamed: 0,email,first,last
2,namboori@yes.com,Simon,postman


In [154]:
# Another way to filter
people_df.loc[filt]

Unnamed: 0,email,first,last
2,namboori@yes.com,Simon,postman


In [155]:
people_df.loc[filt, ['first','email']]
# first value in loc is the rows that we want and the 2nd value is the columns that we want

Unnamed: 0,first,email
2,Simon,namboori@yes.com


In [156]:
# AND / OR filters: pandas combines masks with & for AND and | for OR
# (each comparison is wrapped in its own parentheses)
filt = (people_df['last'] == 'postman') & (people_df['first'] == 'Simon')

In [157]:
people_df.loc[filt, 'email']

2    namboori@yes.com
Name: email, dtype: object

In [158]:
filt = (people_df['last'] == 'postman') | (people_df['first'] == 'Babo')
people_df.loc[filt, 'email']

1    baboo@postman.com
2     namboori@yes.com
Name: email, dtype: object

In [159]:
# to select the rows that do NOT match the filter, negate the mask with ~
people_df.loc[~filt, 'email'] # ~ flips every True/False in the mask

0    nathan@baboo.com
Name: email, dtype: object

In [160]:
#update data with filters
people_df

Unnamed: 0,email,first,last
0,nathan@baboo.com,Nathan,Schafer
1,baboo@postman.com,Babo,Namboori
2,namboori@yes.com,Simon,postman


In [161]:
people_df.columns = ['email', 'first_name', 'last_name']

In [162]:
people_df

Unnamed: 0,email,first_name,last_name
0,nathan@baboo.com,Nathan,Schafer
1,baboo@postman.com,Babo,Namboori
2,namboori@yes.com,Simon,postman


In [163]:
# upper-case every column label through the Index string accessor
people_df.columns = people_df.columns.str.upper()

In [164]:
people_df

Unnamed: 0,EMAIL,FIRST_NAME,LAST_NAME
0,nathan@baboo.com,Nathan,Schafer
1,baboo@postman.com,Babo,Namboori
2,namboori@yes.com,Simon,postman


In [165]:
# normalise the column labels in one chain:
# spaces become underscores, then everything is lower-cased
people_df.columns = people_df.columns.str.replace(' ', '_').str.lower()
people_df

Unnamed: 0,email,first_name,last_name
0,nathan@baboo.com,Nathan,Schafer
1,baboo@postman.com,Babo,Namboori
2,namboori@yes.com,Simon,postman


In [166]:
# if we want to change only some columns
people_df.rename(columns = {'first_name': 'first', 'last_name': 'last'}, inplace = True)
people_df

Unnamed: 0,email,first,last
0,nathan@baboo.com,Nathan,Schafer
1,baboo@postman.com,Babo,Namboori
2,namboori@yes.com,Simon,postman


In [171]:
#update data values
# use loc for getting the row
people_df.loc[2] = ["JohnSmith@email.com", "John", "Smith"]

In [172]:
people_df


Unnamed: 0,email,first,last
0,nathan@baboo.com,Nathan,Schafer
1,baboo@postman.com,Babo,Namboori
2,JohnSmith@email.com,John,Smith


In [173]:
# change the values of selected columns ('last' and 'email') for a single row
people_df.loc[2, ['last', 'email']] = ['Doe', 'JohnDoe@email.com']

In [174]:
 people_df

Unnamed: 0,email,first,last
0,nathan@baboo.com,Nathan,Schafer
1,baboo@postman.com,Babo,Namboori
2,JohnDoe@email.com,John,Doe


In [176]:
# only a single column value
people_df.loc[2, 'last'] = 'Smith'

In [177]:
# .at addresses one single cell by row label and column label (row 2, column 'last')
people_df.at[2, 'last'] = 'Doe'

In [178]:
people_df

Unnamed: 0,email,first,last
0,nathan@baboo.com,Nathan,Schafer
1,baboo@postman.com,Babo,Namboori
2,JohnDoe@email.com,John,Doe


In [183]:
# change a value without using the indexers
# (tempting when the dataset is large and the row is found with a filter)
filt = (people_df['email'] == 'JohnDoe@email.com')
people_df[filt]['last'] = 'smith' 
# the above does NOT update people_df: people_df[filt] returns a copy, so the
# assignment is lost (pandas warns about "returning a view versus a copy").
# Use a single indexer instead: people_df.loc[filt, 'last'] = 'smith'

In [184]:
people_df

Unnamed: 0,email,first,last
0,nathan@baboo.com,Nathan,Schafer
1,baboo@postman.com,Babo,Namboori
2,JohnDoe@email.com,John,Doe


In [186]:
# update multiple rows
people_df['email'].str.lower()

0     nathan@baboo.com
1    baboo@postman.com
2    johndoe@email.com
Name: email, dtype: object

In [187]:
# to assign the value to the dataframe
people_df['email'] =people_df['email'].str.lower()

In [188]:
people_df

Unnamed: 0,email,first,last
0,nathan@baboo.com,Nathan,Schafer
1,baboo@postman.com,Babo,Namboori
2,johndoe@email.com,John,Doe


In [None]:
# there are four methods to update values - apply, map, applymap, replace


In [None]:
# apply is used for calling a function on our values
# apply can work on either a DataFrame or a Series object


In [190]:
#apply on series
people_df['email'].apply(len)

0    16
1    17
2    17
Name: email, dtype: int64

In [191]:
def update_email(email):
    """Return ``email`` converted to upper case."""
    uppercased = email.upper()
    return uppercased

In [192]:
people_df['email'].apply(update_email)

0     NATHAN@BABOO.COM
1    BABOO@POSTMAN.COM
2    JOHNDOE@EMAIL.COM
Name: email, dtype: object

In [193]:
people_df['email'] = people_df['email'].apply(update_email)

In [194]:
people_df

Unnamed: 0,email,first,last
0,NATHAN@BABOO.COM,Nathan,Schafer
1,BABOO@POSTMAN.COM,Babo,Namboori
2,JOHNDOE@EMAIL.COM,John,Doe


In [195]:
# we can use lambda function to do the same thing
people_df['email'] = people_df['email'].apply(lambda x: x.lower())

In [196]:
people_df

Unnamed: 0,email,first,last
0,nathan@baboo.com,Nathan,Schafer
1,baboo@postman.com,Babo,Namboori
2,johndoe@email.com,John,Doe


In [197]:
# apply on a Series (again), for comparison with apply on the whole DataFrame in the next cell
people_df['email'].apply(len)

0    16
1    17
2    17
Name: email, dtype: int64

In [198]:
people_df.apply(len)
# this is applying the length function to each series in the dataframe
#specifically the columns

email    3
first    3
last     3
dtype: int64

In [200]:
len(people_df['email'])

3

In [201]:
# to apply the function row-wise: with axis='columns' each row is passed to len,
# so the result is the number of columns in every row
people_df.apply(len, axis = 'columns')

0    3
1    3
2    3
dtype: int64

In [202]:
# when apply is used on an entire DataFrame, the function receives one Series at a time,
# so we pass functions that work on a Series object
# here: the minimum value of each column
people_df.apply(pd.Series.min)

email    baboo@postman.com
first                 Babo
last                   Doe
dtype: object

In [205]:
people_df.apply(lambda x: x.min())

email    baboo@postman.com
first                 Babo
last                   Doe
dtype: object

In [206]:
# other Series methods (for example numerical ones such as a square root) can be passed to apply the same way
# to apply a function to every individual value in the DataFrame instead,
# we use applymap
# applymap works only on DataFrames
people_df.applymap(len)

Unnamed: 0,email,first,last
0,16,6,7
1,17,4,8
2,17,4,3


In [207]:
people_df

Unnamed: 0,email,first,last
0,nathan@baboo.com,Nathan,Schafer
1,baboo@postman.com,Babo,Namboori
2,johndoe@email.com,John,Doe


In [209]:
# the map method only works on a Series: each value is substituted using the dictionary,
# and values without an entry (here 'Nathan') become NaN
people_df['first'].map({'Babo': 'Chris', 'John': 'Hoe'})

0      NaN
1    Chris
2      Hoe
Name: first, dtype: object

In [210]:
# replace does not turn unmatched values into NaN; it leaves them as they are
people_df['first'].replace({'Babo': 'Chris', 'John': 'Hoe'})

0    Nathan
1     Chris
2       Hoe
Name: first, dtype: object

In [211]:
# ADD/REMOVE rows and columns from DataFrames
people_df

Unnamed: 0,email,first,last
0,nathan@baboo.com,Nathan,Schafer
1,baboo@postman.com,Babo,Namboori
2,johndoe@email.com,John,Doe


In [213]:
people_df['first'] + ' ' + people_df['last']

0    Nathan Schafer
1     Babo Namboori
2          John Doe
dtype: object

In [214]:
people_df['full_name'] = people_df['first'] + ' ' + people_df['last']

In [215]:
people_df

Unnamed: 0,email,first,last,full_name
0,nathan@baboo.com,Nathan,Schafer,Nathan Schafer
1,baboo@postman.com,Babo,Namboori,Babo Namboori
2,johndoe@email.com,John,Doe,John Doe


In [218]:
people_df.drop(columns = ['first', 'last'])
people_df
# people_df is unchanged: drop returns a new DataFrame, and its result is discarded here
# (it is neither assigned to a variable nor called with inplace=True)


Unnamed: 0,email,first,last,full_name
0,nathan@baboo.com,Nathan,Schafer,Nathan Schafer
1,baboo@postman.com,Babo,Namboori,Babo Namboori
2,johndoe@email.com,John,Doe,John Doe


In [220]:
people_df['email'].str.split('@')
# result is the 2 values in  a list

0     [nathan, baboo.com]
1    [baboo, postman.com]
2    [johndoe, email.com]
Name: email, dtype: object

In [221]:
# to assign this to two columns
people_df['email'].str.split('@', expand = True)

Unnamed: 0,0,1
0,nathan,baboo.com
1,baboo,postman.com
2,johndoe,email.com


In [223]:
people_df[['username', 'domain']] = people_df['email'].str.split('@', expand = True)

In [224]:
people_df


Unnamed: 0,email,first,last,full_name,username,domain
0,nathan@baboo.com,Nathan,Schafer,Nathan Schafer,nathan,baboo.com
1,baboo@postman.com,Babo,Namboori,Babo Namboori,baboo,postman.com
2,johndoe@email.com,John,Doe,John Doe,johndoe,email.com


In [225]:
## adding and removing rows
# add a single row to a DataFrame; columns missing from the new row are filled with NaN
# pd.concat is used because DataFrame.append was deprecated in pandas 1.4 and removed in 2.0
# the result is a new DataFrame (people_df itself is not modified)
pd.concat([people_df, pd.DataFrame([{'first': 'Tony'}])], ignore_index=True)

Unnamed: 0,email,first,last,full_name,username,domain
0,nathan@baboo.com,Nathan,Schafer,Nathan Schafer,nathan,baboo.com
1,baboo@postman.com,Babo,Namboori,Babo Namboori,baboo,postman.com
2,johndoe@email.com,John,Doe,John Doe,johndoe,email.com
3,,Tony,,,,


In [226]:
people_df

Unnamed: 0,email,first,last,full_name,username,domain
0,nathan@baboo.com,Nathan,Schafer,Nathan Schafer,nathan,baboo.com
1,baboo@postman.com,Babo,Namboori,Babo Namboori,baboo,postman.com
2,johndoe@email.com,John,Doe,John Doe,johndoe,email.com


In [227]:
people = {
    "first": ["Tony", "Steve"],
    "last": ["Stark", "Rogers"],
    "email": ["tonys@baboo.com", "steve@gmail.com"]
}

In [228]:
df2 = pd.DataFrame(people)

In [230]:
df2

Unnamed: 0,first,last,email
0,Tony,Stark,tonys@baboo.com
1,Steve,Rogers,steve@gmail.com


In [236]:
# stack the rows of df2 below people_df and renumber the index;
# columns that df2 does not have are filled with NaN for the new rows
# (pd.concat is used because DataFrame.append was removed in pandas 2.0)
people_df = pd.concat([people_df, df2], ignore_index=True)

In [237]:
people_df

Unnamed: 0,email,first,last,full_name,username,domain
0,nathan@baboo.com,Nathan,Schafer,Nathan Schafer,nathan,baboo.com
1,baboo@postman.com,Babo,Namboori,Babo Namboori,baboo,postman.com
2,johndoe@email.com,John,Doe,John Doe,johndoe,email.com
3,tonys@baboo.com,Tony,Stark,,,
4,steve@gmail.com,Steve,Rogers,,,


In [238]:
people_df.drop(index=4)

Unnamed: 0,email,first,last,full_name,username,domain
0,nathan@baboo.com,Nathan,Schafer,Nathan Schafer,nathan,baboo.com
1,baboo@postman.com,Babo,Namboori,Babo Namboori,baboo,postman.com
2,johndoe@email.com,John,Doe,John Doe,johndoe,email.com
3,tonys@baboo.com,Tony,Stark,,,


In [239]:
people_df

Unnamed: 0,email,first,last,full_name,username,domain
0,nathan@baboo.com,Nathan,Schafer,Nathan Schafer,nathan,baboo.com
1,baboo@postman.com,Babo,Namboori,Babo Namboori,baboo,postman.com
2,johndoe@email.com,John,Doe,John Doe,johndoe,email.com
3,tonys@baboo.com,Tony,Stark,,,
4,steve@gmail.com,Steve,Rogers,,,


In [243]:
# drop every row whose last name is 'Doe': select the matching rows with the mask,
# then hand their index labels to drop (returns a new DataFrame)
filt = (people_df['last'] == 'Doe')
people_df.drop(index=people_df.loc[filt].index)


Unnamed: 0,email,first,last,full_name,username,domain
0,nathan@baboo.com,Nathan,Schafer,Nathan Schafer,nathan,baboo.com
1,baboo@postman.com,Babo,Namboori,Babo Namboori,baboo,postman.com
3,tonys@baboo.com,Tony,Stark,,,
4,steve@gmail.com,Steve,Rogers,,,


In [244]:
# sorting data
# sort by last name
people_df.sort_values(by='last')

Unnamed: 0,email,first,last,full_name,username,domain
2,johndoe@email.com,John,Doe,John Doe,johndoe,email.com
1,baboo@postman.com,Babo,Namboori,Babo Namboori,baboo,postman.com
4,steve@gmail.com,Steve,Rogers,,,
0,nathan@baboo.com,Nathan,Schafer,Nathan Schafer,nathan,baboo.com
3,tonys@baboo.com,Tony,Stark,,,


In [245]:
people_df.sort_values(by='last', ascending = False) 

Unnamed: 0,email,first,last,full_name,username,domain
3,tonys@baboo.com,Tony,Stark,,,
0,nathan@baboo.com,Nathan,Schafer,Nathan Schafer,nathan,baboo.com
4,steve@gmail.com,Steve,Rogers,,,
1,baboo@postman.com,Babo,Namboori,Babo Namboori,baboo,postman.com
2,johndoe@email.com,John,Doe,John Doe,johndoe,email.com


In [246]:
# sort multiple columns
people_df.sort_values(by = ['last', 'first'], ascending = False)

Unnamed: 0,email,first,last,full_name,username,domain
3,tonys@baboo.com,Tony,Stark,,,
0,nathan@baboo.com,Nathan,Schafer,Nathan Schafer,nathan,baboo.com
4,steve@gmail.com,Steve,Rogers,,,
1,baboo@postman.com,Babo,Namboori,Babo Namboori,baboo,postman.com
2,johndoe@email.com,John,Doe,John Doe,johndoe,email.com


In [247]:
# sort by last name descending, first name ascending
people_df.sort_values(by = ['last', 'first'], ascending = [False, True])
# add inplace = true if you want to keep the changes

Unnamed: 0,email,first,last,full_name,username,domain
3,tonys@baboo.com,Tony,Stark,,,
0,nathan@baboo.com,Nathan,Schafer,Nathan Schafer,nathan,baboo.com
4,steve@gmail.com,Steve,Rogers,,,
1,baboo@postman.com,Babo,Namboori,Babo Namboori,baboo,postman.com
2,johndoe@email.com,John,Doe,John Doe,johndoe,email.com


In [249]:
# SORT BY INDEX
people_df.sort_index()

Unnamed: 0,email,first,last,full_name,username,domain
0,nathan@baboo.com,Nathan,Schafer,Nathan Schafer,nathan,baboo.com
1,baboo@postman.com,Babo,Namboori,Babo Namboori,baboo,postman.com
2,johndoe@email.com,John,Doe,John Doe,johndoe,email.com
3,tonys@baboo.com,Tony,Stark,,,
4,steve@gmail.com,Steve,Rogers,,,


In [251]:
# sort a column
people_df['last'].sort_values()

2         Doe
1    Namboori
4      Rogers
0     Schafer
3       Stark
Name: last, dtype: object

In [253]:
import numpy as np
people = {
    'first': ['Corey', 'Jane', 'John', 'Chris', np.nan, None, 'NA'], 
    'last': ['Schafer', 'Doe', 'Doe', 'Schafer', np.nan, np.nan, 'Missing'], 
    'email': ['CoreyMSchafer@gmail.com', 'JaneDoe@email.com', 'JohnDoe@email.com', None, np.nan, 'Anonymous@email.com', 'NA'],
    'age': ['33', '55', '63', '36', None, None, 'Missing']
}

In [255]:
np.nan == np.nan

False

In [257]:
df = pd.DataFrame(people)
df

Unnamed: 0,first,last,email,age
0,Corey,Schafer,CoreyMSchafer@gmail.com,33
1,Jane,Doe,JaneDoe@email.com,55
2,John,Doe,JohnDoe@email.com,63
3,Chris,Schafer,,36
4,,,,
5,,,Anonymous@email.com,
6,,Missing,,Missing


In [258]:
df.dropna()

Unnamed: 0,first,last,email,age
0,Corey,Schafer,CoreyMSchafer@gmail.com,33
1,Jane,Doe,JaneDoe@email.com,55
2,John,Doe,JohnDoe@email.com,63
6,,Missing,,Missing


In [259]:
df.dropna(axis='index', how='all')  #all row values have to be missing  for a row to be removed

Unnamed: 0,first,last,email,age
0,Corey,Schafer,CoreyMSchafer@gmail.com,33
1,Jane,Doe,JaneDoe@email.com,55
2,John,Doe,JohnDoe@email.com,63
3,Chris,Schafer,,36
5,,,Anonymous@email.com,
6,,Missing,,Missing


In [262]:
df.dropna(axis='columns', how='all')

Unnamed: 0,first,last,email,age
0,Corey,Schafer,CoreyMSchafer@gmail.com,33
1,Jane,Doe,JaneDoe@email.com,55
2,John,Doe,JohnDoe@email.com,63
3,Chris,Schafer,,36
4,,,,
5,,,Anonymous@email.com,
6,,Missing,,Missing


In [265]:
df.dropna(axis='index', how='all', subset=['last','email'])

Unnamed: 0,first,last,email,age
0,Corey,Schafer,CoreyMSchafer@gmail.com,33
1,Jane,Doe,JaneDoe@email.com,55
2,John,Doe,JohnDoe@email.com,63
3,Chris,Schafer,,36
5,,,Anonymous@email.com,
6,,Missing,,Missing


In [268]:
# treat the placeholder strings 'NA' and 'Missing' as real missing values
df.replace(['NA', 'Missing'], np.nan, inplace=True)

In [269]:
df

Unnamed: 0,first,last,email,age
0,Corey,Schafer,CoreyMSchafer@gmail.com,33.0
1,Jane,Doe,JaneDoe@email.com,55.0
2,John,Doe,JohnDoe@email.com,63.0
3,Chris,Schafer,,36.0
4,,,,
5,,,Anonymous@email.com,
6,,,,


In [270]:
df.isna()

Unnamed: 0,first,last,email,age
0,False,False,False,False
1,False,False,False,False
2,False,False,False,False
3,False,False,True,False
4,True,True,True,True
5,True,True,False,True
6,True,True,True,True


In [271]:
df.fillna('MISSING')

Unnamed: 0,first,last,email,age
0,Corey,Schafer,CoreyMSchafer@gmail.com,33
1,Jane,Doe,JaneDoe@email.com,55
2,John,Doe,JohnDoe@email.com,63
3,Chris,Schafer,MISSING,36
4,MISSING,MISSING,MISSING,MISSING
5,MISSING,MISSING,Anonymous@email.com,MISSING
6,MISSING,MISSING,MISSING,MISSING


In [273]:
df['last'].fillna(0)

0    Schafer
1        Doe
2        Doe
3    Schafer
4          0
5          0
6          0
Name: last, dtype: object

In [274]:
df.dtypes

first    object
last     object
email    object
age      object
dtype: object

In [277]:
df['age']= df['age'].astype(float)
# float (not int) is used because the column contains NaN, and a NaN value is actually a float under the hood


In [278]:
df.dtypes

first     object
last      object
email     object
age      float64
dtype: object

In [279]:
df['age'].mean()

46.75

In [None]:
# a quoted string literal cannot span two lines; the line break is written as the \n escape
print("hello\n")