In [23]:
#make sure to run this block before start coding
import pandas as pd
import numpy as np
from numpy import nan as NA  #it tells weither the data is null or not
#NA can be replace by any word, here NA is decleared by numpy's constant nan(not a number)
import re    #re is the library by which we can use regular expression

# Handling Missing Data

In [16]:
#if there is no data in any column for some reason, we use data handling methods
#we will see any sort of indicator(for missing data) in our data
#for numeric data, pandas uses the floating-point value NaN(not a number) to represent data
#we can use display & print both in python to print data or to take values as output

data = pd.DataFrame([[1.1, 2.3, 3.5], [4.2, NA, NA], [5.6, 6.1, NA], [NA, NA, 8.8]],
                   columns=list('ABC'))
display(data)

data_one = pd.DataFrame([[1.1, 2.3, 3.5], [4.2, NA, NA], [5.6, 6.1, NA], [NA, NA, 8.8], [NA, NA, NA]],
                   columns=list('ABC'))
display(data_one)

data_two = pd.DataFrame([[1.1, NA, 3.5], [4.2, NA, NA], [5.6, NA, NA], [NA, NA, 8.8]],
                   columns=list('ABC'))
display(data_two)

Unnamed: 0,A,B,C
0,1.1,2.3,3.5
1,4.2,,
2,5.6,6.1,
3,,,8.8


Unnamed: 0,A,B,C
0,1.1,2.3,3.5
1,4.2,,
2,5.6,6.1,
3,,,8.8
4,,,


Unnamed: 0,A,B,C
0,1.1,,3.5
1,4.2,,
2,5.6,,
3,,,8.8


In [9]:
#dropna method will remove all NaN(NA) values from your dataset
#how='any': checked if any value is null in a particular row, if yes than eliminate the whole row
#how='all': checked if all value is null in a particular row, if yes than eliminate the whole row
cleaned_data = data.dropna(how='any')
display(cleaned_data)

Unnamed: 0,A,B,C
0,1.1,2.3,3.5


In [12]:
#how='all': checked if all value is null in a particular row/column, if yes than eliminate the whole row/column
#last row isn't print because it has all values NaN
cleaned_data_one = data_one.dropna(how='all')
display(cleaned_data_one)

Unnamed: 0,A,B,C
0,1.1,2.3,3.5
1,4.2,,
2,5.6,6.1,
3,,,8.8


In [17]:
#if we wanna delete NaN values row wise, than
cleaned_data_two = data_two.dropna(axis=1, how='all')
display(cleaned_data_two)

Unnamed: 0,A,C
0,1.1,3.5
1,4.2,
2,5.6,
3,,8.8


# Filling Missing Data

In [3]:
#iloc is used to append data on any array type variable(for any particular position)
data_frame = pd.DataFrame(np.random.randn(7, 3))
data_frame.iloc[:4, 1] = NA  #first column, starting values till before 4 is NA values
data_frame.iloc[:2, 2] = NA  #second column, starting values till before 2 is NA values
data_frame

Unnamed: 0,0,1,2
0,0.041975,,
1,1.891504,,
2,0.78817,,0.071165
3,-0.615608,,-0.410416
4,0.212109,0.816387,0.240969
5,-1.653204,-0.680144,-0.796848
6,-0.390076,-0.33023,-0.019715


In [5]:
#if you wanna add values against NA values than, use
#first way to replace NaN with some values, that is in the fillna() method
data_frame = data_frame.fillna(2)    #upto 2 mins 30 sec
display(data_frame)

Unnamed: 0,0,1,2
0,0.041975,2.0,2.0
1,1.891504,2.0,2.0
2,0.78817,2.0,0.071165
3,-0.615608,2.0,-0.410416
4,0.212109,0.816387,0.240969
5,-1.653204,-0.680144,-0.796848
6,-0.390076,-0.33023,-0.019715


In [8]:
# column number 1 will be filled with 0.5
# column number 2 will be fileld with 0
data_frame_one = pd.DataFrame(np.random.randn(7, 3))
data_frame_one.iloc[:4, 1] = NA  #first column, starting values till before 4 is NA values
data_frame_one.iloc[:2, 2] = NA  #second column, starting values till before 2 is NA values
display(data_frame_one)

#second approach is to apply condition to each or single column
data_frame_one = data_frame_one.fillna({1: 0.5, 2: 0})
display(data_frame_one)

Unnamed: 0,0,1,2
0,-0.527848,,
1,0.74976,,
2,-1.345324,,-1.658936
3,1.778973,,0.018517
4,0.772715,-0.160631,-0.21687
5,0.54304,2.050374,-0.213376
6,-0.454292,-0.055814,-0.039423


Unnamed: 0,0,1,2
0,-0.527848,0.5,0.0
1,0.74976,0.5,0.0
2,-1.345324,0.5,-1.658936
3,1.778973,0.5,0.018517
4,0.772715,-0.160631,-0.21687
5,0.54304,2.050374,-0.213376
6,-0.454292,-0.055814,-0.039423


In [13]:
data_frame_two = pd.DataFrame(np.random.randn(7, 3))
data_frame_two.iloc[:4, 1] = NA  #first column, starting values till before 4 is NA values
data_frame_two.iloc[:2, 2] = NA  #second column, starting values till before 2 is NA values
display(data_frame_two)

Unnamed: 0,0,1,2
0,-0.11252,,
1,-0.953585,,
2,0.406943,,0.534368
3,0.470035,,-0.176928
4,-1.41125,0.546477,0.276814
5,-1.157984,0.555441,0.81977
6,0.861263,1.146829,0.027525


In [15]:
# fill  not values from left column (axis=0 means from top row)
#limit = 1 means that the column one will fill
data_frame_two = data_frame_two.fillna(method='ffill', limit=1, axis=1)
display(data_frame_two)

Unnamed: 0,0,1,2
0,-0.11252,-0.11252,
1,-0.953585,-0.953585,
2,0.406943,0.406943,0.534368
3,0.470035,0.470035,-0.176928
4,-1.41125,0.546477,0.276814
5,-1.157984,0.555441,0.81977
6,0.861263,1.146829,0.027525


In [17]:
#limit = 2 means that the column one will fill
data_frame_two = data_frame_two.fillna(method='ffill', limit=2, axis=1)
display(data_frame_two)

Unnamed: 0,0,1,2
0,-0.11252,-0.11252,-0.11252
1,-0.953585,-0.953585,-0.953585
2,0.406943,0.406943,0.534368
3,0.470035,0.470035,-0.176928
4,-1.41125,0.546477,0.276814
5,-1.157984,0.555441,0.81977
6,0.861263,1.146829,0.027525


# Removing Duplicates

In [23]:
data = pd.DataFrame({'k1': ['one', 'two'] * 3 + ['two'],  #last row named two came from this list ['two']
                     'k2': [1, 1, 2, 3, 3, 4, 4]} )
display(data)
display(data.duplicated())  #show boolean values as duplicated

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4
6,two,4


0    False
1    False
2    False
3    False
4    False
5    False
6     True
dtype: bool

In [27]:
#is not in-place operation, we should have it into any variable
data = data.drop_duplicates()
display(data)   #remove last duplicated row, drop_duplicates() applies on whole row

#new data set to add new column by using python list
data_one = pd.DataFrame({'C1': ['one', 'two'] * 3 + ['two'],  #last row named two came from this list ['two']
                     'C2': [1, 1, 2, 3, 3, 4, 4]} )
display(data_one)

#now creating a new column
data_one['V1'] = ['one', 'two', 'one', 'four', 'one', 'six', 'two']
display(data_one)

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4


Unnamed: 0,C1,C2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4
6,two,4


Unnamed: 0,C1,C2,V1
0,one,1,one
1,two,1,two
2,one,2,one
3,two,3,four
4,one,3,one
5,two,4,six
6,two,4,two


In [29]:
#if you wanna remove any duplicate value from any specific column than, use
data_one = data_one.drop_duplicates(['C1', 'V1'])
display(data_one)  #check both rows, if found duplicate than remove that row

Unnamed: 0,C1,C2,V1
0,one,1,one
1,two,1,two
3,two,3,four
5,two,4,six


# Replacing Values

In [37]:
#replacing method work with all values rather than null or NaN values
data_frame_three = pd.DataFrame(np.random.randn(7, 3))
data_frame_three.iloc[:4, 1] = np.nan  #first column, starting values till before 4 is NA values
data_frame_three.iloc[:2, 2] = np.nan  #second column, starting values till before 2 is NA values
display(data_frame_three)

Unnamed: 0,0,1,2
0,-0.632064,,
1,-0.630235,,
2,0.024515,,0.246744
3,0.195827,,-1.181428
4,-0.370476,-0.591305,-0.283959
5,-0.529513,0.726886,2.056611
6,-0.554402,-0.65138,0.86876


In [39]:
#we just used replace method to replace any value rather than NaN values
data_frame_three = data_frame_three.replace(np.nan, 99)  #values always replace in float order(by Default)
display(data_frame_three)

#and if you realize that 99 is not useful value for you, than you can again replace it using
data_frame_three = data_frame_three.replace(99, 9.925)
display(data_frame_three)

Unnamed: 0,0,1,2
0,-0.632064,99.0,99.0
1,-0.630235,99.0,99.0
2,0.024515,99.0,0.246744
3,0.195827,99.0,-1.181428
4,-0.370476,-0.591305,-0.283959
5,-0.529513,0.726886,2.056611
6,-0.554402,-0.65138,0.86876


Unnamed: 0,0,1,2
0,-0.632064,9.925,9.925
1,-0.630235,9.925,9.925
2,0.024515,9.925,0.246744
3,0.195827,9.925,-1.181428
4,-0.370476,-0.591305,-0.283959
5,-0.529513,0.726886,2.056611
6,-0.554402,-0.65138,0.86876


# Renaming Indexes (Renaming Axis Indexes using function mapping)

In [4]:
data_frame_four = pd.DataFrame(np.arange(12).reshape((3, 4)),
                              index=['Karachi', 'Lahore', 'Islamabad'],
                              columns=['one', 'two', 'three', 'four'])

data_frame_four

Unnamed: 0,one,two,three,four
Karachi,0,1,2,3
Lahore,4,5,6,7
Islamabad,8,9,10,11


In [5]:
#if we don't want to gives function any name than we use lambda
#map function is used for subtituting any value, working on dictionary type objects
transform = lambda x: x[:4].upper()   #x is any input value
data_frame_four = data_frame_four.index.map(transform) #let every value of dictionary and make those values upper case
display(data_frame_four)

Index(['KARA', 'LAHO', 'ISLA'], dtype='object')

# Detecting Filtering Outliers

In [6]:
#all our data must be in normal range, like we can simulate it or make any process on it
#we have to let it out from our process if it is out of the range
#just like age not be '0' or '1000' in this generation
#like, if we working on the data of young people, so age of 65 and 98 are also outlier data for us

data_frame_five = pd.DataFrame(np.random.randn(1000, 4))
data_frame_five.describe()

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,0.043733,-0.015715,0.05702,-0.00562
std,0.997729,0.961123,1.001646,1.014107
min,-3.063433,-3.215987,-3.454134,-3.227417
25%,-0.595386,-0.685273,-0.644951,-0.739135
50%,0.060637,-0.017224,0.037681,-0.00117
75%,0.683536,0.629545,0.697819,0.696213
max,3.036026,3.047811,3.418679,2.942123


In [7]:
#if we take any column from the data frame, it is basically series
col = data_frame_five[2]
display(col)

0      0.838725
1      1.688936
2     -1.060124
3      1.398858
4     -0.893010
         ...   
995    0.701000
996    0.497630
997   -1.451061
998    0.479792
999    0.402628
Name: 2, Length: 1000, dtype: float64

In [8]:
#if we wanna check that any value(absolute value) is greater than our desire value than, we use
col[np.abs(col) > 3]   #like in this condition, we required values greater than 3

85     3.002407
238    3.418679
420    3.198971
808   -3.065820
818   -3.454134
Name: 2, dtype: float64

In [10]:
#if we want our data set to be less than 3, we have to remove all values greater than 3
#To select all rows having a value exceeding 3 or –3, 
#you can use the 'any' method on a boolean DataFrame:
#if you wanna check that how many values in your whole data set is greater than 3, than use
#axis: 0 or ‘index’ to apply method by rows and 1 or ‘columns’ to apply by columns.
#any(1) means any value from the column is greater tha 3 will be display on the data frame

outlier_of_data_frame_five = data_frame_five[(np.abs(data_frame_five) > 3).any(1)]
display(len(outlier_of_data_frame_five), len(data_frame_five))
print("-------------------------------")
display(outlier_of_data_frame_five)   #its give you the whole row which have the value greater than 3
#but with absolute(means minus sign is neglected)

10

1000

-------------------------------


Unnamed: 0,0,1,2,3
85,-0.142747,-0.964096,3.002407,-1.701887
120,-0.142824,0.369187,0.203433,-3.227417
238,0.135634,-0.967474,3.418679,0.180935
420,-2.458683,-0.254582,3.198971,0.152174
484,-3.063433,0.183917,1.320268,1.124905
569,3.036026,0.607333,-0.336697,0.014419
808,-0.775228,1.693571,-3.06582,-0.431441
818,-1.530669,1.339487,-3.454134,1.627291
825,0.868679,-3.215987,1.71504,-0.796768
849,0.218403,3.047811,0.341694,1.608534


# Permutation & Random Sampling / Regular Expression & Vectorized Function

In [17]:
#permutation means we re-order our data, we can't change the data
# Permuting (randomly reordering)
data_frame_six = pd.DataFrame(np.arange(5*4).reshape((5, 4)))
print("Shape of Our Data Frame:{}".format(data_frame_six.shape))
print("---------------------")
#if you don't give column any name than pandas use numeric values by default
display(data_frame_six)

Shape of Our Data Frame:(5, 4)
---------------------


Unnamed: 0,0,1,2,3
0,0,1,2,3
1,4,5,6,7
2,8,9,10,11
3,12,13,14,15
4,16,17,18,19


In [19]:
#permutation rearrange the values of your data frame
sampler = np.random.permutation(4)
data_frame_six = data_frame_six[sampler]  #every time it gives new order (row wise)
data_frame_six

Unnamed: 0,2,3,0,1
0,2,3,0,1
1,6,7,4,5
2,10,11,8,9
3,14,15,12,13
4,18,19,16,17


In [22]:
#if you wanna reorder your data with given permutation(our sampler value) than, use
display(sampler)
data_frame_six.take(sampler)

array([2, 3, 0, 1])

Unnamed: 0,2,3,0,1
2,10,11,8,9
3,14,15,12,13
0,2,3,0,1
1,6,7,4,5


# - Regular Expressions

When you call re.split('\s+', text) , the regular expression is first compiled, and then its split method is called on the passed text. You can compile the regex yourself with re.compile , forming a reusable regex object:

In [26]:
#re is the library by which we can use regular expression
#simply first we use normal python split method
#reular expression firstly compile and than execute

text = "foo    bar\t baz \tqux"
text = text.split("\t")    #normaly we split data here by '\t'
text

['foo    bar', ' baz ', 'qux']

In [34]:
text_two = "foo    bar\t baz \tqux"

rgx = re.compile('\s+')
rgx.split(text_two)

['foo', 'bar', 'baz', 'qux']

# - Vectorized String Functions in pandas

In [29]:
data_frame_seven = {'Dave': 'dave@google.com', 'Steve': 'steve@gmail.com',
'Rob': 'rob@gmail.com', 'Wes': np.nan}  #on that case, this data isn't used
data_frame_seven = ["simpleEmail@email.com",   "simple.email@email.com",  
        "plus+symbol@email.com",   "dash-symbol@email.com",  
     "q@email.com",   
    "“unusual”@email.com",   "dash-symbol@email-dash.com",   "test@emailServer",  
  "” “@email.com",   "user@[IPv6:2001:DB8::1]",   
  "example@localhost",   "example@s.solutions",   
  "12345@email.com"]

data_frame_seven = pd.Series(data_frame_seven)
display(data_frame_seven)

0          simpleEmail@email.com
1         simple.email@email.com
2          plus+symbol@email.com
3          dash-symbol@email.com
4                    q@email.com
5            “unusual”@email.com
6     dash-symbol@email-dash.com
7               test@emailServer
8                  ” “@email.com
9        user@[IPv6:2001:DB8::1]
10             example@localhost
11           example@s.solutions
12               12345@email.com
dtype: object

In [33]:
#experational pattern for email checking, regular expression for email checking
#we can search through internet for our required regular expression
pattern = '([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\\.([A-Z]{2,4})'
#findall is the command of series which checks possible combinations by using pattern
display(data_frame_seven.str.findall(pattern, flags=re.IGNORECASE))
#match used where our pattern data is matched
print("----------------------------------")
matches = data_frame_seven[data_frame_seven.str.match(pattern, flags=re.IGNORECASE)]
display(matches)

0          [(simpleEmail, email, com)]
1         [(simple.email, email, com)]
2          [(plus+symbol, email, com)]
3          [(dash-symbol, email, com)]
4                    [(q, email, com)]
5                                   []
6     [(dash-symbol, email-dash, com)]
7                                   []
8                                   []
9                                   []
10                                  []
11                [(example, s, solu)]
12               [(12345, email, com)]
dtype: object

----------------------------------


0          simpleEmail@email.com
1         simple.email@email.com
2          plus+symbol@email.com
3          dash-symbol@email.com
4                    q@email.com
6     dash-symbol@email-dash.com
11           example@s.solutions
12               12345@email.com
dtype: object

In [39]:
#Hierarchy indexing plays an important role in re-shaping data and group-based
#operation like forming a pivot table. For Example,
#you could re-arrange the data into a DataFrame using its unstack method

data_frame_eight = pd.DataFrame(np.random.randn(9),
                               index=[['a', 'a', 'a', 'b', 'b', 'c', 'c', 'd', 'd'],
                                     [1, 2, 3, 1, 3, 1, 2, 2, 3]])
                                
display(data_frame_eight)

Unnamed: 0,Unnamed: 1,0
a,1,0.499649
a,2,0.840347
a,3,0.042674
b,1,0.774423
b,3,-0.963297
c,1,0.750673
c,2,0.142429
d,2,0.615629
d,3,-0.897348


In [56]:
#display(data_frame_eight['b'])
#data_frame_eight.iloc[['b', 'c']]
#repeating values of 'a' is convert into multiple column

data_frame_eight_unstack = data_frame_eight.unstack()
display(data_frame_eight_unstack)

#you can also convert you unstack data into stack data again, like

data_frame_eight.unstack().stack()

Unnamed: 0_level_0,0,0,0
Unnamed: 0_level_1,1,2,3
a,0.499649,0.840347,0.042674
b,0.774423,,-0.963297
c,0.750673,0.142429,
d,,0.615629,-0.897348


Unnamed: 0,Unnamed: 1,0
a,1,0.499649
a,2,0.840347
a,3,0.042674
b,1,0.774423
b,3,-0.963297
c,1,0.750673
c,2,0.142429
d,2,0.615629
d,3,-0.897348


In [49]:
#with a dataFrame, either axis can have a hierarchical index

frame = pd.DataFrame(np.arange(12).reshape((4, 3)),
                    index=[['a', 'a', 'b', 'b'], [1, 2, 1, 2]],
                    columns=[['Karachi', 'Karachi', 'Lahore'], ['Green', 'Red', 'Green']])

frame.index.names = ['Key1', 'Key2']
frame.columns.names = ['City:', 'color:']
display(frame)

Unnamed: 0_level_0,City:,Karachi,Karachi,Lahore
Unnamed: 0_level_1,color:,Green,Red,Green
Key1,Key2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
a,1,0,1,2
a,2,3,4,5
b,1,6,7,8
b,2,9,10,11


In [50]:
#if you wanna display only 'Karachi' column, than
frame['Karachi']

#Be careful to distinguish the index names 'state' and 'color'
#from the row labels
#basically state and color is not part of key1 and key2

Unnamed: 0_level_0,color:,Green,Red
Key1,Key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,1,0,1
a,2,3,4
b,1,6,7
b,2,9,10


In [59]:
#replacing key1 from key2 and key2 from key1
#@@  Reordering and sorting Levels
display(frame.swaplevel('Key1', 'Key2'))   #this data is only display at that moment, can't be save here
#can save in the same frame variables 
# or another variables like
# frame = frame.swaplevel('key1', 'key2')

Unnamed: 0_level_0,City:,Karachi,Karachi,Lahore
Unnamed: 0_level_1,color:,Green,Red,Green
Key2,Key1,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
1,a,0,1,2
2,a,3,4,5
1,b,6,7,8
2,b,9,10,11


In [60]:
#sort_index , on the other hand, sorts the data using only the values in a single level.
#When swapping levels, it’s not uncommon to also use sort_index so that the result is
#lexicographically sorted by the indicated level:
frame.sort_index(level=1)

Unnamed: 0_level_0,City:,Karachi,Karachi,Lahore
Unnamed: 0_level_1,color:,Green,Red,Green
Key1,Key2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
a,1,0,1,2
b,1,6,7,8
a,2,3,4,5
b,2,9,10,11


In [61]:
# compare the above statement output with following statement output
#sort the data in the hierarchical/organizational manner
frame.swaplevel(0, 1).sort_index(level=0)

Unnamed: 0_level_0,City:,Karachi,Karachi,Lahore
Unnamed: 0_level_1,color:,Green,Red,Green
Key2,Key1,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
1,a,0,1,2
1,b,6,7,8
2,a,3,4,5
2,b,9,10,11
