# Handling Missing and Duplicate Data

### Handling Missing Data

In [120]:
# Relative path to the organizations CSV that is read with pd.read_csv.
datapath = "./data/organizations-10000.csv"

In [121]:
import pandas as pd

In [122]:
# Load the CSV into a DataFrame and preview its first five rows.
df = pd.read_csv(datapath)
df.head()

Unnamed: 0,Index,Organization Id,Name,Website,Country,Description,Founded,Industry,Number of employees
0,,522816eF8fdBE6d,Mckinney PLC,http://soto.com/,Sri Lanka,Synergized global system engine,1988,Dairy,3930
1,2.0,70C7FBD7e6Aa3Ea,Cunningham LLC,http://harding-duffy.com/,Namibia,Team-oriented fault-tolerant adapter,2018,Library,7871
2,3.0,428B397eA2d7290,Ruiz-Walls,http://www.atkins.biz/,Iran,Re-contextualized bifurcated moderator,2003,Hospital / Health Care,3095
3,4.0,9D234Ae8Cc51C1c,"Parrish, Osborne and Clarke",http://salazar.info/,British Indian Ocean Territory (Chagos Archipe...,Fully-configurable next generation concept,1989,Supermarkets,5422
4,5.0,6CDCcdE3D0b7b44,"Diaz, Robles and Haley",https://www.brooks-scott.net/,Botswana,Inverse intangible methodology,2013,Nanotechnology,3135


In [123]:
# bool is a subclass of int (True == 1, False == 0), so adding booleans counts the True values.
# This is why calling .sum() on a boolean mask yields the number of matches.
True + False + True + False + False

2

In [124]:
# Element-wise missing-value mask: True wherever a cell holds a missing value.
df.isna()

Unnamed: 0,Index,Organization Id,Name,Website,Country,Description,Founded,Industry,Number of employees
0,True,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...
9995,False,False,False,False,False,False,False,False,False
9996,False,False,False,False,False,False,False,False,False
9997,False,False,False,False,False,False,False,False,False
9998,False,False,False,False,False,False,False,False,False


In [125]:
# Summing the boolean mask column by column counts the missing values in each column.
df.isna().sum()

Index                  1
Organization Id        0
Name                   0
Website                0
Country                0
Description            0
Founded                0
Industry               0
Number of employees    0
dtype: int64

In [126]:
# Single brackets select one column as a Series; a list of names (double brackets) returns a DataFrame.
type(df['Index']), type(df[['Index']])

(pandas.core.series.Series, pandas.core.frame.DataFrame)

In [127]:
# Boolean Series flagging the rows whose 'Index' value is missing.
df['Index'].isna()

0        True
1       False
2       False
3       False
4       False
        ...  
9995    False
9996    False
9997    False
9998    False
9999    False
Name: Index, Length: 10000, dtype: bool

In [128]:
# isna() already returns a boolean Series, so it can be used as the row mask
# directly; comparing it to True is redundant.
condition = df['Index'].isna()

#### Filtering records with NaN values in a particular column

In [129]:
# Boolean-mask indexing keeps only the rows where `condition` is True.
df[condition]

Unnamed: 0,Index,Organization Id,Name,Website,Country,Description,Founded,Industry,Number of employees
0,,522816eF8fdBE6d,Mckinney PLC,http://soto.com/,Sri Lanka,Synergized global system engine,1988,Dairy,3930


In [130]:
# Filter the rows and pick the columns in one .loc call instead of two
# chained selections; the result is the same sub-frame.
df.loc[condition, ['Index', 'Organization Id']]

Unnamed: 0,Index,Organization Id
0,,522816eF8fdBE6d


In [131]:
# Row 0 is the only record with a missing 'Index'; the rows after it are numbered 2.0, 3.0, ...,
# so 1.0 is the value that completes the sequence.
df['Index'] = df['Index'].fillna(1.0)
df

Unnamed: 0,Index,Organization Id,Name,Website,Country,Description,Founded,Industry,Number of employees
0,1.0,522816eF8fdBE6d,Mckinney PLC,http://soto.com/,Sri Lanka,Synergized global system engine,1988,Dairy,3930
1,2.0,70C7FBD7e6Aa3Ea,Cunningham LLC,http://harding-duffy.com/,Namibia,Team-oriented fault-tolerant adapter,2018,Library,7871
2,3.0,428B397eA2d7290,Ruiz-Walls,http://www.atkins.biz/,Iran,Re-contextualized bifurcated moderator,2003,Hospital / Health Care,3095
3,4.0,9D234Ae8Cc51C1c,"Parrish, Osborne and Clarke",http://salazar.info/,British Indian Ocean Territory (Chagos Archipe...,Fully-configurable next generation concept,1989,Supermarkets,5422
4,5.0,6CDCcdE3D0b7b44,"Diaz, Robles and Haley",https://www.brooks-scott.net/,Botswana,Inverse intangible methodology,2013,Nanotechnology,3135
...,...,...,...,...,...,...,...,...,...
9995,9996.0,2EE82AD1Cd045cd,"Neal, Day and Wang",https://carson.net/,San Marino,Team-oriented multimedia core,2013,Import / Export,6123
9996,9997.0,06f1568A2CaF04a,"Barrett, Rojas and Adkins",https://douglas-garza.com/,Turkmenistan,Cross-group dedicated methodology,2018,Human Resources / HR,9043
9997,9998.0,B4B92A44e0331Bc,Franklin-Ayala,http://www.torres.org/,Yemen,Polarized exuding orchestration,1983,Financial Services,8951
9998,9999.0,01D2539e270CEbd,Wolfe-Mckee,http://www.parks.com/,Togo,Balanced value-added ability,1975,Environmental Services,2505


In [132]:
# Column labels of the DataFrame.
df.columns

Index(['Index', 'Organization Id', 'Name', 'Website', 'Country', 'Description',
       'Founded', 'Industry', 'Number of employees'],
      dtype='object')

In [133]:
# Row labels of the DataFrame.
df.index

RangeIndex(start=0, stop=10000, step=1)

In [134]:
import random

# Fix the seed so the same rows are selected on every run.
random.seed(42)

# Draw 10 row labels whose 'Number of employees' value will be replaced with NaN.
# NOTE(review): random.choices samples with replacement, so the picks are not
# guaranteed to be distinct; confirm this is acceptable, or sample without replacement.
idx = random.choices(df.index, k=10)
idx

[6394, 250, 2750, 2232, 7364, 6766, 8921, 869, 4219, 297]

In [135]:
import numpy as np

In [136]:
# Copy the employee counts into a float array (NaN needs a floating-point dtype)
# and confirm that it contains no NaN values yet.
employee_counts = df['Number of employees'].to_numpy()
arr = employee_counts.astype('float')
nan_count = np.isnan(arr).sum()
arr, nan_count

(array([3930., 7871., 3095., ..., 8951., 2505., 4552.]), 0)

In [137]:
# Overwrite the sampled positions with NaN to simulate missing values.
arr[idx] = np.nan

In [138]:
# Count the NaN entries now present in the array.
np.isnan(arr).sum()

10

In [139]:
# Write the float array back so the column now carries the injected missing values.
df['Number of employees'] = arr

In [140]:
# Re-count the missing values per column after injecting the NaNs.
df.isna().sum()

Index                   0
Organization Id         0
Name                    0
Website                 0
Country                 0
Description             0
Founded                 0
Industry                0
Number of employees    10
dtype: int64

In [141]:
# mean, median
mean = df['Number of employees'].mean()
median = df['Number of employees'].median()
mean, median

(4960.3552552552555, 4894.0)

In [142]:
# Impute the missing employee counts with the column mean, then verify that no NaNs remain.
df['Number of employees'] = df['Number of employees'].fillna(mean)
df.isna().sum()

Index                  0
Organization Id        0
Name                   0
Website                0
Country                0
Description            0
Founded                0
Industry               0
Number of employees    0
dtype: int64

In [143]:
# duplicated() marks rows that repeat an earlier row in every column; the sum is the number of such rows.
df.duplicated().sum()

0

In [144]:
# Keep only the first occurrence of each fully identical row
# (nothing is removed here, since no duplicates were found).
df = df.drop_duplicates()

In [145]:
# Values of the first row as a NumPy object array (the row mixes strings and numbers).
df.iloc[0].values

array([1.0, '522816eF8fdBE6d', 'Mckinney PLC', 'http://soto.com/',
       'Sri Lanka', 'Synergized global system engine', 1988, 'Dairy',
       3930.0], dtype=object)

In [146]:
# Build a one-row DataFrame holding a copy of the first record; it is used to
# create a deliberate duplicate.
first_record = df.iloc[0].values
l = [first_record]
df2 = pd.DataFrame(data=l, columns=df.columns)
df2

Unnamed: 0,Index,Organization Id,Name,Website,Country,Description,Founded,Industry,Number of employees
0,1.0,522816eF8fdBE6d,Mckinney PLC,http://soto.com/,Sri Lanka,Synergized global system engine,1988,Dairy,3930.0


In [147]:
# Prepend the copied row so the first record appears twice;
# ignore_index=True renumbers the rows from 0.
df3 = pd.concat([df2, df], ignore_index=True)
df3

Unnamed: 0,Index,Organization Id,Name,Website,Country,Description,Founded,Industry,Number of employees
0,1.0,522816eF8fdBE6d,Mckinney PLC,http://soto.com/,Sri Lanka,Synergized global system engine,1988,Dairy,3930.0
1,1.0,522816eF8fdBE6d,Mckinney PLC,http://soto.com/,Sri Lanka,Synergized global system engine,1988,Dairy,3930.0
2,2.0,70C7FBD7e6Aa3Ea,Cunningham LLC,http://harding-duffy.com/,Namibia,Team-oriented fault-tolerant adapter,2018,Library,7871.0
3,3.0,428B397eA2d7290,Ruiz-Walls,http://www.atkins.biz/,Iran,Re-contextualized bifurcated moderator,2003,Hospital / Health Care,3095.0
4,4.0,9D234Ae8Cc51C1c,"Parrish, Osborne and Clarke",http://salazar.info/,British Indian Ocean Territory (Chagos Archipe...,Fully-configurable next generation concept,1989,Supermarkets,5422.0
...,...,...,...,...,...,...,...,...,...
9996,9996.0,2EE82AD1Cd045cd,"Neal, Day and Wang",https://carson.net/,San Marino,Team-oriented multimedia core,2013,Import / Export,6123.0
9997,9997.0,06f1568A2CaF04a,"Barrett, Rojas and Adkins",https://douglas-garza.com/,Turkmenistan,Cross-group dedicated methodology,2018,Human Resources / HR,9043.0
9998,9998.0,B4B92A44e0331Bc,Franklin-Ayala,http://www.torres.org/,Yemen,Polarized exuding orchestration,1983,Financial Services,8951.0
9999,9999.0,01D2539e270CEbd,Wolfe-Mckee,http://www.parks.com/,Togo,Balanced value-added ability,1975,Environmental Services,2505.0


In [148]:
# Flag each row that is an exact repeat of an earlier row.
df3.duplicated()

0        False
1         True
2        False
3        False
4        False
         ...  
9996     False
9997     False
9998     False
9999     False
10000    False
Length: 10001, dtype: bool

In [149]:
# Number of duplicate rows detected.
df3.duplicated().sum()

1

In [150]:
# Remove the repeated row, keeping its first occurrence. The surviving rows keep
# their original labels, so label 1 is now absent from the index.
df3 = df3.drop_duplicates()
df3

Unnamed: 0,Index,Organization Id,Name,Website,Country,Description,Founded,Industry,Number of employees
0,1.0,522816eF8fdBE6d,Mckinney PLC,http://soto.com/,Sri Lanka,Synergized global system engine,1988,Dairy,3930.0
2,2.0,70C7FBD7e6Aa3Ea,Cunningham LLC,http://harding-duffy.com/,Namibia,Team-oriented fault-tolerant adapter,2018,Library,7871.0
3,3.0,428B397eA2d7290,Ruiz-Walls,http://www.atkins.biz/,Iran,Re-contextualized bifurcated moderator,2003,Hospital / Health Care,3095.0
4,4.0,9D234Ae8Cc51C1c,"Parrish, Osborne and Clarke",http://salazar.info/,British Indian Ocean Territory (Chagos Archipe...,Fully-configurable next generation concept,1989,Supermarkets,5422.0
5,5.0,6CDCcdE3D0b7b44,"Diaz, Robles and Haley",https://www.brooks-scott.net/,Botswana,Inverse intangible methodology,2013,Nanotechnology,3135.0
...,...,...,...,...,...,...,...,...,...
9996,9996.0,2EE82AD1Cd045cd,"Neal, Day and Wang",https://carson.net/,San Marino,Team-oriented multimedia core,2013,Import / Export,6123.0
9997,9997.0,06f1568A2CaF04a,"Barrett, Rojas and Adkins",https://douglas-garza.com/,Turkmenistan,Cross-group dedicated methodology,2018,Human Resources / HR,9043.0
9998,9998.0,B4B92A44e0331Bc,Franklin-Ayala,http://www.torres.org/,Yemen,Polarized exuding orchestration,1983,Financial Services,8951.0
9999,9999.0,01D2539e270CEbd,Wolfe-Mckee,http://www.parks.com/,Togo,Balanced value-added ability,1975,Environmental Services,2505.0
