In [121]:
import pandas as pd

#source of the raw ("dirty") demo data that the following cells clean up
url = "https://raw.githubusercontent.com/edlich/eternalrepo/master/DS-WAHLFACH/dsm-beuth-edl-demodata-dirty.csv"

#lift pandas' display limits so that every column and every row is rendered
pd.options.display.max_columns = None
pd.options.display.max_rows = None

#download and parse the CSV into the working dataframe
df = pd.read_csv(filepath_or_buffer=url)

#show the raw dataframe
df

Unnamed: 0,id,full_name,first_name,last_name,email,gender,age
0,1.0,Mariel Finnigan,Mariel,Finnigan,mfinnigan0@usda.gov,Female,60
1,2.0,Kenyon Possek,Kenyon,Possek,kpossek1@ucoz.com,Male,12
2,3.0,Lalo Manifould,Lalo,Manifould,lmanifould2@pbs.org,Male,26
3,4.0,Nickola Carous,Nickola,Carous,ncarous3@phoca.cz,Male,4
4,5.0,Norman Dubbin,Norman,Dubbin,ndubbin4@wikipedia.org,Male,17
5,6.0,Hasty Perdue,Hasty,Perdue,hperdue5@qq.com,,77
6,7.0,Franz Castello,Franz,Castello,fcastello6@1688.com,Male,25
7,8.0,Jorge Tarney,Jorge,Tarney,jtarney7@ft.com,Male,77
8,9.0,Eunice Blakebrough,Eunice,Blakebrough,eblakebrough8@sohu.com,Female,45
9,10.0,Kristopher Frankcombe,Kristopher,Frankcombe,kfrankcombe9@slate.com,Male,old


In [123]:
#Preparation: check the column types
#(only the last expression of a cell is rendered, so run this line on its own to see the result)
df.dtypes

#type correction: convert column 'age' to a numeric type
#errors='coerce' turns entries that cannot be parsed (e.g. the text "old") into NaN
#pd.to_numeric is applied to the whole column at once instead of element by element
df['age'] = pd.to_numeric(df['age'], errors='coerce')

#display floats without decimals (this is a global display option and affects every float column)
pd.options.display.float_format = '{:,.0f}'.format

#before dropping rows we save a copy, so we can compare both dataframes at the end
#.copy() creates an independent dataframe; a plain `df2 = df` would only be a second name for the same object
df2 = df.copy()

In [124]:
#step 1: remove rows that carry no information at all
#(according to the original note these are rows 17 and 22)
#a row is kept as soon as at least one of its cells is not null
row_has_data = df2.notna().any(axis=1)
df2 = df2[row_has_data]


In [128]:
#The age column contains implausible entries: the raw table shows e.g. an age of 4 and the text "old"
#(the original note also mentions a negative value).
#Assumption: small children do not have an email address yet, so an age below 10 is treated as wrong.
#Only rows with an age from 10 to 99 (both bounds included) are kept.
#Rows whose age is NaN fail both comparisons and are therefore removed as well.
min_age = 10
max_age = 99

age_is_plausible = (df2['age'] >= min_age) & (df2['age'] <= max_age)
df2 = df2[age_is_plausible]


In [127]:
#remove duplicate persons: two rows count as duplicates when full name, email, gender and age all match
#(full name and email alone might already be sufficient); the first occurrence is the one that stays
is_repeated = df2.duplicated(subset=['full_name', 'email', 'gender', 'age'], keep='first')
df2 = df2[~is_repeated]


In [126]:
#remove rows without an email address:
#a person we cannot contact is of no use to us
#BUT: rows with a missing gender are kept on purpose - the form may simply not have offered the option "diverse"

has_email = df2['email'].notna()
df2 = df2[has_email]


In [129]:
#we have a rare case where one id is missing, in this case we're filling it in by hand
#NOTE: every missing id receives the same value 21, so this is only correct while exactly one id is missing
#assign() builds a new dataframe instead of writing a column into a filtered one,
#which avoids pandas' SettingWithCopyWarning

df2 = df2.assign(id=df2['id'].fillna(value=21))

In [130]:
#the clean dataframe: df2 as left by the cleaning cells of this notebook
#(float columns such as 'id' are rendered without decimals because of the display.float_format option set earlier)
df2

Unnamed: 0,id,full_name,first_name,last_name,email,gender,age
0,1,Mariel Finnigan,Mariel,Finnigan,mfinnigan0@usda.gov,Female,60
1,2,Kenyon Possek,Kenyon,Possek,kpossek1@ucoz.com,Male,12
2,3,Lalo Manifould,Lalo,Manifould,lmanifould2@pbs.org,Male,26
4,5,Norman Dubbin,Norman,Dubbin,ndubbin4@wikipedia.org,Male,17
5,6,Hasty Perdue,Hasty,Perdue,hperdue5@qq.com,,77
6,7,Franz Castello,Franz,Castello,fcastello6@1688.com,Male,25
7,8,Jorge Tarney,Jorge,Tarney,jtarney7@ft.com,Male,77
8,9,Eunice Blakebrough,Eunice,Blakebrough,eblakebrough8@sohu.com,Female,45
11,12,Luz Lansdowne,Luz,Lansdowne,llansdowneb@theguardian.com,Female,16
12,13,Modestia Keble,Modestia,Keble,mkeblec@cmu.edu,Female,91
