In [160]:
import pandas as pd
from faker import Faker
import random

In [161]:
fake = Faker()

Faker.seed(42)

In [162]:
data = []

for _ in range(random.randint(1000, 2000)):
    data.append({
        'Name':fake.name(),
        'Age':random.randint(18,70),
        'Email':fake.email(),
        'Phone':fake.phone_number(),
        'Address':fake.address(),
        'Salary':random.randint(20000, 150000),
        'Join_Date':fake.date_this_century(),
        'Employment_Status':random.choice(['Full-Time', 'Part-Time', 'Contract']),
        'Department':random.choice(['IT', 'Engineering', 'Finance', 'HR', 'Marketing', 'Customer Success'])
    })

print(f'Generated data has {str(len(data))} records.')

Generated data has 1309 records.


In [163]:
def gen_limit(data:list , percent: float):
    limit = round(len(data) * percent)
    return limit

In [164]:
gen_limit(data, 0.1)

131

In [165]:
for i in random.sample(range(len(data)), random.randint(gen_limit(data, 0.1), gen_limit(data, 0.15))):
    data[i]['Email'] = None

In [166]:
for i in random.sample(range(len(data)), random.randint(gen_limit(data, 0.15), gen_limit(data, 0.2))):
    data[i]['Phone'] = None

In [167]:
for i in random.sample(range(len(data)), random.randint(gen_limit(data, 0.05), gen_limit(data, 0.15))):
    data[i]['Address'] = None

In [168]:
data.extend(random.sample(data, random.randint(gen_limit(data, 0.1), gen_limit(data, 0.15))))

In [169]:
for i in random.sample(range(len(data)), random.randint(gen_limit(data, 0.01), gen_limit(data, 0.02))):
    data[i]['Salary'] = random.randint(300000, 700000)

In [170]:
df = pd.DataFrame(data)

In [171]:
df

Unnamed: 0,Name,Age,Email,Phone,Address,Salary,Join_Date,Employment_Status,Department
0,Allison Hill,26,,+1-219-560-0133,"79402 Peterson Drives Apt. 511\nDavisstad, PA ...",66735,2001-06-23,Contract,IT
1,Kimberly Dudley,19,smiller@example.net,+1-659-931-0341x316,"55341 Amanda Gardens Apt. 764\nLake Mark, WI 0...",131142,2007-10-17,Part-Time,IT
2,Renee Morales,56,clarksherri@example.net,,,78155,2015-12-04,Full-Time,Finance
3,Tricia Valencia,33,frazierdanny@example.net,001-645-514-6270x48281,,39409,2008-03-12,Contract,Customer Success
4,Theresa Miller,59,wcabrera@example.net,001-822-778-2489x63834,"33150 Brianna Avenue Apt. 031\nPort Markhaven,...",64989,2022-10-05,Contract,Marketing
...,...,...,...,...,...,...,...,...,...
1435,Michelle Price,57,rachel41@example.net,872.277.6857,"9429 Lori Meadow\nSouth Markmouth, CT 67557",74069,2018-09-24,Part-Time,Marketing
1436,Austin Johnson,60,rstevens@example.net,232.278.1952x371,"761 Rosario Stravenue Apt. 980\nCastilloton, L...",50536,2001-05-10,Full-Time,Engineering
1437,Ryan Johnson,27,michelle57@example.com,+1-948-872-8299,"7831 Parker Loop Suite 240\nNew Lisaberg, MI 3...",20086,2015-12-18,Full-Time,Marketing
1438,Blake Watson,59,nicole39@example.org,548-490-9322x533,"51025 Cooper Circle\nNew Nicolebury, MA 37683",101897,2011-03-07,Full-Time,Customer Success


In [172]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1440 entries, 0 to 1439
Data columns (total 9 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   Name               1440 non-null   object
 1   Age                1440 non-null   int64 
 2   Email              1288 non-null   object
 3   Phone              1203 non-null   object
 4   Address            1262 non-null   object
 5   Salary             1440 non-null   int64 
 6   Join_Date          1440 non-null   object
 7   Employment_Status  1440 non-null   object
 8   Department         1440 non-null   object
dtypes: int64(2), object(7)
memory usage: 101.4+ KB


In [173]:
df.head()

Unnamed: 0,Name,Age,Email,Phone,Address,Salary,Join_Date,Employment_Status,Department
0,Allison Hill,26,,+1-219-560-0133,"79402 Peterson Drives Apt. 511\nDavisstad, PA ...",66735,2001-06-23,Contract,IT
1,Kimberly Dudley,19,smiller@example.net,+1-659-931-0341x316,"55341 Amanda Gardens Apt. 764\nLake Mark, WI 0...",131142,2007-10-17,Part-Time,IT
2,Renee Morales,56,clarksherri@example.net,,,78155,2015-12-04,Full-Time,Finance
3,Tricia Valencia,33,frazierdanny@example.net,001-645-514-6270x48281,,39409,2008-03-12,Contract,Customer Success
4,Theresa Miller,59,wcabrera@example.net,001-822-778-2489x63834,"33150 Brianna Avenue Apt. 031\nPort Markhaven,...",64989,2022-10-05,Contract,Marketing


In [178]:
duplicates = df.duplicated().sum()
print(f'Number of duplicate rows: {duplicates}')

df.drop_duplicates(inplace=True)

Number of duplicate rows: 0


In [184]:
missing_values = df.isna().sum()
print('Number of missing values:')
print(missing_values)

Number of missing values:
Name                   0
Age                    0
Email                136
Phone                218
Address              159
Salary                 0
Join_Date              0
Employment_Status      0
Department             0
dtype: int64


In [199]:
df['Email'].fillna('unknown', inplace=True)
df['Phone'].fillna('unknown', inplace=True)
df['Address'].fillna('unknown', inplace=True)

In [None]:
for i in random.sample(range(len(data)), random.randint(gen_limit(data, 0.1), gen_limit(data, 0.15))):
    data[i]['Email'] = None

In [190]:
df['Join_Date'] = pd.to_datetime(df['Join_Date'])
print('Join_Date column after conversion:')
print(df['Join_Date'].head())

Join_Date column after conversion:
0   2001-06-23
1   2007-10-17
2   2015-12-04
3   2008-03-12
4   2022-10-05
Name: Join_Date, dtype: datetime64[ns]


In [211]:
df['Years_Employed'] = pd.Timestamp.now().year - df['Join_Date'].dt.year
print('New Column "Years_Employed":')
print(df[['Join_Date', 'Years_Employed']].head())

New Column "Years_Employed":
   Join_Date  Years_Employed
0 2001-06-23              23
1 2007-10-17              17
2 2015-12-04               9
3 2008-03-12              16
4 2022-10-05               2


In [213]:
df['Address'] = df['Address'].str.replace('\n', ' ', regex=False)
print('Adress after text cleaning:')
print(df['Address'].head())

Adress after text cleaning:
0    79402 Peterson Drives Apt. 511 Davisstad, PA 3...
1    55341 Amanda Gardens Apt. 764 Lake Mark, WI 07832
2                                              unknown
3                                              unknown
4    33150 Brianna Avenue Apt. 031 Port Markhaven, ...
Name: Address, dtype: object


In [230]:
z_scores = (df['Salary'] - df['Salary'].mean()) / df['Salary'].std()
outliers = df[abs(z_scores) > 3]
print('Outliers based on salary:')
print(df[['Name', 'Salary']].head())

Outliers based on salary:
              Name  Salary
0     Allison Hill   66735
1  Kimberly Dudley  131142
2    Renee Morales   78155
3  Tricia Valencia   39409
4   Theresa Miller   64989


In [231]:
z_scores

0      -0.385029
1       0.601276
2      -0.210147
3      -0.803489
4      -0.411767
          ...   
1304    0.543727
1305   -0.681455
1306    0.269613
1307    0.621092
1308    0.231896
Name: Salary, Length: 1309, dtype: float64