In [36]:
import pandas as pd
from sklearn.impute import SimpleImputer

In [44]:
data = {
    'date': ['2021-12-01', '05-11-2022', '2023/12/06', '09-01-2024'],
    'country': ['USA', 'U.S.A.', 'America', 'United States'],
    'name': ['Aammar', 'Amaar', 'Hamza', 'Hazma'],
    'sales_2020': [100, 200, None, 200],
    'sales_2021': [None, 150, 300, 150]
}
# make pandas dataframe
df = pd.DataFrame(data)



# Standardize the Date Format

In [45]:
df['date'] = pd.to_datetime(df['date'], errors='coerce')
df['date'] = df['date'].ffill()
df.head()

Unnamed: 0,date,country,name,sales_2020,sales_2021
0,2021-12-01,USA,Aammar,100.0,
1,2021-12-01,U.S.A.,Amaar,200.0,150.0
2,2021-12-01,America,Hamza,,300.0
3,2021-12-01,United States,Hazma,200.0,150.0


# Harmonize Name of the Country

In [46]:
country_mapping = {
    'USA': 'United States',
    'U.S.A.': 'United States',
    'America': 'United States',
    'United States': 'United States'
}
df['country'] = df['country'].replace(country_mapping)
df.head()

Unnamed: 0,date,country,name,sales_2020,sales_2021
0,2021-12-01,United States,Aammar,100.0,
1,2021-12-01,United States,Amaar,200.0,150.0
2,2021-12-01,United States,Hamza,,300.0
3,2021-12-01,United States,Hazma,200.0,150.0


# correct typographical mistakes in name

In [47]:
name_mapping = {
    'Aammar': 'Ammar',
    'Amaar': 'Ammar',
    'Hamza': 'Hamza',
    'Hazma': 'Hamza'
}
df['name'] = df['name'].replace(name_mapping)
df.head()

Unnamed: 0,date,country,name,sales_2020,sales_2021
0,2021-12-01,United States,Ammar,100.0,
1,2021-12-01,United States,Ammar,200.0,150.0
2,2021-12-01,United States,Hamza,,300.0
3,2021-12-01,United States,Hamza,200.0,150.0


# remove duplicates

In [49]:
df = df.drop_duplicates(subset="name")
df.head()

Unnamed: 0,date,country,name,sales_2020,sales_2021
0,2021-12-01,United States,Ammar,100.0,
2,2021-12-01,United States,Hamza,,300.0


In [50]:
df.head()

Unnamed: 0,date,country,name,sales_2020,sales_2021
0,2021-12-01,United States,Ammar,100.0,
2,2021-12-01,United States,Hamza,,300.0


In [53]:
data = {
    'date': ['2021-12-01', '05-11-2022', '2023/12/06', '09-01-2024'],
    'country': ['USA', 'U.S.A.', 'America', 'United States'],
    'name': ['Aammar', 'Amaar', 'Hamza', 'Hazma'],
    'sales_2020': [100, 200, None, 200],
    'sales_2021': [None, 150, 300, 150]
}
# make pandas dataframe
df = pd.DataFrame(data)
df = df.drop(df[df['sales_2021'] <= df['sales_2020']].index)
df.head()

Unnamed: 0,date,country,name,sales_2020,sales_2021
0,2021-12-01,USA,Aammar,100.0,
2,2023/12/06,America,Hamza,,300.0
