# Load Required Packages

In [2]:
import pandas as pd
import numpy as np

# Convert to parquet and load the data

In [3]:
# One-off conversion of the raw CSV export to Parquet, left disabled.
# It reads the CSV, turns the row index into an "AD-<n>" identifier column
# named `index`, computes a summary, and writes ParquetData.parquet.
# NOTE(review): presumably kept commented out so the slow CSV parse is not
# repeated on every run — un-comment only to regenerate the Parquet file.
# data = pd.read_csv('all_anonymized_2015_11_2017_03.csv',low_memory=False)
# IDXData = data.reset_index()
# IDXData['index'] = 'AD-' + IDXData['index'].astype('str')
# Summary = IDXData.describe(include='all')
# IDXData.to_parquet('ParquetData.parquet')

In [4]:
# Load the Parquet copy of the listings (the file written by the disabled
# conversion cell) and preview the first two rows.
Data = pd.read_parquet("ParquetData.parquet")
Data.head(2)

Unnamed: 0,index,maker,model,mileage,manufacture_year,engine_displacement,engine_power,body_type,color_slug,stk_year,transmission,door_count,seat_count,fuel_type,date_created,date_last_seen,price_eur
0,AD-0,ford,galaxy,151000.0,2011.0,2000.0,103.0,,,,man,5.0,7.0,diesel,2015-11-14 18:10:06.838319+00,2016-01-27 20:40:15.46361+00,10584.75
1,AD-1,skoda,octavia,143476.0,2012.0,2000.0,81.0,,,,man,5.0,5.0,diesel,2015-11-14 18:10:06.853411+00,2016-01-27 20:40:15.46361+00,8882.31


# Initial Cleaning

In [5]:
# Keep only listings where both the maker and the model are known.
Data = Data.loc[Data["maker"].notnull() & Data["model"].notnull()]

In [6]:
# Work on an independent copy restricted to the identifier, the descriptive
# vehicle attributes and the price.
MostImportantColumnsData = Data.loc[
    :,
    [
        "index",
        "maker",
        "model",
        "mileage",
        "manufacture_year",
        "engine_displacement",
        "engine_power",
        "body_type",
        "stk_year",
        "transmission",
        "door_count",
        "seat_count",
        "fuel_type",
        "price_eur",
    ],
].copy()

In [7]:
# Keep listings priced between 500 and 50000 EUR (both bounds inclusive).
# The exact value 1295.34 is excluded as well.
# NOTE(review): 1295.34 is presumably a placeholder/default price in the
# source data — confirm and document where it comes from.
ReasonAblePrice = MostImportantColumnsData.loc[
    MostImportantColumnsData["price_eur"].between(500, 50000)
    & MostImportantColumnsData["price_eur"].ne(1295.34)
].copy()
len(ReasonAblePrice)

1779127

In [8]:
# Record how many fields are missing in each listing.
ReasonAblePrice["Nulls"] = ReasonAblePrice.isnull().sum(axis="columns")

# Report the column count, then the row count.
print(ReasonAblePrice.shape[1])
print(ReasonAblePrice.shape[0])

# Number of listings for each count of missing fields.
ReasonAblePrice.groupby("Nulls")["index"].count()

15
1779127


Nulls
0      5024
1    229749
2    879909
3    245652
4    170815
5     85705
6     70017
7     83572
8      7050
9      1634
Name: index, dtype: int64

In [9]:
# Replace missing markers with NaN (presumably to normalise None to NaN —
# confirm), then keep rows that have at least 6 non-null values.
# NOTE(review): the helper column "Nulls" is never null and counts towards
# the threshold; with at most 9 missing fields out of 14, every row already
# has 6 non-null values, so this step currently drops nothing (the row count
# below is unchanged). Raise `thresh` if filtering was intended.
LowNullData = ReasonAblePrice.fillna(value=np.nan).dropna(thresh=6)

len(LowNullData)

1779127

In [10]:
# Preview the first rows after the price and null-count steps.
LowNullData.head()

Unnamed: 0,index,maker,model,mileage,manufacture_year,engine_displacement,engine_power,body_type,stk_year,transmission,door_count,seat_count,fuel_type,price_eur,Nulls
0,AD-0,ford,galaxy,151000.0,2011.0,2000.0,103.0,,,man,5.0,7.0,diesel,10584.75,2
1,AD-1,skoda,octavia,143476.0,2012.0,2000.0,81.0,,,man,5.0,5.0,diesel,8882.31,2
3,AD-3,skoda,fabia,111970.0,2004.0,1200.0,47.0,,,man,5.0,5.0,gasoline,2960.77,2
4,AD-4,skoda,fabia,128886.0,2004.0,1200.0,47.0,,,man,5.0,5.0,gasoline,2738.71,2
5,AD-5,skoda,fabia,140932.0,2003.0,1200.0,40.0,,,man,5.0,5.0,gasoline,1628.42,2


# Define a cleaning function

In [42]:
def FillNa(Row: pd.Series, TargetColumn: str, ColumnsList: list, WithValues: dict):
    """Return ``Row[TargetColumn]``, looking up a replacement when it is missing.

    Intended for ``DataFrame.apply(..., axis=1)``.

    Args:
        Row: A single record.
        TargetColumn: Name of the field to read (and impute when missing).
        ColumnsList: Names of the fields whose values form the lookup key.
        WithValues: Mapping from key to replacement value. Keys are tuples of
            the ``ColumnsList`` values in order; when ``ColumnsList`` has a
            single entry, a bare (non-tuple) key is accepted as well, since
            that is what ``Series.to_dict()`` yields after grouping by one
            column.

    Returns:
        The existing value when it is not missing; otherwise the replacement
        found in ``WithValues``; otherwise ``np.nan``.
    """
    current = Row[TargetColumn]
    if not pd.isna(current):
        return current

    key = tuple(Row[column] for column in ColumnsList)
    if key in WithValues:
        return WithValues[key]

    # Single-column lookups are keyed by the scalar itself, not by a 1-tuple.
    if len(key) == 1 and key[0] in WithValues:
        return WithValues[key[0]]

    return np.nan

# Clean stk_year

In [11]:
def _cap_stk_year(year):
    """Keep years below 2021, map years below 2024 to 2020, blank the rest."""
    if year < 2021:
        return year
    if year < 2024:
        return 2020
    # Reached for years of 2024 or later and for NaN (all comparisons false).
    return np.nan


Cleaned_STK_Year = LowNullData.copy()
Cleaned_STK_Year["stk_year"] = Cleaned_STK_Year["stk_year"].apply(_cap_stk_year)

# Clean body_type

In [29]:
# Column being cleaned in this section.
Target = "body_type"
# Columns whose combined values identify a group of comparable cars.
CleanUsing = [
    "maker",
    "model",
    "manufacture_year",
]

In [30]:
def _blank_uninformative(value):
    """Return NaN for the label "other" and for missing values, else the value."""
    if value == "other" or pd.isna(value):
        return np.nan
    return value


Cleaned_Body_type = Cleaned_STK_Year.copy()
Cleaned_Body_type[Target] = Cleaned_Body_type[Target].apply(_blank_uninformative)

In [31]:
# Most frequent known value of the target column per CleanUsing group, as a
# dict keyed by tuples of the CleanUsing values.
#
# The mode is reduced to a single scalar inside the aggregation: Series.mode()
# returns every tied value in sorted order, and the first is taken. This
# selects the same value as aggregating with pd.Series.mode and unpacking
# arrays afterwards, without relying on agg() accepting array-valued results
# or on a `type(x) is str` check.
BT_Groups = (
    Cleaned_Body_type[Cleaned_Body_type[Target].notnull()]
    .groupby(CleanUsing)[Target]
    .agg(lambda values: values.mode().iloc[0])
    .to_dict()
)

In [44]:
# Row by row: keep an existing value, otherwise use the group value from
# BT_Groups (NaN when the group has none).
Cleaned_Body_type[Target] = Cleaned_Body_type.apply(
    FillNa, axis=1, args=(Target, CleanUsing, BT_Groups)
)

# Clean engine_displacement

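Engine displacement values are kept only when they lie between 600 and 8500 (inclusive); anything outside that range is replaced with NaN. The bounds are based on the smallest- and largest-displacement production engines listed in the references below.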

https://www.drivespark.com/four-wheelers/2016/five-smallest-displacement-engines-production-cars/articlecontent-pf50713-019057.html

https://www.caranddriver.com/features/g21784022/supersize-me-the-16-largest-displacement-engines-you-can-buy-today/?slide=16




In [53]:
# Column being cleaned in this section.
Target = "engine_displacement"
# Columns whose combined values identify a group of comparable cars.
CleanUsing = [
    "maker",
    "model",
    "manufacture_year",
    "transmission",
]

In [54]:
CleanedEngineDisplacement = Cleaned_Body_type.copy()

# Keep displacements within [600, 8500] (both bounds inclusive); values
# outside the range, and values that are already missing, become NaN.
# Done as one vectorised mask rather than a Python lambda per element.
CleanedEngineDisplacement[Target] = CleanedEngineDisplacement[Target].where(
    CleanedEngineDisplacement[Target].between(600, 8500)
)

In [55]:
# Median of the known target values per CleanUsing group, as a dict keyed by
# tuples of the CleanUsing values.
EDP_Groups = (
    CleanedEngineDisplacement.loc[CleanedEngineDisplacement[Target].notnull()]
    .groupby(CleanUsing)[Target]
    .agg("median")
    .to_dict()
)

In [57]:
# Row by row: keep an existing value, otherwise use the group median from
# EDP_Groups (NaN when the group has none).
CleanedEngineDisplacement[Target] = CleanedEngineDisplacement.apply(
    FillNa, axis=1, args=(Target, CleanUsing, EDP_Groups)
)

# Clean engine_power

https://en.wikipedia.org/wiki/List_of_production_cars_by_power_output

https://www.autotrader.com/oversteer/the-5-least-powerful-cars-in-2021-vs-2001

In [96]:
# Column being cleaned in this section.
Target = "engine_power"
# Columns whose combined values identify a group of comparable cars.
CleanUsing = [
    "maker",
    "model",
    "manufacture_year",
    "transmission",
]

In [97]:
ClearnedEnginePower = CleanedEngineDisplacement.copy()

# Scale the raw values by 1.34102.
# NOTE(review): presumably a kW -> horsepower conversion (1 kW ≈ 1.34102 hp);
# confirm the unit of the source column.
ClearnedEnginePower[Target] = ClearnedEnginePower[Target] * 1.34102

# Keep scaled values within [65, 500] (both bounds inclusive); values outside
# the range, and values that are already missing, become NaN. Done as one
# vectorised mask rather than a Python lambda per element.
# NOTE(review): the lower bound also removes plausible low-powered cars (a raw
# value of 47 scales to about 63), which are later re-imputed from group
# medians — confirm that this is intended.
ClearnedEnginePower[Target] = ClearnedEnginePower[Target].where(
    ClearnedEnginePower[Target].between(65, 500)
)

In [98]:
# Median of the known target values per CleanUsing group, as a dict keyed by
# tuples of the CleanUsing values.
EPP_Groups = (
    ClearnedEnginePower.loc[ClearnedEnginePower[Target].notnull()]
    .groupby(CleanUsing)[Target]
    .agg("median")
    .to_dict()
)

In [100]:
# Row by row: keep an existing value, otherwise use the group median from
# EPP_Groups (NaN when the group has none).
ClearnedEnginePower[Target] = ClearnedEnginePower.apply(
    FillNa, axis=1, args=(Target, CleanUsing, EPP_Groups)
)