# Outlier Analysis

In [1]:
import pandas as pd
# Load the dataset from 'Step 2 - Nulls filled v1.csv'. The path is relative,
# so it is resolved against the notebook's working directory.
# NOTE(review): presumably the output of a preceding null-filling step -- confirm.
scout_car = pd.read_csv('Step 2 - Nulls filled v1.csv')
# Bare last expression: renders the frame as the cell output.
scout_car

Unnamed: 0,make_model,Body Type,Price,VAT,KM,Registration (YYYYMM),Horsepower (kW),Type,Previous Owners,Next Inspection (YYYYMM),...,Lane departure warning system,Night view assist,Passenger-side airbag,Power steering,Rear airbag,Side airbag,Tire pressure monitoring system,Traction control,Traffic sign recognition,Xenon headlights
0,Audi A1,Sedans,15770,VAT deductible,56013.0,201601.0,66.0,Used,2,202106.0,...,0,0,1,1,0,1,1,1,0,1
1,Audi A1,Sedans,14500,Price negotiable,80000.0,201703.0,141.0,Used,0,,...,0,0,1,1,0,1,1,1,0,1
2,Audi A1,Sedans,14640,VAT deductible,83450.0,201602.0,85.0,Used,1,,...,0,0,1,1,0,1,1,1,0,0
3,Audi A1,Sedans,14500,VAT deductible,73000.0,201608.0,66.0,Used,1,,...,0,0,1,1,0,1,1,0,0,0
4,Audi A1,Sedans,16790,VAT deductible,16200.0,201605.0,66.0,Used,1,,...,0,0,1,1,0,1,1,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15914,Renault Espace,Van,39950,VAT deductible,21213.5,201711.0,147.0,New,0,,...,1,0,1,1,1,1,1,1,1,0
15915,Renault Espace,Van,39885,VAT deductible,9900.0,201901.0,165.0,Used,1,202201.0,...,1,0,1,1,0,1,1,1,1,0
15916,Renault Espace,Van,39875,VAT deductible,15.0,201903.0,146.0,Pre-registered,1,,...,1,0,1,1,0,1,0,1,1,0
15917,Renault Espace,Van,39700,VAT deductible,10.0,201906.0,147.0,Pre-registered,0,,...,0,0,1,1,0,1,1,0,1,0


# Ensure all dtypes are what we want.

In [2]:
# Print the column/dtype summary; verbose=True requests the full per-column listing.
scout_car.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15919 entries, 0 to 15918
Data columns (total 131 columns):
 #    Column                                 Dtype  
---   ------                                 -----  
 0    make_model                             object 
 1    Body Type                              object 
 2    Price                                  int64  
 3    VAT                                    object 
 4    KM                                     float64
 5    Registration (YYYYMM)                  float64
 6    Horsepower (kW)                        float64
 7    Type                                   object 
 8    Previous Owners                        int64  
 9    Next Inspection (YYYYMM)               float64
 10   Inspection new                         int64  
 11   Warranty (months)                      float64
 12   Make                                   object 
 13   Model                                  object 
 14   First Registration                  

No column is stored as `object` that shouldn't be, so the dtypes are a good basis for the outlier analysis.

In [3]:
def detect_outliers_iqr(df, whisker=1.5):
    """Summarise IQR-rule outliers for every numeric column of ``df``.

    A value is flagged when it lies strictly below ``Q1 - whisker * IQR`` or
    strictly above ``Q3 + whisker * IQR``, where Q1 and Q3 are the column's
    25th and 75th percentiles and ``IQR = Q3 - Q1``. NaN values are never
    flagged (comparisons against NaN are False), but they still count towards
    the row total used for the percentage.

    Caveat: when a column's IQR is 0 (the middle half of its values are
    identical, as happens with lopsided 0/1 indicator columns) both fences
    collapse onto that value and every differing value is flagged.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to scan. Non-numeric columns are ignored.
    whisker : float, default 1.5
        Multiplier applied to the IQR to place the lower/upper fences.

    Returns
    -------
    pandas.DataFrame
        One row per numeric column (indexed by column name) with columns
        ``num_outliers`` (count), ``pct_outliers`` (share of all rows in
        percent, rounded to 2 decimals; 0.0 for a zero-row frame) and
        ``outlier_indices`` (list of index labels of the flagged rows).
    """
    summary_columns = ["num_outliers", "pct_outliers", "outlier_indices"]
    n_rows = len(df)
    numeric_cols = list(df.select_dtypes(include=['number']).columns)

    records = []
    for col in numeric_cols:
        values = df[col]
        q1 = values.quantile(0.25)
        q3 = values.quantile(0.75)
        iqr = q3 - q1
        lower_bound = q1 - whisker * iqr
        upper_bound = q3 + whisker * iqr

        # Boolean mask on the single column; avoids copying the whole frame.
        is_outlier = (values < lower_bound) | (values > upper_bound)
        n_outliers = int(is_outlier.sum())
        records.append({
            "num_outliers": n_outliers,
            # Guard the division so a zero-row frame yields 0.0 instead of raising.
            "pct_outliers": round(n_outliers / n_rows * 100, 2) if n_rows else 0.0,
            "outlier_indices": values[is_outlier].index.tolist(),
        })

    # Fixed column list keeps the result shape stable even with no numeric columns.
    return pd.DataFrame(records, index=numeric_cols, columns=summary_columns)


In [4]:
# Run the IQR outlier scan over scout_car's numeric columns and keep the per-column summary.
outlier_summary = detect_outliers_iqr(scout_car)
# Bare last expression: renders the summary table as the cell output.
outlier_summary

Unnamed: 0,num_outliers,pct_outliers,outlier_indices
Price,479,3.01,"[713, 732, 3587, 3589, 3590, 3591, 3592, 3593,..."
KM,847,5.32,"[25, 34, 39, 43, 53, 61, 63, 64, 310, 328, 333..."
Registration (YYYYMM),0,0.0,[]
Horsepower (kW),428,2.69,"[3353, 3587, 3590, 3592, 3593, 3594, 3595, 359..."
Previous Owners,19,0.12,"[679, 972, 1290, 2729, 3269, 6103, 6406, 6669,..."
...,...,...,...
Side airbag,2716,17.06,"[9, 18, 28, 30, 50, 53, 66, 67, 69, 76, 115, 1..."
Tire pressure monitoring system,0,0.0,[]
Traction control,0,0.0,[]
Traffic sign recognition,1976,12.41,"[165, 446, 447, 506, 597, 602, 737, 1280, 1648..."


# Let's have a look at these outliers.

# TODO: Should the categorical columns be one-hot encoded (`get_dummies`) at this point?

In [5]:
# Identify non-numeric columns
non_numerics = scout_car.select_dtypes(exclude=['number']).columns

# One-hot encode the non-numeric columns once, then cast the result to int
# (the indicator columns may come back as bools, depending on pandas version).
dummy_df = pd.get_dummies(scout_car[non_numerics]).astype(int)

# Select numeric columns from the original DataFrame
numerics_df = scout_car.select_dtypes(include=['number'])

# Column-wise concat; both pieces are derived from scout_car, so they share its row index.
scout_car_with_dummies = pd.concat([numerics_df, dummy_df], axis=1)

In [6]:
# Display the combined numeric + dummy-encoded frame built in the previous cell.
scout_car_with_dummies

Unnamed: 0,Price,KM,Registration (YYYYMM),Horsepower (kW),Previous Owners,Next Inspection (YYYYMM),Inspection new,Warranty (months),First Registration,# of Doors,...,Upholstery Color_Beige,Upholstery Color_Black,Upholstery Color_Blue,Upholstery Color_Brown,Upholstery Color_Grey,Upholstery Color_Orange,Upholstery Color_Other,Upholstery Color_Red,Upholstery Color_White,Upholstery Color_Yellow
0,15770,56013.0,201601.0,66.0,2,202106.0,1,12.0,2016.0,5.0,...,0,1,0,0,0,0,0,0,0,0
1,14500,80000.0,201703.0,141.0,0,,0,12.0,2017.0,3.0,...,0,0,0,0,1,0,0,0,0,0
2,14640,83450.0,201602.0,85.0,1,,0,12.0,2016.0,4.0,...,0,1,0,0,0,0,0,0,0,0
3,14500,73000.0,201608.0,66.0,1,,0,12.0,2016.0,3.0,...,0,1,0,0,0,0,0,0,0,0
4,16790,16200.0,201605.0,66.0,1,,1,12.0,2016.0,5.0,...,0,1,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15914,39950,21213.5,201711.0,147.0,0,,0,16.0,2017.0,5.0,...,0,1,0,0,0,0,0,0,0,0
15915,39885,9900.0,201901.0,165.0,1,202201.0,0,16.0,2019.0,5.0,...,0,1,0,0,0,0,0,0,0,0
15916,39875,15.0,201903.0,146.0,1,,1,16.0,2019.0,5.0,...,0,1,0,0,0,0,0,0,0,0
15917,39700,10.0,201906.0,147.0,0,,0,16.0,2019.0,5.0,...,0,1,0,0,0,0,0,0,0,0
