## IMPORTING NECESSARY LIBRARIES

In [7]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
import matplotlib.pyplot as plt
import seaborn as sns

## LOAD THE DATASET

In [9]:
# Read the raw automotive dataset from the CSV in the working directory.
raw_csv_path = "DATASET FOR ASSIGNMENT # 1.csv"
df = pd.read_csv(raw_csv_path)

# Preview the first rows as a sanity check on the load.
df.head()

Unnamed: 0,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,engine-location,wheel-base,length,width,...,engine-type,num-of-cylinders,engine-size,fuel-system,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg,price
0,alfa-romero,gas,std,two,convertible,rwd,front,88.6,168.8,64.1,...,dohc,four,130,mpfi,9.0,111,5000,21,27.0,13495
1,alfa-romero,gas,std,two,convertible,rwd,front,88.6,168.8,64.1,...,dohc,four,130,mpfi,9.0,111,5000,21,,16500
2,alfa-romero,gas,std,two,hatchback,rwd,front,94.5,171.2,65.5,...,ohcv,six,152,mpfi,9.0,154,5000,19,26.0,16500
3,audi,gas,std,four,sedan,fwd,front,99.8,176.6,66.2,...,ohc,four,109,mpfi,10.0,102,5500,24,30.0,13950
4,audi,gas,std,four,sedan,4wd,front,99.4,176.6,66.4,...,ohc,five,136,mpfi,8.0,115,5500,18,22.0,17450


## REMOVE IRRELEVANT DATA

In [11]:
# List every column so each one can be judged for relevance.
column_names = list(df.columns)
print("Original columns:", column_names)

# Every column looks relevant for automotive analysis, so none are dropped.

Original columns: ['make', 'fuel-type', 'aspiration', 'num-of-doors', 'body-style', 'drive-wheels', 'engine-location', 'wheel-base', 'length', 'width', 'height', 'curb-weight', 'engine-type', 'num-of-cylinders', 'engine-size', 'fuel-system', 'compression-ratio', 'horsepower', 'peak-rpm', 'city-mpg', 'highway-mpg', 'price']


## REMOVE DUPLICATE DATA

In [13]:
# Report the frame's shape before and after dropping exact duplicate rows.
print(f"Before removing duplicates: {df.shape}")

# Rebind the result instead of passing inplace=True: the original frame object
# is left untouched, and the statement stays usable in a method chain.
df = df.drop_duplicates()

print(f"After removing duplicates: {df.shape}")

# Identical shapes before and after mean the data contained no duplicate rows.

Before removing duplicates: (205, 22)
After removing duplicates: (205, 22)


## REMOVE UNNECESSARY SPACES

In [15]:
# Text columns are the ones pandas loaded with the generic object dtype.
string_cols = df.select_dtypes(include='object').columns

# Trim leading and trailing whitespace from each text column in turn.
for text_col in string_cols:
    df[text_col] = df[text_col].str.strip()

## HANDLE INCONSISTENT CAPITALIZATION

In [17]:

# Normalise capitalisation by title-casing every text column in a single pass
# (e.g. "alfa-romero" becomes "Alfa-Romero").
df[string_cols] = df[string_cols].apply(lambda text: text.str.title())

df.head()

Unnamed: 0,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,engine-location,wheel-base,length,width,...,engine-type,num-of-cylinders,engine-size,fuel-system,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg,price
0,Alfa-Romero,Gas,Std,Two,Convertible,Rwd,Front,88.6,168.8,64.1,...,Dohc,Four,130,Mpfi,9.0,111,5000,21,27.0,13495
1,Alfa-Romero,Gas,Std,Two,Convertible,Rwd,Front,88.6,168.8,64.1,...,Dohc,Four,130,Mpfi,9.0,111,5000,21,,16500
2,Alfa-Romero,Gas,Std,Two,Hatchback,Rwd,Front,94.5,171.2,65.5,...,Ohcv,Six,152,Mpfi,9.0,154,5000,19,26.0,16500
3,Audi,Gas,Std,Four,Sedan,Fwd,Front,99.8,176.6,66.2,...,Ohc,Four,109,Mpfi,10.0,102,5500,24,30.0,13950
4,Audi,Gas,Std,Four,Sedan,4Wd,Front,99.4,176.6,66.4,...,Ohc,Five,136,Mpfi,8.0,115,5500,18,22.0,17450


## DATA TYPE CONVERSION

In [19]:
# Columns that hold labels rather than measurements.
categorical_cols = ['make', 'fuel-type', 'aspiration', 'num-of-doors', 'body-style',
                    'drive-wheels', 'engine-location', 'engine-type', 'num-of-cylinders',
                    'fuel-system']

# Treat the '?' placeholder as missing, then store the labels as pandas categoricals.
df[categorical_cols] = df[categorical_cols].replace('?', np.nan).astype('category')

# Columns that hold measurements; some may have been read as text because of
# non-numeric placeholders in the raw file.
numeric_cols = ['wheel-base', 'length', 'width', 'height', 'curb-weight', 'engine-size',
                'compression-ratio', 'horsepower', 'peak-rpm', 'city-mpg', 'highway-mpg', 'price']

# Parse each measurement column as a number; entries that cannot be parsed become NaN.
df[numeric_cols] = df[numeric_cols].apply(pd.to_numeric, errors='coerce')

## HANDLE MISSING OR NULL VALUES USING IMPUTATION

In [21]:
# Numeric columns: replace each missing value with that column's median.
num_imputer = SimpleImputer(strategy='median')
df[numeric_cols] = num_imputer.fit_transform(df[numeric_cols])

# Categorical columns: replace each missing value with that column's most
# frequent label.
cat_imputer = SimpleImputer(strategy='most_frequent')
imputed_labels = cat_imputer.fit_transform(df[categorical_cols])

# SimpleImputer hands back a plain NumPy array, so assigning it directly would
# discard the category dtype applied during type conversion. Rebuild a frame on
# the same index and re-apply the dtype so the columns stay categorical.
df[categorical_cols] = pd.DataFrame(
    imputed_labels, columns=categorical_cols, index=df.index
).astype('category')

## DEAL WITH OUTLIERS

In [23]:
def handle_outliers(df, columns):
    """Cap IQR outliers in the given columns and report how many were found.

    For each requested column, values below Q1 - 1.5 * IQR or above
    Q3 + 1.5 * IQR are replaced by the nearest bound (rows are kept, not
    dropped). A per-column outlier count is printed, and a notice is printed
    for any requested column that is absent from the frame.

    Args:
        df: DataFrame to process. Its columns are overwritten in place.
        columns: Iterable of column names to check.

    Returns:
        The same DataFrame object that was passed in.
    """
    for name in columns:
        if name not in df.columns:
            print(f"Column {name} not found in dataframe")
            continue

        values = df[name]
        first_quartile = values.quantile(0.25)
        third_quartile = values.quantile(0.75)
        spread = third_quartile - first_quartile
        floor = first_quartile - 1.5 * spread
        ceiling = third_quartile + 1.5 * spread

        too_low = values < floor
        too_high = values > ceiling
        print(f"Found {(too_low | too_high).sum()} outliers in {name}")

        # Apply the upper cap first, then lay the lower cap on top of it.
        capped_high = np.where(too_high, ceiling, values)
        df[name] = np.where(too_low, floor, capped_high)

    return df

# Cap outliers in every column listed in numeric_cols. handle_outliers prints a
# per-column outlier count and returns the frame, which is rebound to df here.

df = handle_outliers(df, numeric_cols)

Found 3 outliers in wheel-base
Found 4 outliers in length
Found 8 outliers in width
Found 0 outliers in height
Found 0 outliers in curb-weight
Found 10 outliers in engine-size
Found 28 outliers in compression-ratio
Found 6 outliers in horsepower
Found 2 outliers in peak-rpm
Found 2 outliers in city-mpg
Found 3 outliers in highway-mpg
Found 14 outliers in price


## STANDARDIZE DATA

In [25]:
# Rescale every numeric column with StandardScaler (zero mean, unit variance).
scaler = StandardScaler()
num_cols = df.select_dtypes(include='number').columns
standardized_values = scaler.fit_transform(df[num_cols])
df[num_cols] = standardized_values

# Persist the cleaned, standardized frame without writing the row index.
df.to_csv('cleaned_automotive_data.csv', index=False)

print("Data cleaning completed successfully!")
print("Final dataset shape:", df.shape)
df.head()

Data cleaning completed successfully!
Final dataset shape: (205, 22)


Unnamed: 0,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,engine-location,wheel-base,length,width,...,engine-type,num-of-cylinders,engine-size,fuel-system,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg,price
0,Alfa-Romero,Gas,Std,Two,Convertible,Rwd,Front,-1.723005,-0.427342,-0.858695,...,Dohc,Four,0.160196,Mpfi,-0.045031,0.228518,-0.262085,-0.649321,-0.571254,0.11873
1,Alfa-Romero,Gas,Std,Two,Convertible,Rwd,Front,-1.723005,-0.427342,-0.858695,...,Dohc,Four,0.160196,Mpfi,-0.045031,0.228518,-0.262085,-0.649321,-0.113506,0.57515
2,Alfa-Romero,Gas,Std,Two,Hatchback,Rwd,Front,-0.71759,-0.228766,-0.184978,...,Ohcv,Six,0.809329,Mpfi,-0.045031,1.440545,-0.262085,-0.958163,-0.723837,0.57515
3,Audi,Gas,Std,Four,Sedan,Fwd,Front,0.18558,0.21803,0.15188,...,Ohc,Four,-0.45943,Mpfi,1.23355,-0.025162,0.793462,-0.186058,-0.113506,0.187839
4,Audi,Gas,Std,Four,Sedan,4Wd,Front,0.117416,0.21803,0.248125,...,Ohc,Five,0.337232,Mpfi,-1.323612,0.341265,0.793462,-1.112584,-1.334167,0.719443
