In [3]:
import pandas as pd
import numpy as np

In [5]:
def csv_loader(path):
    """Read a CSV file into a pandas DataFrame.

    Parameters
    ----------
    path
        Forwarded unchanged as the first argument of ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        Whatever ``pd.read_csv`` produces with its default options.
    """
    return pd.read_csv(path)

# Location of the CO2 emissions CSV that this notebook analyses.
# NOTE(review): this is an absolute, machine-specific Windows path, so the cell
# fails on any machine where the file is not at exactly this location. Consider
# a path relative to the notebook or a configurable data directory.
filepath = r'D:\Generative AI\GIT\machine-learning\Multiple Linear Regression\co2_emission.csv'
# Load the file into the notebook-level `df`, which the following cells read.
df = csv_loader(filepath)
# Bare expression: the notebook renders the DataFrame as this cell's output.
df

Unnamed: 0,Car,Model,Volume,Weight,CO2
0,Toyoty,Aygo,1000,790,99
1,Mitsubishi,Space Star,1200,1160,95
2,Skoda,Citigo,1000,929,95
3,Fiat,500,900,865,90
4,Mini,Cooper,1500,1140,105
5,VW,Up!,1000,929,105
6,Skoda,Fabia,1400,1109,90
7,Mercedes,A-Class,1500,1365,92
8,Ford,Fiesta,1500,1112,98
9,Audi,A1,1600,1150,99


In [6]:
def summarize_data(df):
    """Profile ``df``, print the findings, and return them as a dict.

    The returned dict holds, in this order:

    * ``"missing"``: Series with the number of null cells per column
    * ``"non_null_counts"``: dict with the number of non-null cells per column
    * ``"dtypes"``: dict mapping each column label to its dtype
    * ``"description"``: result of ``df.describe(include='all')``
    * ``"columns"``: list of column labels

    Every entry is also written to stdout under its own heading.
    """
    # Compute each piece on its own line so it can be inspected in isolation.
    null_totals = df.isnull().sum()
    present_totals = df.count().to_dict()
    column_dtypes = df.dtypes.to_dict()
    stats_table = df.describe(include='all')
    column_labels = df.columns.tolist()

    summary = dict(
        missing=null_totals,
        non_null_counts=present_totals,
        dtypes=column_dtypes,
        description=stats_table,
        columns=column_labels,
    )

    # Human-readable report; the last section has no trailing blank line.
    print(f"Missing Values:\n{summary['missing']}\n")
    print(f"Non Missing Values:\n{summary['non_null_counts']}\n")
    print(f"Dtypes:\n{summary['dtypes']}\n")
    print(f"Data Description:\n{summary['description']}\n")
    print(f"List of Columns:\n{summary['columns']}")

    return summary

# Print the profiling report for `df` and keep the returned dict in `data_summary`.
data_summary = summarize_data(df)

Missing Values:
Car       0
Model     0
Volume    0
Weight    0
CO2       0
dtype: int64

Non Missing Values:
{'Car': 36, 'Model': 36, 'Volume': 36, 'Weight': 36, 'CO2': 36}

Dtypes:
{'Car': dtype('O'), 'Model': dtype('O'), 'Volume': dtype('int64'), 'Weight': dtype('int64'), 'CO2': dtype('int64')}

Data Description:
         Car   Model       Volume       Weight         CO2
count     36      36    36.000000    36.000000   36.000000
unique    17      35          NaN          NaN         NaN
top     Ford  Fiesta          NaN          NaN         NaN
freq       5       2          NaN          NaN         NaN
mean     NaN     NaN  1611.111111  1292.277778  102.027778
std      NaN     NaN   388.975047   242.123889    7.454571
min      NaN     NaN   900.000000   790.000000   90.000000
25%      NaN     NaN  1475.000000  1117.250000   97.750000
50%      NaN     NaN  1600.000000  1329.000000   99.000000
75%      NaN     NaN  2000.000000  1418.250000  105.000000
max      NaN     NaN  2500.000000

In [4]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

In [1]:
def detect_column_types(df, target):
    """Split the feature columns of ``df`` into numerical, categorical and date lists.

    Columns are classified by dtype, in ``df`` column order:

    * date: any datetime64 dtype, timezone-naive or timezone-aware
    * categorical: ``category``, ``object`` and string dtypes
    * numerical: integer and floating dtypes of any width, including pandas
      nullable ones; boolean and complex columns are left out

    Columns of any other dtype (for example bool or timedelta) appear in none
    of the lists.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are to be classified.
    target : hashable
        Label of the target column. It is excluded from all three lists; a
        label that is not a column of ``df`` is simply ignored.

    Returns
    -------
    tuple[list, list, list]
        ``(numerical, categorical, date_cols)`` lists of column labels.
    """
    dtype_checks = pd.api.types
    numerical, categorical, date_cols = [], [], []

    for col, dtype in df.dtypes.items():
        # The target is never a feature, whatever its dtype is.
        if col == target:
            continue

        if dtype_checks.is_datetime64_any_dtype(dtype):
            date_cols.append(col)
        elif isinstance(dtype, pd.CategoricalDtype):
            # Checked before the bool/numeric tests: those look through a
            # categorical dtype at its categories.
            categorical.append(col)
        elif dtype_checks.is_bool_dtype(dtype) or dtype_checks.is_complex_dtype(dtype):
            # is_numeric_dtype() accepts these, but they were never treated as
            # numerical features here, so keep them out.
            continue
        elif dtype_checks.is_numeric_dtype(dtype):
            numerical.append(col)
        elif dtype_checks.is_object_dtype(dtype) or dtype_checks.is_string_dtype(dtype):
            categorical.append(col)

    return numerical, categorical, date_cols

In [10]:
# Classify the columns of `df` by dtype, naming 'CO2' as the target so that
# it is left out of the numerical and categorical lists.
numerical, categorical, date_cols = detect_column_types(df, 'CO2')
# Show the three lists as a quick sanity check.
print(numerical, categorical, date_cols)

['Volume', 'Weight'] ['Car', 'Model'] []


In [11]:
def preprocess_data(df, target, numerical, categorical, missing_threshold=0.5):
    """Separate features from the target and build/fit the preprocessing transformer.

    Steps:

    1. ``target`` is split off as ``y``; the remaining columns form ``X``.
    2. Feature columns whose fraction of missing values is at or above
       ``missing_threshold`` are dropped, and ``numerical`` / ``categorical``
       are narrowed to the columns that remain.
    3. Numerical columns are mean-imputed and standardised; categorical
       columns are imputed with the most frequent value and one-hot encoded
       (categories unseen during fit are ignored at transform time).

    Parameters
    ----------
    df : pandas.DataFrame
        Full dataset including the target column.
    target : str
        Label of the target column.
    numerical, categorical : list
        Column labels to route through the numeric / categorical pipeline.
        Labels that are not (or no longer) present in ``X`` are skipped; the
        caller's lists are not modified.
    missing_threshold : float, default 0.5
        A feature column is kept only if its missing fraction is strictly
        below this value. Because fractions never exceed 1, any value greater
        than 1 disables the filter.

    Returns
    -------
    tuple
        ``(X_preprocessed, y, preprocessor)``: the transformed feature matrix
        as returned by ``ColumnTransformer.fit_transform``, the target Series,
        and the fitted ``ColumnTransformer``.

    Notes
    -----
    The transformer is fit on every row of ``df``. If the data is split into
    train and test sets afterwards, the imputation and scaling statistics will
    already include the test rows.
    """
    X = df.drop(columns=[target])
    y = df[target]

    # Drop columns with too many missing values: a column is kept only when
    # its missing fraction is strictly below the threshold (50% by default).
    missing_ratio = X.isnull().mean()
    X = X.loc[:, missing_ratio < missing_threshold]
    # Rebuild the lists so the transformer never references a dropped column.
    numerical = [col for col in numerical if col in X.columns]
    categorical = [col for col in categorical if col in X.columns]

    num_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler())
    ])

    cat_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ])

    # Columns of X that appear in neither list are not passed to any pipeline.
    preprocessor = ColumnTransformer([
        ('num', num_pipeline, numerical),
        ('cat', cat_pipeline, categorical)
    ])

    X_preprocessed = preprocessor.fit_transform(X)
    return X_preprocessed, y, preprocessor