In [6]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy.stats import randint, uniform
from sklearn.cluster import KMeans
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import mutual_info_regression
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.metrics import make_scorer, mean_absolute_error
from sklearn.model_selection import KFold, RandomizedSearchCV, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# Load the training data.
df = pd.read_csv('clean_train.csv')

# 'gears' marks missing entries with the literal string 'Value Missing'.
# Turn that placeholder into NaN and make the column numeric. The result is
# assigned back explicitly: an in-place replace on `df['gears']` acts on a
# column selection, which pandas warns about and which does not update `df`
# when Copy-on-Write is enabled.
df['gears'] = df['gears'].replace('Value Missing', np.nan).astype(float)

In [8]:
# Show how many rows fall into each 'transmission' category.
df['transmission'].value_counts()

transmission
Automatic    42080
other        12193
Name: count, dtype: int64

In [None]:
# Column subset kept under the name `columns`.
# NOTE(review): not referenced by the other cells visible here — presumably
# consumed further down the notebook; confirm or remove.
columns = [
    'milage',
    'model_year',
    'engine_power',
    'engine_volume',
    'engine_cylinder',
    'accident',
    'gears',
]

In [None]:
# Preprocess the data and cluster the rows; the resulting cluster labels are
# stored in df['cluster'].
# Missing values are filled with the mean (numerical columns) or the most
# frequent value (categorical columns) before scaling / one-hot encoding.
# NOTE(review): 'price' is used as a clustering feature — confirm it is
# available for every dataset these clusters will be used on.
numerical_features = ['model_year', 'milage', 'engine_power', 'engine_volume', 'engine_cylinder', 'price']
categorical_features = ['brand', 'model', 'fuel_type', 'cylinder_shape', 'engine', 'transmission', 'ext_col', 'int_col', 'accident', 'clean_title']

numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    # Categories unseen at fit time are encoded as all-zeros instead of raising.
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# 'gears' is excluded from the clustering features, as is 'id'.
X = df.drop(columns=['id', 'gears'])
X_preprocessed = preprocessor.fit_transform(X)

# Perform clustering
n_clusters = 5  # Example number of clusters, you might need to adjust this
# n_init is pinned explicitly: scikit-learn changed its default from 10 to
# 'auto' in version 1.4 (with a FutureWarning in 1.2/1.3), so leaving it unset
# makes the cluster labels depend on the installed version.
kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=42)
clusters = kmeans.fit_predict(X_preprocessed)

df['cluster'] = clusters

# Impute missing 'gears' based on clusters
def impute_gears(row, df):
    """Return the 'gears' value for one row, filling it from its cluster if missing.

    Parameters
    ----------
    row : mapping with 'gears' and 'cluster' entries (e.g. a DataFrame row).
    df : DataFrame with 'gears' and 'cluster' columns, used to look up the
        most common 'gears' value among rows of the same cluster.

    Returns
    -------
    The row's own 'gears' value when it is present; otherwise the first mode
    of 'gears' within the row's cluster. If the cluster has no mode (no
    non-missing 'gears' values), the original missing value is returned.
    """
    current = row['gears']
    if not pd.isna(current):
        return current

    same_cluster_gears = df.loc[df['cluster'] == row['cluster'], 'gears']
    modes = same_cluster_gears.mode()
    return current if modes.empty else modes.iloc[0]

# Fill each missing 'gears' value with the most common value in its cluster.
# This gives the same result as applying `impute_gears` row by row, but the
# mode is computed once per cluster instead of once per missing row.
# Series.mode() ignores NaN and returns the modes sorted, so `.iloc[0]` picks
# the smallest one on ties; clusters with no observed 'gears' stay NaN.
cluster_modes = df.groupby('cluster')['gears'].agg(
    lambda s: s.mode().iloc[0] if s.notna().any() else np.nan
)
df['gears'] = df['gears'].fillna(df['cluster'].map(cluster_modes))

# Drop the 'cluster' column after imputation
df = df.drop(columns=['cluster'])

# Display a preview of the DataFrame rather than printing every row
df.head()