In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# import warnings
# warnings.filterwarnings("ignore")
# warnings.warn("this will not show")

# Default size (in inches) for every matplotlib figure created afterwards.
plt.rcParams["figure.figsize"] = (10, 6)

# Seaborn theme with a light grid; floats in DataFrame output are shown with 3 decimals.
sns.set_style("whitegrid")
pd.set_option('display.float_format', lambda x: '%.3f' % x)

# Set it None to display all rows in the dataframe
# pd.set_option('display.max_rows', None)

# Set it to None to display all columns in the dataframe
pd.set_option('display.max_columns', None)

from warnings import simplefilter

In [None]:
import os
# List every file available under the Kaggle input directory.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

from os.path import exists
# Silence ALL warnings for the rest of the notebook.
simplefilter('ignore')
# Seed for NumPy's global RNG (note: the ANN section later rebinds SEED to 42).
SEED = 41
np.random.seed(SEED)

# Pick the directory that holds the competition CSV files.
# NOTE(review): when the '...-competition' directory exists, base_path becomes
# '/kaggle/input' rather than that directory itself, and the '...-competition2'
# path is used without checking that it exists -- confirm this is intended.
if not exists ('/kaggle/input/used-car-price-prediction-competition'):
    base_path = ('/kaggle/input/used-car-price-prediction-competition2')
else:
    base_path = ('/kaggle/input')

# train/test use their first CSV column as the index; the sample submission keeps a default index.
train = pd.read_csv(base_path + '/train.csv', index_col=0)
test = pd.read_csv(base_path + '/test.csv', index_col=0)
submission = pd.read_csv(base_path + '/Sample_Submition.csv')

In [None]:
# import numpy as np
# import pandas as pd

def reduce_mem_usage(df):
    """Downcast the numeric columns of ``df`` in place to reduce memory usage.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to optimise. Its columns are reassigned in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, after the conversions below.

    Notes
    -----
    * Integer columns are cast to the smallest signed integer type
      (int8/int16/int32/int64) whose range contains the column's min and max
      (bounds inclusive). A column that fits none of them is left unchanged.
    * Float columns go through ``pd.to_numeric(..., downcast='float')``.
    * Timezone-aware datetime columns have their timezone removed.
    * Object, category and any other pandas extension dtype are left untouched.
    * Memory usage before/after is printed.
    """
    start_mem = df.memory_usage(deep=True).sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    for col in df.columns:
        col_type = df[col].dtype

        # Timezone-aware datetimes: detect by dtype class. A comparison with the
        # literal string 'datetime64[ns, tz]' never matches a real timezone.
        if isinstance(col_type, pd.DatetimeTZDtype):
            df[col] = df[col].dt.tz_localize(None)  # Remove timezone information
            continue

        # Extension dtypes (category, string, nullable ints, ...) are not NumPy
        # dtypes; np.issubdtype cannot interpret them, so skip them up front.
        if not isinstance(col_type, np.dtype) or col_type == object:
            continue

        if np.issubdtype(col_type, np.integer):
            c_min = df[col].min()
            c_max = df[col].max()
            for candidate in (np.int8, np.int16, np.int32, np.int64):
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        elif np.issubdtype(col_type, np.floating):
            df[col] = pd.to_numeric(df[col], downcast='float')

    end_mem = df.memory_usage(deep=True).sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    # Guard the percentage against a zero-sized starting footprint.
    decrease = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print('Decreased by {:.1f}%'.format(decrease))

    return df

In [None]:
import gc

# df = pd.read_csv("path of your dataset")
# df = reduce_mem_usage(df)


# Downcast the numeric columns of both frames (reduce_mem_usage modifies and returns the same object).
train = reduce_mem_usage(train)
test = reduce_mem_usage(test)

# Ask the garbage collector to run now.
gc.collect()

### Train Set

In [None]:
train.head()

In [None]:
train.describe()

In [None]:
train.info()

In [None]:
train.drop(columns=["id", "state", "posting_date", "lat", "long"]).duplicated().sum()

In [None]:
# Columns that together identify a listing for the purpose of duplicate detection.
col_check = ['year', 'manufacturer', 'model', 'cylinders', 'fuel',
       'odometer', 'transmission', 'drive', 'size', 'type','price']

# Boolean mask of duplicated rows based on the specified columns.
# duplicated() defaults to keep='first': the first occurrence of each group is
# NOT flagged, only the repeats after it are.
dupli_mask = train.duplicated(subset=col_check)

# Select the flagged rows (the repeats only, not the first occurrence)
dupli_rows = train[dupli_mask]

dupli_rows

In [None]:
train[(train["manufacturer"]=="infiniti")&(train["price"]==37590)]

In [None]:
train.drop(dupli_rows.index, axis=0, inplace=True)

In [None]:
round(train.isnull().sum() / train.shape[0] * 100, 2).sort_values()

### Test Set

In [None]:
test

In [None]:
test.drop(columns='id').duplicated().sum()  # There are duplicated rows!!!

In [None]:
test.describe()

In [None]:
test.info()

In [None]:
round(test.isnull().sum() / test.shape[0] * 100, 2).sort_values()

## Data Cleaning

In [None]:
df = train.copy()

In [None]:
df.head(3)

#### Posting Date

In [None]:
# Parse "posting_date" into timezone-aware datetimes normalised to UTC
df['posting_date'] = pd.to_datetime(df['posting_date'], utc=True)
print(df["posting_date"].dtype)

# Express every timestamp in a single zone, 'America/New_York'
# (the same zone is applied to all rows, whatever the listing's state)
df['posting_date'] = df['posting_date'].dt.tz_convert('America/New_York')  # UTC -04:00

# Derive the posting year, and the vehicle age as posting year minus model year
df['posting_year'] = df['posting_date'].dt.year
df["age"] = df["posting_year"] - df["year"]
# df['date'] = df['posting_date'].dt.date
# df['month'] = df['posting_date'].dt.month
# df['day'] = df['posting_date'].dt.day
# df['time'] = df['posting_date'].dt.time

#### Transmission

In [None]:
df["transmission"].value_counts(dropna=False)

In [None]:
# Treat the catch-all 'other' label as missing. np.nan is used because the
# np.NaN alias was removed in NumPy 2.0.
df['transmission'] = df['transmission'].replace('other', np.nan)

#### Fuel

In [None]:
df["fuel"].value_counts(dropna=False)

In [None]:
# Treat the catch-all 'other' label as missing. np.nan is used because the
# np.NaN alias was removed in NumPy 2.0.
df['fuel'] = df['fuel'].replace('other', np.nan)

#### Type

In [None]:
df["type"].value_counts(dropna=False)

In [None]:
# Treat the catch-all 'other' label as missing. np.nan is used because the
# np.NaN alias was removed in NumPy 2.0.
df['type'] = df['type'].replace('other', np.nan)

#### Cylinders

In [None]:
# Use a regular expression to keep only the numeric part of values such as "6 cylinders"
df['cylinders'] = df['cylinders'].str.replace(r'(\d+)\s*cylinders', r'\1', regex=True)

# Replace "other" with NaN (treated as missing)
df['cylinders'] = df['cylinders'].replace('other', np.nan)

# Convert the "cylinders" column to numeric type; anything non-numeric becomes NaN
df['cylinders'] = pd.to_numeric(df['cylinders'], errors='coerce')

# The "cylinders" column now contains the desired numeric values and NaN for missing values

In [None]:
df["cylinders"].value_counts(dropna=False)

In [None]:
# Keep only the columns used from here on, in this order (every other column of df is dropped).
df = df[['manufacturer', 'model', 'title_status', 'transmission', 'fuel', 'odometer', 'state', 'lat', 'long',
         'year', "age", "posting_year", 'posting_date', 'price',   #, 'month', 'day', 'time', 'date'
         'cylinders','drive', 'size', 'type', 'condition', 'paint_color']]

In [None]:
df

## Missing Value Handling

In [None]:
df.isnull().sum().sort_values(ascending=False)

In [None]:
df.dropna().sample(5)

In [None]:
def fill_missing(df, fill_col, use_col):
    """Fill missing values of ``fill_col`` in place, using rows that share ``use_col``.

    Parameters
    -------------------
    df : Dataset.
        Dataset which has null values. The ``fill_col`` column is reassigned in place.

    fill_col : Text.
        The column name whose missing values will be filled.

    use_col : Text or List of Text.
        The referenced column name(s) used to group rows.

    Returns
    -------------------
    None

    Raises
    -------------------
    ValueError
        If ``use_col`` is neither a string nor a list.

    Notes
    -------------------
    Filling happens in three steps:

    1. Inside each ``use_col`` group, take the previous known value (forward
       fill), or the next known value of the same group when there is none.
    2. Values that were already present are always kept, including rows whose
       group key is missing and that groupby therefore ignores.
    3. Whatever is still missing is back-filled from the following rows of the
       whole column. This last-resort fallback keeps the original behaviour;
       trailing gaps with nothing after them stay NaN.
    """
    if isinstance(use_col, str):
        # When 'use_col' is a single column
        group_keys = [use_col]
    elif isinstance(use_col, list):
        # When 'use_col' is a list of multiple columns
        group_keys = use_col
    else:
        raise ValueError("The 'use_col' parameter must be a single column name or a list of column names.")

    grouped = df.groupby(group_keys)[fill_col]
    # Forward fill first, then let the group's own back fill cover what remains,
    # so a leading gap is never filled from a different group.
    within_group = grouped.ffill().fillna(grouped.bfill())

    # Start from the original column so existing values are never overwritten.
    filled = df[fill_col].fillna(within_group)

    # Last resort for groups without any known value: global back fill.
    df[fill_col] = filled.bfill()

#### Fuel Column

In [None]:
fill_missing(df, "fuel", "model")

In [None]:
df["fuel"].isnull().sum()

#### Latitude - Longitude Columns

In [None]:
df[["lat", "long"]].isnull().sum()

In [None]:
# Per-state first and third quartiles of latitude and longitude.
s_lat_Q1 = df.groupby('state')[['lat', "long"]].quantile(0.25)
s_lat_Q1.reset_index(inplace=True)
s_lat_Q3 = df.groupby('state')[['lat', "long"]].quantile(0.75)
s_lat_Q3.reset_index(inplace=True)

# One row per state with columns lat_Q1, long_Q1, lat_Q3, long_Q3.
df2 = pd.merge(s_lat_Q1, s_lat_Q3, on="state", how="outer", suffixes=('_Q1', '_Q3'))
df2.reset_index(drop=True, inplace=True)

# Interquartile range of each coordinate.
df2["IQR_lat"] = df2["lat_Q3"] - df2["lat_Q1"]
df2["IQR_long"] = df2["long_Q3"] - df2["long_Q1"]

# Fences at Q1 - lim*IQR and Q3 + lim*IQR; coordinates outside them are
# treated as outliers by the next cell.
lim = 1.5
df2["low_lat"] = df2["lat_Q1"] - lim * df2["IQR_lat"]
df2["low_long"] = df2["long_Q1"] - lim * df2["IQR_long"]
df2["hi_lat"] = df2["lat_Q3"] + lim * df2["IQR_lat"]
df2["hi_long"] = df2["long_Q3"] + lim * df2["IQR_long"]

df2.head(5)

In [None]:
# Blank out (set to NaN) every latitude/longitude lying outside its state's
# [low, hi] fence from df2. Vectorised: the fences are looked up for all rows
# at once instead of filtering df2 once per row inside an iterrows() loop.
state_bounds = df2.set_index('state')  # df2 has exactly one row per state

low_lat = df['state'].map(state_bounds['low_lat'])
hi_lat = df['state'].map(state_bounds['hi_lat'])
low_long = df['state'].map(state_bounds['low_long'])
hi_long = df['state'].map(state_bounds['hi_long'])

df_modified = df.copy()

# between() is inclusive on both ends, like `low <= value <= hi`. A missing
# coordinate, or a state without fences, compares False and ends up as NaN.
df_modified['lat'] = df['lat'].where(df['lat'].between(low_lat, hi_lat))
df_modified['long'] = df['long'].where(df['long'].between(low_long, hi_long))

df_modified.sample(5)

In [None]:
df_modified[["lat", "long"]].isnull().sum()

In [None]:
df = df_modified

In [None]:
# Replace missing coordinates with the mean coordinate of the row's state.
# groupby(...).mean() already ignores NaN, and map() + fillna() does the lookup
# for all rows at once (no row-wise apply, no KeyError when a state is missing).
state_lat_mean = df.groupby("state")["lat"].mean()
df["lat"] = df["lat"].fillna(df["state"].map(state_lat_mean))

state_long_mean = df.groupby("state")["long"].mean()
df["long"] = df["long"].fillna(df["state"].map(state_long_mean))

In [None]:
df[["lat", "long"]].isnull().sum()

#### Manufacturer

In [None]:
fill_missing(df, "manufacturer", "model")

In [None]:
df["manufacturer"].isnull().sum()

#### Type

In [None]:
fill_missing(df, "type", ["manufacturer", "model", "fuel", "cylinders"])

In [None]:
df["type"].isnull().sum()

#### Drive

In [None]:
fill_missing(df, "drive", ["manufacturer", "model", "cylinders"])

In [None]:
df["drive"].isnull().sum()

#### Cylinders

In [None]:
df.loc[df['fuel'] == 'electric', 'cylinders'] = 0

In [None]:
fill_missing(df, "cylinders", ["manufacturer", "model", "fuel", "drive"])

In [None]:
df["cylinders"].isnull().sum()

#### Size

In [None]:
fill_missing(df, "size", ["manufacturer", "model", "type"])

In [None]:
df["size"].isnull().sum()

#### Transmission

In [None]:
df.loc[df['fuel'] == 'electric', 'transmission'] = "automatic"

In [None]:
fill_missing(df, "transmission", ["manufacturer", "model", "drive", "type"])

In [None]:
df["transmission"].isnull().sum()

## Outlier Detection

In [None]:
# 'number' selects every numeric dtype. ['int', 'float'] leaves out the
# int8/int16 columns that reduce_mem_usage can produce.
df_num = df.select_dtypes(include='number')
df_num.columns

In [None]:
df_obj = df.select_dtypes(include='object')
df_obj.columns

### Numeric Columns

#### Odometer

In [None]:
df.drop(df[df['odometer'] > 2400000].index, axis=0, inplace=True)

#### Year

In [None]:
df.drop(df[df['year'] < 1925].index, axis=0, inplace=True)

#### Age

In [None]:
df.loc[df['age'] < 0, 'age'] = 0
# df.drop(df[df["age"] < 0].index, axis=0, inplace=True)

In [None]:
# df.drop(df[df['age'] > 76].index, axis=0, inplace=True)

#### Price

In [None]:
df.drop(df[df['price'] > 350000].index, axis=0, inplace=True)

In [None]:
corr = df[['odometer', 'year', 'age', 'price', 'cylinders']].corr()  #, 'month', 'day'

In [None]:
# Hide the upper triangle (diagonal included): the matrix is symmetric.
mask = np.triu(np.ones_like(corr, dtype=bool))

sns.heatmap(corr, mask=mask, cmap='coolwarm', ax=None, linewidths=1.5, center=0, annot=True,
            fmt='.2f', square=True, xticklabels=(corr.index), yticklabels=(corr.columns))

plt.xticks(rotation=30)
plt.yticks(rotation=30)
# The string 'off' is truthy and therefore switches the grid ON; pass False to hide it.
plt.grid(False)
plt.show()

In [None]:
df.drop(["posting_year"], axis=1, inplace=True)  #, "month", "day", "date"

In [None]:
df.sort_values(by="posting_date", inplace=True)

In [None]:
df.reset_index(inplace=True)

#### Latitude and Longitude

In [None]:
plt.figure(figsize=(10, 6))
sns.scatterplot(x='long', y='lat', data=df, hue="state", legend=False);
plt.xlabel("")
plt.xticks([])
plt.ylabel("")
plt.yticks([])
plt.show()

### Object Columns

#### Model

In [None]:
# Take the 15287 least frequent values of "model" (NaN is counted as a value
# because of dropna=False).
# NOTE(review): 15287 is a hard-coded count tied to one specific version of the
# data -- confirm it still matches the intended frequency cut-off.
drop_model = df["model"].value_counts(dropna=False).tail(15287).index

# Create a boolean mask to filter rows with not desired values in the 'model' column
mask_drop_model = df['model'].isin(drop_model)

# Drop rows where 'model' column has not desired values
df = df[~mask_drop_model]
df.shape

In [None]:
fill_missing(df, "model", ["manufacturer", "fuel", "drive", "size", "type"])

In [None]:
df.drop(["title_status", "condition", "paint_color"], axis=1, inplace=True)

#### Feature Engineering

In [None]:
# Distance driven per year of age. Where age is 0 the value is 1/0 = inf, and
# the next line drops every row whose "km/year" is inf (i.e. all age-0 rows).
df["km/year"] = np.where(df["age"] != 0, df["odometer"] / df["age"], 1 / df["age"])
df.drop(df[df["km/year"] == np.inf].index, axis=0, inplace=True)
# df["km/year"].replace([np.inf, -np.inf], 2400000, inplace=True)

def calculate_power(row):
    """Return the engineered "pow" value for one row.

    The value is ``factor * 10 * cylinders / age`` where ``factor`` depends on
    the fuel type (gas 3, diesel 4, hybrid 1, electric 2). ``None`` is returned
    for any other fuel value.
    """
    fuel_type, n_cyl, car_age = row['fuel'], row['cylinders'], row["age"]

    # Fuel-specific multiplier; unknown fuel types have no entry.
    fuel_factor = {'gas': 3, 'diesel': 4, 'hybrid': 1, 'electric': 2}.get(fuel_type)
    if fuel_factor is None:
        return None  # Handle unknown fuel types if needed

    return (fuel_factor * 10 * n_cyl) / car_age

# Apply the custom function to create the "pow" column
df['pow'] = df.apply(calculate_power, axis=1)

In [None]:
df.reset_index(drop=True, inplace=True)

In [None]:
# 'number' selects every numeric dtype. ['int', 'float'] leaves out the
# int8/int16 columns that reduce_mem_usage can produce.
df_num = df.select_dtypes(include='number')
df_num.columns

In [None]:
df_obj = df.select_dtypes(include='object')
df_obj.columns

In [None]:
df

In [None]:
df.isnull().sum().sort_values(ascending=False)

## Modeling

In [None]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

from sklearn.model_selection import cross_validate

In [None]:
df.describe()

In [None]:
df.info()

In [None]:
for col in df_obj:
    print(f"{col:<20}:", df[col].nunique())

In [None]:
df_num.corr()[(df_num.corr()>= 0.9) & (df_num.corr() < 1)].any().any()

In [None]:
df_num.corr()[(df_num.corr()<= -0.9) & (df_num.corr() > -1)].any().any()

In [None]:
# Compute the correlation matrix once and reuse it for the mask, the data and the labels.
num_corr = df_num.corr()

# Hide the upper triangle (diagonal included): the matrix is symmetric.
mask = np.triu(np.ones_like(num_corr, dtype=bool))

sns.heatmap(num_corr, mask=mask, cmap='coolwarm', ax=None, linewidths=1.5, center=0, annot=True,
            fmt='.2f', square=True, xticklabels=(num_corr.index), yticklabels=(num_corr.columns))

plt.xticks(rotation=30)
plt.yticks(rotation=30)
# The string 'off' is truthy and therefore switches the grid ON; pass False to hide it.
plt.grid(False)
plt.show()

In [None]:
df.sample(3)

In [None]:
def train_val(model, X_train, y_train, X_test, y_test):
    """Score an already fitted model on the train and test splits.

    Returns a DataFrame with one column per split ("train", "test") and one
    row per metric ("R2", "mae", "mse", "rmse").
    """

    def _regression_report(actual, predicted):
        # The four metrics for a single split; rmse is the square root of mse.
        squared_error = mean_squared_error(actual, predicted)
        return {
            "R2": r2_score(actual, predicted),
            "mae": mean_absolute_error(actual, predicted),
            "mse": squared_error,
            "rmse": np.sqrt(squared_error),
        }

    test_predictions = model.predict(X_test)
    train_predictions = model.predict(X_train)

    report = {
        "train": _regression_report(y_train, train_predictions),
        "test": _regression_report(y_test, test_predictions),
    }
    return pd.DataFrame(report)

### CatBoost

In [None]:
X = df.drop(["index", "year", "posting_date", "price"], axis=1)  #, "model", "time"
y = df.price

In [None]:
cat = X.select_dtypes("object").columns
cat

In [None]:
import catboost

In [None]:
# cb_learn_rate = 0.006
# n_iterations = 80000
# early_stop_rounds = 400

# opt_catboost_params = {'iterations' : n_iterations,
#                        'learning_rate' : cb_learn_rate,
#                        'depth': 7,
#                        'bootstrap_type' : 'Bernoulli',
#                        'random_strength': 1,
#                        'min_data_in_leaf': 10,
#                        'l2_leaf_reg': 3,
#                        'loss_function' : 'RMSE', 
#                        'eval_metric' : 'RMSE',
#                        'grow_policy' : 'Depthwise',
#                        'max_bin' : 1024, 
#                        'model_size_reg' : 0,
#                        'task_type' : 'GPU',
#                        'od_type' : 'IncToDec',
#                        'od_wait' : 100,
#                        'metric_period' : 500,
#                        'verbose' : 500,
#                        'subsample' : 0.8,
#                        'od_pval' : 1e-10,
#                        'max_ctr_complexity' : 8,
#                        'has_time': False,
#                        'simple_ctr' : 'FeatureFreq',
#                        'combinations_ctr': 'FeatureFreq',
#                        'random_seed' : 13}

In [None]:
# cb_reg = CatBoostRegressor(**opt_catboost_params)

# cb_reg.fit(X_train, y_train, eval_set=(X_val, y_val), 
#            use_best_model=True, plot=True, 
#            early_stopping_rounds=early_stop_rounds)

In [None]:
from catboost import CatBoost

# cb = CatBoost(params={'iterations':100, 'verbose':10, 'loss_function':'RMSE', 'task_type':"GPU", 'devices':'0'})

In [None]:
# X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# X_train.shape, X_val.shape, y_train.shape, y_val.shape

In [None]:
# cb.fit(X_train, y_train, plot=True)
# cb.set_feature_names(X.feature_names)

In [None]:
# train_preds = cb.predict(X_train)
# val_preds = cb.predict(X_val)

In [None]:
from catboost.utils import eval_metric

# print("Train R2 : %.2f"%eval_metric(Y_train, train_preds, "R2")[0])
# print("Validation  R2 : %.2f"%eval_metric(Y_val, val_preds, "R2")[0])

In [None]:
from catboost import utils

gpu_cnt = utils.get_gpu_device_count()

print("Number of GPU Count : ", gpu_cnt)

In [None]:
# cb.shrink(ntree_end=50)

# train_preds = cb.predict(X_train)
# val_preds = cb.predict(X_val)

# print("Train R2 : %.2f"%eval_metric(Y_train, train_preds, "R2")[0])
# print("Validation  R2 : %.2f"%eval_metric(Y_val, val_preds, "R2")[0])

**CatBoost Regressor**

In [None]:
from catboost import CatBoostRegressor

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

cb = CatBoostRegressor(iterations=100, verbose=10, task_type="GPU", devices="0")

# The object-typed columns (collected in `cat`) have to be declared as
# categorical; otherwise CatBoost tries to read their strings as numbers.
cb.fit(X_train, y_train, cat_features=cat.tolist(), eval_set=(X_val, y_val),
       early_stopping_rounds=5, plot=True,
       save_snapshot=True, snapshot_file="cb_snapshots.temp", snapshot_interval=1)

# A DataFrame has no `feature_names` attribute; its feature names are the column labels.
cb.set_feature_names(list(X.columns))

train_preds = cb.predict(X_train)
val_preds = cb.predict(X_val)


from catboost.utils import eval_metric

# For a regressor, score() reports R2.
print("Train R2 : %.2f"%cb.score(X_train, y_train))
print("\nValidation  R2 : %.2f"%cb.score(X_val, y_val))

#### Grid Search

In [None]:
from catboost import Pool

# The base CatBoost class takes a single dict of parameters, not keyword arguments.
cb = CatBoost({'task_type': 'GPU'})

params = {
            'iterations':[10, 50],
            'learning_rate':[0.01, 0.1],
            'bootstrap_type':['Bayesian', 'Bernoulli', 'No']
}

# Wrap the splits in Pools so the object-typed columns are declared as categorical.
train_pool = Pool(X_train, y_train, cat_features=cat.tolist())
val_pool = Pool(X_val, y_val, cat_features=cat.tolist())

search_results = cb.grid_search(params, train_pool, cv=5)

print("\nBest Params : ", search_results['params'])

# Predict on the same Pools that were used for the search.
train_preds = cb.predict(train_pool)
val_preds = cb.predict(val_pool)

print("Train R2 : %.2f"%eval_metric(y_train, train_preds, "R2")[0])
print("\nValidation  R2 : %.2f"%eval_metric(y_val, val_preds, "R2")[0])

In [None]:
cv_results = pd.DataFrame(search_results["cv_results"])

cv_results.head()

In [None]:
# Declare the object-typed columns as categorical on the estimator so that the
# grid search can consume the raw DataFrame.
cb = CatBoostRegressor(cat_features=cat.tolist())

params = {
            'iterations':[10, 50],
            'learning_rate':[0.01, 0.1],
            'bootstrap_type':['Bayesian', 'No']
}

search_results = cb.grid_search(params, X_train, y_train, cv=5)

print("\nBest Params : ", search_results['params'])

#### Pool

In [None]:
from catboost import Pool

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Pools carry the data, the target and the list of categorical columns.
train_pool = Pool(X_train, y_train, cat.tolist())
val_pool = Pool(X_val, y_val, cat.tolist())

cb = catboost.train(pool=train_pool, eval_set=val_pool,
                    params={
                        'iterations':300,
                        'verbose':10,
                        'loss_function':'RMSE',
                                                 })

print()
print(cb)

cb.set_feature_names(list(X.columns))

# Keep only the first 90 trees of the trained ensemble.
cb.shrink(ntree_end=90)

# Predict with the model trained in THIS cell. Without these two lines the R2
# below would be computed from predictions left over from an earlier model.
train_preds = cb.predict(train_pool)
val_preds = cb.predict(val_pool)

from sklearn.metrics import r2_score

# Calculate R2 scores using scikit-learn's r2_score function
train_r2 = r2_score(y_train, train_preds)
val_r2 = r2_score(y_val, val_preds)

# Print the R2 scores
print("Train R2 : %.2f" % train_r2)
print("Validation R2 : %.2f" % val_r2)

In [None]:
from catboost import Pool
from catboost.utils import eval_metric

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Pools carry the data, the target and the list of categorical columns.
train_pool = Pool(X_train, y_train, cat.tolist())
val_pool = Pool(X_val, y_val, cat.tolist())

cb = CatBoost({'verbose':10, 'iterations':100, 'task_type':'GPU', 'devices':'0'})
cb.fit(train_pool, eval_set=val_pool, early_stopping_rounds=5, plot=True,
       save_snapshot=True, snapshot_file="catboost_snapshots.temp",snapshot_interval=1)
cb.set_feature_names(list(X.columns))

# Predict with the model trained in THIS cell. Without these two lines the R2
# below would be computed from predictions left over from an earlier model.
train_preds = cb.predict(train_pool)
val_preds = cb.predict(val_pool)

from sklearn.metrics import r2_score

# Calculate R2 scores using scikit-learn's r2_score function
train_r2 = r2_score(y_train, train_preds)
val_r2 = r2_score(y_val, val_preds)

# Print the R2 scores
print("Train R2 : %.2f" % train_r2)
print("Validation R2 : %.2f" % val_r2)

In [None]:
# Positions of the features ordered from least to most important.
sorted_feature_importance = cb.feature_importances_.argsort()

# X is a DataFrame: the feature names are its column labels (it has no
# `feature_names` attribute).
plt.barh(X.columns[sorted_feature_importance],
        cb.feature_importances_[sorted_feature_importance],
        color='purple')

plt.xlabel("CatBoost Feature Importance")

In [None]:
# cb1.compare(cb2, data=Pool(X_val,y_val), metrics=["R2", "RMSE", "MAE"])

### ANN Model

In [None]:
X = df.drop(["index", "model", "year", "posting_date", "price"], axis=1)  #, "time"
y = df.price

In [None]:
X.columns

In [None]:
cat = X.select_dtypes("object").columns
cat

In [None]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=42)

print("Train features shape : ", X_train.shape)
print("Train target shape   : ", y_train.shape)
print("Test features shape  : ", X_test.shape)
print("Test target shape    : ", y_test.shape)

In [None]:
X_train.columns

In [None]:
X_test.columns

In [None]:
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder  #, LabelEncoder
# from sklearn import preprocessing

In [None]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler  #, RobustScaler

In [None]:
from sklearn.compose import make_column_transformer

In [None]:
# One-hot encode the categorical columns (categories unseen during fit are
# ignored at transform time); every remaining column goes through MinMaxScaler.
# NOTE(review): the `sparse` keyword is release-dependent -- newer scikit-learn
# releases spell it `sparse_output`; confirm against the installed version.
ohe_enc = OneHotEncoder(handle_unknown="ignore", sparse=False)
column_trans_minmax = make_column_transformer((ohe_enc, cat), remainder=MinMaxScaler())
column_trans_minmax

In [None]:
X_train_mm = column_trans_minmax.fit_transform(X_train)
X_test_mm = column_trans_minmax.transform(X_test)

In [None]:
# Same encoder as the min-max variant, but the remaining columns are standardised.
# NOTE(review): the `sparse` keyword is release-dependent -- newer scikit-learn
# releases spell it `sparse_output`; confirm against the installed version.
ohe_enc = OneHotEncoder(handle_unknown="ignore", sparse=False)  #, unknown_value=-1
column_trans_standart = make_column_transformer((ohe_enc, cat), remainder=StandardScaler())

In [None]:
X_train_st = column_trans_standart.fit_transform(X_train)
X_test_st = column_trans_standart.transform(X_test)

In [None]:
X_train_mm.shape  #X_train_st.shape

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation, Dropout
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import Adam

In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [None]:
from tqdm import tqdm
# from keras_tqdm import TQDMCallback
from tqdm.keras import TqdmCallback

In [None]:
# from tensorflow.keras import backend as K

In [None]:
# def rmse_func(y_true, y_pred):
#         return K.sqrt(K.mean(K.square(y_pred - y_true)))

In [None]:
SEED = 42
tf.random.set_seed(SEED)

# Widths of the ReLU hidden layers, from the input side to the output side.
hidden_units = [200, 200, 180, 180, 160, 160, 140, 120, 80, 60, 60, 40, 40, 20, 10, 4, 4]

modelANN = Sequential()

# The first hidden layer also declares the input width (number of encoded features).
modelANN.add(Dense(hidden_units[0], activation="relu", input_dim=X_train_mm.shape[1]))
for units in hidden_units[1:]:
    modelANN.add(Dense(units, activation="relu"))

# Single linear output unit for the regression target.
modelANN.add(Dense(1, activation="linear"))

modelANN.summary()

# Adam optimiser, mean absolute error as the training loss.
modelANN.compile(optimizer=Adam(learning_rate=0.03), loss="mae")
                                #, metrics=[tf.keras.metrics.MeanSquaredError()]"mse", "mae"

In [None]:
# Stop once the validation loss has not improved for 35 epochs and roll the
# weights back to the best epoch seen.
earlyStoppingCallback = EarlyStopping(monitor='val_loss', patience=35, restore_best_weights=True)  #val_mean_squared_error

# 20% of the training data is held out as the validation set; Keras' own log
# is silenced (verbose=0) and progress is reported by the tqdm callback.
modelANN.fit(x=X_train_mm,
          y=y_train,
          validation_split=0.2,
          batch_size=1024,
          epochs=1000,
          verbose=0,
          callbacks=[earlyStoppingCallback, TqdmCallback()])

In [None]:
loss_model = pd.DataFrame(modelANN.history.history)
loss_model.plot();

In [None]:
y_pred = modelANN.predict(X_test_mm)

In [None]:
train_val(modelANN, X_train_mm, y_train, X_test_mm, y_test)

#### Final Model

In [None]:
X_train.columns

In [None]:
X_new = df[['age', 'cylinders', 'fuel', 'odometer', 'long', 'manufacturer', 'model', 'lat']]

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_new, y, test_size=0.1, random_state=101)

In [None]:
# XGBRegressor and Pipeline are used below but were never imported anywhere in
# the notebook, so this cell raised NameError on a fresh kernel.
from sklearn.pipeline import Pipeline
from xgboost import XGBRegressor

cat_new = X_new.select_dtypes("object").columns

# Integer-encode the categorical columns; categories unseen during fit map to -1.
ord_enc = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)

# Encoded categorical columns come first, the other columns are passed through unchanged.
column_trans = make_column_transformer((ord_enc, cat_new),
                                       remainder='passthrough')

operations = [("OrdinalEncoder", column_trans),
              ("model_final",
               XGBRegressor(colsample_bytree=0.9,  #1
                            learning_rate=0.04,  #0.3
                            max_depth=12,  #6
                            n_estimators=15000,  #700
                            subsample=1,  #0.8
                            max_leaves=28,
                            objective='reg:squarederror',
                            random_state=42,
                            # NOTE(review): 'gpu_hist' needs a GPU build and is
                            # release-dependent -- confirm against the installed xgboost.
                            tree_method='gpu_hist',
#                             sampling_method="gradient_based",  #Used only by `gpu_hist` tree method.`uniform`, `gradient_based`
#                             grow_policy="lossguide"  #"depthwise"
#                             booster="gbtree"  #gbtree,
                                                #dart:performs dropouts. Use preds=bst.predict(dtest, iteration_range=(0, num_round)) for test set
                           ))]

model_p_f = Pipeline(steps=operations)
model_p_f.fit(X_train, y_train)  #, eval_metric="rmse", verbose=True, early_stopping_rounds=10, eval_set=eval_set

train_val(model_p_f, X_train, y_train, X_test, y_test)

In [None]:
# Final Model Results:

# 	train	test
# R2	0.83	0.78
# mae	3807.07	4148.53
# mse	36898028.85	45882339.25
# rmse	6074.37	6773.65

## Prediction

### Preprocess of Test Set

In [None]:
test.describe()

In [None]:
test.info()

In [None]:
X_new.columns

In [None]:
# Parse "posting_date" into timezone-aware datetimes normalised to UTC
test['posting_date'] = pd.to_datetime(test['posting_date'], utc=True)

# Express every timestamp in a single zone, 'America/New_York'
# (the same zone is applied to all rows, whatever the listing's state)
test['posting_date'] = test['posting_date'].dt.tz_convert('America/New_York')  # UTC -04:00

# Same derived features as for the training frame: posting year, and age as
# posting year minus model year.
test['posting_year'] = test['posting_date'].dt.year
test["age"] = test["posting_year"] - test["year"]

In [None]:
test["posting_date"].min(), test["posting_date"].max(), test["posting_date"].max() - test["posting_date"].min()

#### Transmission

In [None]:
test["transmission"].value_counts(dropna=False)

In [None]:
# Treat the catch-all 'other' label as missing. np.nan is used because the
# np.NaN alias was removed in NumPy 2.0.
test['transmission'] = test['transmission'].replace('other', np.nan)

In [None]:
fill_missing(test, "transmission", ["manufacturer", "model", "drive", "type"])

In [None]:
test["transmission"].isnull().sum()

#### Fuel

In [None]:
test["fuel"].value_counts(dropna=False)

In [None]:
# Treat the catch-all 'other' label as missing. np.nan is used because the
# np.NaN alias was removed in NumPy 2.0.
test['fuel'] = test['fuel'].replace('other', np.nan)

In [None]:
fill_missing(test, "fuel", "model")

In [None]:
test["fuel"].isnull().sum()

#### Type

In [None]:
test["type"].value_counts(dropna=False)

In [None]:
# Treat the catch-all 'other' label as missing. np.nan is used because the
# np.NaN alias was removed in NumPy 2.0.
test['type'] = test['type'].replace('other', np.nan)

In [None]:
fill_missing(test, "type", ["manufacturer", "model", "fuel", "cylinders"])

In [None]:
test["type"].isnull().sum()

#### Cylinders

In [None]:
# Use a regular expression to keep only the numeric part of values such as "6 cylinders"
test['cylinders'] = test['cylinders'].str.replace(r'(\d+)\s*cylinders', r'\1', regex=True)

# Replace "other" with NaN (treated as missing)
test['cylinders'] = test['cylinders'].replace('other', np.nan)

# Convert the "cylinders" column to numeric type; anything non-numeric becomes NaN
test['cylinders'] = pd.to_numeric(test['cylinders'], errors='coerce')

# The "cylinders" column now contains the desired numeric values and NaN for missing values

In [None]:
test.loc[test['fuel'] == 'electric', 'cylinders'] = 0

In [None]:
fill_missing(test, "cylinders", ["manufacturer", "model", "fuel", "drive"])

In [None]:
test["cylinders"].isnull().sum()

#### Latitude and Longitude

In [None]:
# Number of non-missing latitude / longitude values per state.
lats = pd.DataFrame(test.groupby('state')['lat'].count())
lats = lats.reset_index()
longs = pd.DataFrame(test.groupby('state')['long'].count())
longs = longs.reset_index()

plt.figure(figsize=(16, 20))

# Plot only the numeric count column: after reset_index() the frames also hold
# the string 'state' column, which boxplot cannot summarise.
plt.subplot(211)
plt.boxplot(lats['lat'],
#             whis=2.5
           )

plt.subplot(212)
plt.boxplot(longs['long'],
#             whis=2.5
           )

plt.show()

In [None]:
def _first_mode(values):
    # Most frequent value of the group, or NaN when the group has no known value
    # (mode() is empty then, and .iloc[0] would raise IndexError).
    modes = values.mode()
    return modes.iloc[0] if not modes.empty else np.nan

# Group by the "state" column and calculate the mode of "lat" and "long" for each state group
state_mode_lat = test.groupby('state')['lat'].transform(_first_mode)
state_mode_long = test.groupby('state')['long'].transform(_first_mode)

# Fill missing values in "lat" and "long" columns with the mode of each state group
test['lat'] = test['lat'].fillna(state_mode_lat)
test['long'] = test['long'].fillna(state_mode_long)

In [None]:
test[["lat", "long"]].isnull().sum()

#### Manufacturer

In [None]:
fill_missing(test, "manufacturer", "model")

In [None]:
test["manufacturer"].isnull().sum()

#### Drive

In [None]:
fill_missing(test, "drive", ["manufacturer", "model", "fuel", "cylinders"])

In [None]:
test["drive"].isnull().sum()

#### Size

In [None]:
fill_missing(test, "size", ["manufacturer", "model", "type"])

In [None]:
test["size"].isnull().sum()

In [None]:
sns.heatmap(test.isnull(), yticklabels=False, cbar=False, cmap='viridis');
plt.xticks(rotation=45)
plt.show()

In [None]:
test.describe()

In [None]:
# test.drop(test[test['odometer'] > 1000000].index, axis=0, inplace=True)

In [None]:
test.loc[test['odometer'] > 1000000, 'odometer'] = 1000000

In [None]:
# test.drop(test[test['year'] < 1948].index, axis=0, inplace=True)

In [None]:
test.loc[test['year'] < 1948, 'year'] = 1948

In [None]:
# test.drop(test[test["age"] < 0].index, axis=0, inplace=True)

In [None]:
# Clip every negative age to 0, as is done for the training frame
# (replacing only -1 would leave any other negative age untouched).
test.loc[test['age'] < 0, 'age'] = 0

In [None]:
# test.drop(test[test['age'] > 68].index, axis=0, inplace=True)

In [None]:
# test.loc[test['age'] > 67, 'age'] = 67

In [None]:
corr = test[['odometer', 'year', 'age', 'cylinders']].corr()

In [None]:
# Hide the upper triangle (diagonal included): the matrix is symmetric.
mask = np.triu(np.ones_like(corr, dtype=bool))

sns.heatmap(corr, mask=mask, cmap='coolwarm', ax=None, linewidths=1.5, center=0, annot=True,
            fmt='.2f', square=True, xticklabels=(corr.index), yticklabels=(corr.columns))

plt.xticks(rotation=45)
plt.yticks(rotation=45)
# The string 'off' is truthy and therefore switches the grid ON; pass False to hide it.
plt.grid(False)
plt.show()

In [None]:
test.columns

In [None]:
test.drop(["posting_year", "title_status", "condition", "paint_color"], axis=1, inplace=True)

In [None]:
# test.sort_values(by="posting_date", inplace=True)

In [None]:
test.reset_index(inplace=True)

In [None]:
test.isnull().sum().sort_values(ascending=False)

In [None]:
test.shape, submission.shape

In [None]:
X_new.columns

In [None]:
test.columns

In [None]:
test["pred"] = model_p_f.predict(test[X_new.columns])

In [None]:
test["pred"].describe()

In [None]:
test.info()

In [None]:
plt.figure(figsize=(14, 8))
sns.histplot(x='pred', data=test, bins=35, legend=False);
plt.xlabel("Predictions")
plt.show()

In [None]:
test[test["pred"] < 0]

In [None]:
# Smallest positive prediction inside each (fuel, cylinders) group.
min_pred = test[test["pred"] > 0].groupby(["fuel", "cylinders"])["pred"].min()

test = test.merge(min_pred.reset_index(), on=["fuel", "cylinders"], how="left", suffixes=("", "_min"))

# Groups without any positive prediction (or with a missing fuel/cylinders key)
# have no "pred_min"; fall back to the smallest positive prediction overall so
# that a negative prediction is never replaced by NaN.
overall_min = test.loc[test["pred"] > 0, "pred"].min()
replacement = test["pred_min"].fillna(overall_min)

# Replace only the negative predictions; everything else is kept as is.
test["pred"] = test["pred"].mask(test["pred"] < 0, replacement)

# Drop the auxiliary columns used for merging
test.drop(columns=["pred_min"], inplace=True)

In [None]:
test.info()

## Submission

In [None]:
submission

In [None]:
submission = test[["id", "pred"]]
submission

In [None]:
submission.info()

In [None]:
submission.describe()

In [None]:
submission.to_csv("submission.csv", index=False)