In [None]:
# To Perform Regex Operation
import re 

# Will return string containing numbers
def find_number(text):
    """Return every ASCII digit found in ``text``, concatenated in order.

    Non-digit characters (currency symbols, commas, spaces, letters) are
    discarded; an input without any digit yields an empty string.
    """
    # Deleting every run of non-digits leaves exactly the digits, in order.
    return re.sub(r'[^0-9]+', '', text)

# For Feature Engineering
def feature_engineering(cars):
    """Turn the raw scraped listings frame into typed model features.

    Only the first seven columns are used. They are renamed, the digits are
    extracted from the price and kilometre strings, the leading year token is
    split off the title, and the year is finally replaced by the vehicle age.

    Parameters
    ----------
    cars : pandas.DataFrame
        Raw frame whose first seven columns are named 'Title', 'cvakb',
        'cvakb1', 'bvr0c', 'bvr0c2', 'bvr0c3' and '_7udzz' and hold strings.
        The frame passed in is not modified.

    Returns
    -------
    pandas.DataFrame
        Columns ``Name, Variant, Transmission, km_driven, Owner_Type, Fuel,
        Price, Age`` with ``km_driven``, ``Price`` and ``Age`` as integers.

    Raises
    ------
    ValueError
        Raised by the integer conversion when a price, kilometre or year
        value cannot be parsed (for example when the value is missing).
    """
    # Local import: `date` is needed for the age computation below.
    from datetime import date

    column_names = {
        'Title': 'Name',
        'cvakb': 'Variant',
        'cvakb1': 'Transmission',
        'bvr0c': 'km_driven',
        'bvr0c2': 'Owner_Type',
        'bvr0c3': 'Fuel',
        '_7udzz': 'Price',
    }

    # First seven columns are relevant. The explicit copy makes the column
    # assignments below act on an independent frame (no SettingWithCopyWarning,
    # caller's frame untouched).
    cars = cars.iloc[:, :7].copy().rename(columns=column_names)

    # Keep only the digits of the price string.
    cars["Price"] = cars["Price"].str.replace(r'[^0-9]', '', regex=True)

    # The title is "<year> <make> <model> ...": the first token is the year of
    # purchase, the next two tokens form the name.
    name_tokens = cars["Name"].str.split()
    cars["Year_Purchased"] = name_tokens.str[0]
    cars["Name"] = name_tokens.str[1:3].str.join(' ')

    # Take the first token of the distance (drops the "km" unit), then keep
    # only its digits (drops thousands separators).
    cars["km_driven"] = (
        cars["km_driven"].str.split().str[0].str.replace(r'[^0-9]', '', regex=True)
    )

    # Remove the transmission type from the end of Variant. `n` must be passed
    # by keyword: pandas 2.x no longer accepts it positionally.
    cars["Variant"] = cars["Variant"].str.rsplit(' ', n=1).str[0]

    # Convert the extracted digit strings to integers.
    cars = cars.astype({"km_driven": "int", "Price": "int", "Year_Purchased": "int"})

    # Derive the age of the vehicle from the year of purchase.
    cars["Age"] = date.today().year - cars["Year_Purchased"]

    return cars.drop(columns=['Year_Purchased'])

In [None]:
# Report the size of the prepared frame; \033[1m / \033[0m switch bold on and off.
shape = cars.shape
# The backslash before "patterns" is escaped explicitly: a bare "\p" is an
# invalid escape sequence (SyntaxWarning on recent Python). Printed text is unchanged.
print(f"There are \033[1m {shape[0]} rows\\patterns \033[0m and \033[1m{shape[1]} features\033[0m.")

In [None]:
# Build [column, null-count] pairs for every column that has at least one null.
missing_by_column = []
for column_name in cars.columns:
    null_count = cars[column_name].isnull().sum()
    if null_count > 0:
        missing_by_column.append([column_name, null_count])
missing_by_column

In [None]:
# Heat map of the boolean null-mask of `cars` (one row per record, one column per feature).
# sns.set() only configures the theme/rcParams and returns None, so its result is not bound to a name.
sns.set(rc={'figure.figsize': (8, 5)})
plt.title("Heat Map for Missing Values")
sns.heatmap(cars.isnull(), yticklabels=False, cbar=False, cmap='viridis')
plt.show()

In [None]:
# Rewrite column labels: every '!' becomes 'text', then every '0' becomes 'index'.
data.columns = (
    data.columns
    .str.replace('!', 'text')
    .str.replace('0', 'index')
)


In [None]:
# Discard, in place, every row of `cars` that contains at least one missing value.
cars.dropna(axis=0, how='any', inplace=True)

In [None]:
# Number of cars per owner type.
# sns.set() returns None, so its result is not bound to a name.
sns.set(rc={'figure.figsize': (10, 8)})
plt.title("Countplot Owner Type Vs Number of Cars")
sns.countplot(x='Owner_Type', data=cars)
plt.show()

In [None]:
# Share of cars per owner type.
# sns.set() returns None, so its result is not bound to a name.
sns.set(rc={'figure.figsize': (10, 8)})
plt.title("Type of Owner Vs Number of cars")
# Wedge sizes and labels must come from the same Series: value_counts() is
# ordered by frequency while unique() is ordered by first appearance, so mixing
# the two can attach labels to the wrong wedges.
owner_counts = cars['Owner_Type'].value_counts()
plt.pie(owner_counts, labels=owner_counts.index, pctdistance=1.1, labeldistance=1.2, autopct='%.2f')
plt.show()

In [None]:
# Price per transmission type (seaborn's barplot aggregates y per x category).
# sns.set() returns None, so its result is not bound to a name.
sns.set(rc={'figure.figsize': (10, 8)})
plt.title("Transmission Vs Selling Price")
# NOTE(review): recent seaborn releases warn when `palette` is given without `hue` — confirm against the installed version.
sns.barplot(x='Transmission', y='Price', data=cars, palette='spring')
plt.show()

In [None]:
# Number of cars per fuel type.
# sns.set() returns None, so its result is not bound to a name.
sns.set(rc={'figure.figsize': (10, 8)})
plt.title("Fuel Vs Number of Cars")
sns.countplot(x='Fuel', data=cars)
plt.show()

In [None]:
# Price per fuel type (seaborn's barplot aggregates y per x category).
# sns.set() returns None, so its result is not bound to a name.
sns.set(rc={'figure.figsize': (10, 8)})
plt.title("Fuel Vs Price")
# NOTE(review): recent seaborn releases warn when `palette` is given without `hue` — confirm against the installed version.
sns.barplot(x='Fuel', y='Price', data=cars, palette='spring')
plt.show()

## Outliers

In [None]:
# Standardise Price: distance from the mean expressed in standard deviations.
cars['zscore'] = (
    cars['Price']
    .sub(cars['Price'].mean())
    .div(cars['Price'].std())
)
cars.head()

In [None]:
# Split into features and target.
# 'zscore' is a linear transform of Price, so leaving it in X would leak the
# target into the features; errors='ignore' keeps this cell working when the
# column has not been created (or was already removed).
X = cars.drop(columns=['Price']).drop(columns=['zscore'], errors='ignore')
y = cars['Price']

In [None]:
# Fit a one-hot encoder on the categorical columns of X.
ohe_columns = ['Name', 'Variant', 'Transmission', 'Owner_Type', 'Fuel']
ohe = OneHotEncoder()
ohe.fit(X[ohe_columns])

In [None]:
def metrics(y_test, y_pred, X_train):
    """Summarise regression quality as a one-row DataFrame.

    Parameters
    ----------
    y_test : array-like
        True target values.
    y_pred : array-like
        Predicted values, compared against ``y_test``.
    X_train : DataFrame-like
        Must support ``len()`` and ``.columns``; its row count and column
        count are used as ``n`` and ``k`` in the adjusted R2 formula.

    Returns
    -------
    pandas.DataFrame
        A single row labelled "Values" with the columns "MAE", "MSE", "RMSE",
        "R2" and "Adjusted-R2". The first three are passed through
        ``format_float``; the last two are stored unformatted.
    """
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test, y_pred)

    # Number of rows
    # NOTE(review): r2 is computed from y_test while n is taken from X_train — confirm this is intended.
    n = len(X_train)

    # Number of independent features
    # NOTE(review): counts the columns of X_train as given; presumably before any encoding — verify against caller.
    k = len(X_train.columns)

    # Adjusted R2 is undefined when n <= k + 1 (zero or negative residual
    # degrees of freedom); report NaN instead of dividing by zero or
    # returning a meaningless value.
    degrees_of_freedom = n - k - 1
    if degrees_of_freedom > 0:
        adj_r2 = 1 - ((1 - r2) * (n - 1) / degrees_of_freedom)
    else:
        adj_r2 = float("nan")

    dict_ = {
        "MAE": [format_float(mae)],
        "MSE": [format_float(mse)],
        "RMSE": [format_float(rmse)],
        "R2": [(r2)],
        "Adjusted-R2": [(adj_r2)]
    }

    results = pd.DataFrame(dict_)
    results.index = ["Values"]

    return results

In [None]:
# Run train_model (not defined in this part of the notebook) with `lr` and
# rebind `lr` to the second returned value.
# NOTE(review): presumably returns (metrics DataFrame, fitted estimator) — confirm against train_model.
metrics_df, lr = train_model(X, y, column_trans, scaler, lr)
# Bare last expression so the notebook renders the result.
metrics_df

In [None]:
# Run train_model (not defined in this part of the notebook) with `ridge` and
# rebind `ridge` to the second returned value; `metrics_df` is overwritten.
# NOTE(review): presumably returns (metrics DataFrame, fitted estimator) — confirm against train_model.
metrics_df, ridge = train_model(X, y, column_trans, scaler, ridge)
# Bare last expression so the notebook renders the result.
metrics_df

In [None]:
# Pairwise correlation of the numeric/boolean columns of `data`.
plt.figure(figsize=(10, 8))
plt.title("Correlation Heat Map")
# Restrict to numeric/boolean columns explicitly: DataFrame.corr() in pandas 2.x
# raises on non-numeric columns instead of silently dropping them.
numeric_data = data.select_dtypes(include=["number", "bool"])
sns.heatmap(numeric_data.corr(), annot=True, cmap="Blues")
plt.show()