In [1]:
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import OneHotEncoder,OrdinalEncoder,LabelEncoder

from sklearn.model_selection import train_test_split,RandomizedSearchCV,GridSearchCV
from sklearn.linear_model import LinearRegression,Ridge,Lasso

from sklearn.metrics import mean_absolute_error,mean_squared_error,r2_score
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [2]:
import warnings
# NOTE: this silences every warning for the whole session, including
# deprecation notices from pandas and scikit-learn.
warnings.filterwarnings("ignore")
from IPython.display import display
# Show every column and every row when a frame is displayed (no truncation).
pd.options.display.max_columns = None
pd.options.display.max_rows = None

## 1) problem statement

## 2) Data quality check

### data gathering and data analysis

In [3]:
# Load the raw data set. The file uses "?" as its missing-value placeholder
# (see the replace cells below), so declare it at read time: numeric columns
# are then parsed as numbers instead of strings and the gaps become NaN.
df = pd.read_csv("autos_dataset.csv", na_values="?")
df.head()

FileNotFoundError: [Errno 2] No such file or directory: 'autos_dataset.csv'

In [None]:
df.info()

In [None]:
df.boxplot()

## 3) EDA

In [None]:
# df["normalized-losses"].value_counts().sort_values()

In [None]:
# Turn the "?" placeholders in normalized-losses into NaN. The result is
# assigned back to the frame: an in-place replace on a column selection may
# act on a temporary object and leave `df` unchanged (pandas Copy-on-Write).
df["normalized-losses"] = df["normalized-losses"].replace({"?": np.nan})

In [None]:
# df["bore"].value_counts().sort_values()

In [None]:
# Turn the "?" placeholders in bore into NaN. The result is assigned back to
# the frame: an in-place replace on a column selection may act on a temporary
# object and leave `df` unchanged (pandas Copy-on-Write).
df["bore"] = df["bore"].replace({"?": np.nan})

In [None]:
# Replace every "?" placeholder in the whole frame with NaN (in place).
df.replace(to_replace="?", value=np.nan, inplace=True)

In [None]:
df.head()

In [None]:
df.describe()

In [None]:
# Count, per make, the rows whose engine-size equals 326.
# NOTE(review): 326 is presumably the maximum reported by df.describe() above — confirm.
df[df["engine-size"]==326]["make"].value_counts()

In [None]:
# Count, per make, the rows whose compression-ratio equals 23.
# NOTE(review): 23 is presumably the maximum reported by df.describe() above — confirm.
df[df["compression-ratio"]==23]["make"].value_counts()

In [None]:
df[["engine-size"]].boxplot()

In [None]:
df[["compression-ratio"]].boxplot()

## 4) Feature Engineering

### outliers 

In [None]:
# df["engine-size"].median()
# df["compression-ratio"].median()

In [None]:
# Tukey fences for engine-size: a value counts as an outlier when it lies
# more than 1.5 * IQR below the first quartile or above the third quartile.
q1 = df["engine-size"].quantile(0.25)   # first quartile
q2 = df["engine-size"].quantile(0.75)   # third quartile (name kept for the cells below)
iqr = q2 - q1

lower_q = q1 - 1.5 * iqr
upper_q = q2 + 1.5 * iqr


In [None]:
# Median of engine-size over the non-outlier rows only, i.e. rows strictly
# between lower_q and upper_q. Both bounds must hold at once, hence `&`;
# with `|` every row satisfies at least one bound and nothing is excluded.
inside_fences = (df["engine-size"] > lower_q) & (df["engine-size"] < upper_q)
Engine_size_median = df.loc[inside_fences, "engine-size"].median()
Engine_size_median

In [None]:
# Overwrite every engine-size value outside [lower_q, upper_q] with the median.
engine_size_outlier = (df["engine-size"] < lower_q) | (df["engine-size"] > upper_q)
df.loc[engine_size_outlier, "engine-size"] = Engine_size_median

In [None]:
# Tukey fences for compression-ratio: a value counts as an outlier when it
# lies more than 1.5 * IQR below the first quartile or above the third quartile.
q1 = df["compression-ratio"].quantile(0.25)   # first quartile
q2 = df["compression-ratio"].quantile(0.75)   # third quartile (name kept for the cells below)
iqr = q2 - q1

lower_q = q1 - 1.5 * iqr
upper_q = q2 + 1.5 * iqr

In [None]:
# Median of compression-ratio over the non-outlier rows only, i.e. rows
# strictly between lower_q and upper_q. Both bounds must hold at once, hence
# `&`; with `|` every row satisfies at least one bound and nothing is excluded.
inside_fences = (df["compression-ratio"] > lower_q) & (df["compression-ratio"] < upper_q)
compression_ratio_median = df.loc[inside_fences, "compression-ratio"].median()
compression_ratio_median

In [None]:
# Overwrite every compression-ratio value outside [lower_q, upper_q] with the median.
compression_ratio_outlier = (df["compression-ratio"] < lower_q) | (df["compression-ratio"] > upper_q)
df.loc[compression_ratio_outlier, "compression-ratio"] = compression_ratio_median

In [None]:
# df.describe()

In [None]:
df.info()

#### Filling missing values

In [None]:
# Impute the missing feature values.
# The numeric columns may still hold their numbers as strings (they were read
# alongside "?" placeholders), and a median cannot be taken on strings on
# recent pandas versions. Convert each column to numbers first, then fill the
# gaps with that column's median and cast to the final dtype.
numeric_fill_dtypes = {
    "normalized-losses": int,
    "bore": float,
    "stroke": float,
    "horsepower": int,
    "peak-rpm": int,
}
for col, final_dtype in numeric_fill_dtypes.items():
    numeric_values = pd.to_numeric(df[col])   # raises on unexpected non-numeric text
    df[col] = numeric_values.fillna(numeric_values.median()).astype(final_dtype)

# num-of-doors is categorical ("two" / "four"): fill with the most frequent value.
df["num-of-doors"] = df["num-of-doors"].fillna(df["num-of-doors"].mode()[0])


#### Target column `price`: missing prices cannot be filled with an arbitrary value, because the price depends on the car's make, so they are filled per make

In [None]:
# df.info()

In [None]:
df[df["price"].isna()]

In [None]:
# One boolean mask per make: rows of that make whose price is missing.
price_missing = df["price"].isna()
cond_list = [price_missing & (df["make"] == brand)
             for brand in ("audi", "isuzu", "porsche")]

In [None]:
# Replacement price for each make, in the same order as cond_list: the
# highest known price of that make. Prices are compared as numbers; sorting
# them as strings would rank "9995" above "17450".
# NOTE(review): filling a missing target with the per-make maximum biases the
# model upward; dropping those rows is the usual alternative — confirm intent.
price_numeric = pd.to_numeric(df["price"])
choise_list = [price_numeric[df["make"] == brand].max()
               for brand in ("audi", "isuzu", "porsche")]

In [None]:
# df.loc[(df["make"]=="porsche")]["price"].sort_values(ascending=False).iloc[0]

In [None]:
# Take the per-make replacement where a condition matches, otherwise keep the
# existing price; then store the column as integers.
filled_price = np.select(cond_list, choise_list, default=df["price"])
df["price"] = filled_price.astype(int)

In [None]:
df.info()

In [None]:
df1=df.copy()

In [None]:
df1.head(1)

### Encoding

In [None]:
df["engine-location"].unique()  #to check values inside

In [None]:
df["aspiration"].unique()       #to check values inside

In [None]:
df["num-of-doors"].unique()     #to check values inside

In [None]:
df["fuel-type"].unique()        #to check values inside

In [None]:
# Encode the two-valued categorical columns with fixed integer codes.
# Each result is assigned back to the frame: an in-place replace on a column
# selection may act on a temporary object and leave `df` unchanged (pandas
# Copy-on-Write). The int cast fails loudly if a value has no code.
binary_codes = {
    "engine-location": {'front': 0, 'rear': 1},
    "aspiration": {'std': 0, 'turbo': 1},
    "num-of-doors": {'two': 2, 'four': 4},
    "fuel-type": {'gas': 0, 'diesel': 1},
}
for col, mapping in binary_codes.items():
    df[col] = df[col].replace(mapping).astype(int)

#### Ordinal  Encoder

In [None]:
df["drive-wheels"].unique()            #to check values inside

In [None]:
df["num-of-cylinders"].unique()        #to check values inside

In [None]:
# Ordinal-encode drive-wheels with an explicit category order: rwd=0, fwd=1, 4wd=2.
drive_wheel_order = ['rwd', 'fwd', '4wd']
OE = OrdinalEncoder(categories=[drive_wheel_order])
df["drive-wheels"] = OE.fit_transform(df[["drive-wheels"]]).ravel()

In [None]:
# Ordinal-encode num-of-cylinders; the codes follow the list order (two=0 ... twelve=6).
cylinder_order = ['two', 'three', 'four', 'five', 'six', 'eight', 'twelve']
OE = OrdinalEncoder(categories=[cylinder_order])
df_oe2 = OE.fit_transform(df[["num-of-cylinders"]])
df["num-of-cylinders"] = df_oe2


### dummies

In [None]:
df["body-style"].unique()

In [None]:
df["make"].unique()

In [None]:
df["engine-type"].unique()

In [None]:
df["fuel-system"].unique()

In [None]:
# One-hot encode the nominal columns; the new columns are named "<column>_<value>".
# dtype=int makes the indicators plain 0/1 integers. With boolean indicators
# (the default on recent pandas) df.to_numpy() can come back as an object
# array, which the numeric routines used below do not accept.
df = pd.get_dummies(
    df,
    columns=["fuel-system", "engine-type", "make", "body-style"],
    dtype=int,
)

In [None]:
df.head(1)

## 5) Feature selection

In [None]:

# df.drop("price",axis=1,inplace=True)
# df.head(1)

In [None]:
df.corr()

In [None]:
# sns.heatmap(df1.corr(),annot=True)

In [None]:
# Sample calculation: variance inflation factor of the first column (index 0).
# The frame is converted to a float matrix explicitly, so a mix of column
# dtypes cannot produce an object array.
# NOTE(review): df still contains the target column "price" here; VIF is
# normally computed on the independent features only — confirm.
vif = variance_inflation_factor(df.to_numpy(dtype=float), 0)
vif

In [None]:

# vif_list=[]                                                # to create VIF value list

# for i in range(df.shape[1]):
#     vif = variance_inflation_factor(df.to_numpy(),i)
#     vif_list.append(vif)
# vif_list

In [None]:
# df_vif = pd.DataFrame()                                     # to create VIF list into data frame
# df_vif["features"] = df_ind.columns
# df_vif["VIF"]      = vif_list
# df_vif

In [None]:
# df_vif["VIF"].sort_values().plot(kind="barh")           # VIF value bar chart

In [None]:
df.head(1)

## 6) evaluation

In [None]:
# Separate the target (price) from the features, then hold out 20% of the
# rows for testing; the fixed seed makes the split repeatable.
y = df["price"]
x = df.drop(columns="price")

x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42
)

### Training data

In [None]:
# Fit ordinary least squares on the training split and predict that same
# split (used for the training-set metrics below).
lr_model = LinearRegression().fit(x_train, y_train)
y_pred_train = lr_model.predict(x_train)

In [None]:
# Error metrics and goodness of fit on the training split.
MSE = mean_squared_error(y_train, y_pred_train)
MAE = mean_absolute_error(y_train, y_pred_train)
# NOTE: this name shadows sklearn's r2_score function imported at the top;
# it is kept because the print cell below reads it.
r2_score = lr_model.score(x_train, y_train)
# Adjusted R² = 1 - (1 - R²)(n - 1) / (n - p - 1), with n rows and p features.
# It was previously commented out although the print cell below uses it.
ad_r2_score = 1 - (((1 - r2_score) * (x_train.shape[0] - 1)) / (x_train.shape[0] - x_train.shape[1] - 1))

In [None]:
# Report the training-set metrics (label corrected from "MES" to "MSE").
print(f"MSE={MSE}\nMAE={MAE}\nr2 score={r2_score}\nadjusted r2 score={ad_r2_score}")

### Testing data

In [None]:
# Evaluate on the held-out split with the model trained on the training data.
# Fitting a fresh model on x_test / y_test would score it on the very rows it
# learned from and say nothing about generalisation. `lr_model1` is kept as
# an alias of the trained model because later cells refer to that name.
lr_model1 = lr_model
y_pred_test = lr_model1.predict(x_test)

In [None]:
# Error metrics and goodness of fit on the test split.
MSE1 = mean_squared_error(y_test, y_pred_test)
MAE1 = mean_absolute_error(y_test, y_pred_test)
r2_score1 = lr_model1.score(x_test, y_test)
# Adjusted R² = 1 - (1 - R²)(n - 1) / (n - p - 1), built from the test-set R²
# (r2_score1) and the test-set shape. It was previously commented out
# although the print cell below uses it.
# NOTE(review): when the test split has no more rows than features + 1 the
# denominator is zero or negative and the value is not meaningful.
ad_r2_score1 = 1 - (((1 - r2_score1) * (x_test.shape[0] - 1)) / (x_test.shape[0] - x_test.shape[1] - 1))

In [None]:
# Report the test-set metrics (label corrected from "MES" to "MSE").
print(f"MSE={MSE1}\nMAE={MAE1}\nr2 score={r2_score1}\nadjusted r2 score={ad_r2_score1}")

### 7) user input

#### here user input is df_org()

In [None]:
# x['drive-wheels'].value_counts().to_dict()

In [None]:
# df_org['drive-wheels'].value_counts().to_dict()

In [None]:
# Codes for drive-wheels as produced by the ordinal encoder (rwd=0, fwd=1, 4wd=2).
drive_wheels_value = {
    'fwd': 1.0,
    'rwd': 0.0,
    '4wd': 2.0,
}

In [None]:
# x['num-of-cylinders'].value_counts().to_dict()

In [None]:
# df_org['num-of-cylinders'].value_counts().to_dict()

In [None]:
# Codes for num-of-cylinders. They must equal each value's position in the
# ordinal encoder's category list:
# ['two', 'three', 'four', 'five', 'six', 'eight', 'twelve'] -> 0 ... 6.
# 'eight' is therefore 5.0 (it was 5.5, a code the encoder never produces).
num_of_cylinders_value = {
    'four': 2.0,
    'six': 4.0,
    'five': 3.0,
    'eight': 5.0,
    'two': 0.0,
    'three': 1.0,
    'twelve': 6.0,
}

In [None]:
# Integer codes for the two-valued columns (same codes as used when encoding df).
engine_location_value = dict(front=0, rear=1)
aspiration_value = dict(std=0, turbo=1)
num_of_doors_value = dict(two=2, four=4)
fuel_type_value = dict(gas=0, diesel=1)

In [None]:
# Bundle every encoding map plus the model's feature order, so that code
# serving predictions can rebuild an input row in the same layout.
encoded_columns = dict(
    engine_location_value=engine_location_value,
    aspiration_value=aspiration_value,
    num_of_doors_value=num_of_doors_value,
    fuel_type_value=fuel_type_value,
    drive_wheels_value=drive_wheels_value,
    num_of_cylinders_value=num_of_cylinders_value,
)
encoded_columns["columns"] = x.columns.tolist()
encoded_columns

In [None]:
import json

# Persist the encoding maps and the column order as JSON next to the notebook.
with open("encoded_columns.json", "w") as f:
    f.write(json.dumps(encoded_columns))

In [None]:
# Example user input: one car described with the raw (un-encoded) values.
# The trailing comments name the encoding each categorical value receives
# before it is placed in the feature vector.
symboling                    = 3
normalized_losses            = 115
make                         = "audi"        # one-hot (get_dummies) column "make_<value>"
fuel_type                    = "gas"         # mapped through fuel_type_value
aspiration                   = "std"         # mapped through aspiration_value
num_of_doors                 =  "two"        # mapped through num_of_doors_value
body_style                   = "convertible" # one-hot (get_dummies) column "body-style_<value>"
drive_wheels                 = "rwd"         # ordinal code, mapped through drive_wheels_value
engine_location              ="rear"         # mapped through engine_location_value
wheel_base                   = 88.6
length                       = 168.8
width                        = 64.1
height                       = 48.8
curb_weight                  = 2548
engine_type                  = "dohc"        # one-hot (get_dummies) column "engine-type_<value>"
num_of_cylinders             = "four"        # ordinal code, mapped through num_of_cylinders_value
# NOTE(review): 14 is far below the engine sizes inspected earlier (e.g. 326) — possibly a typo, confirm.
engine_size                  = 14
fuel_system                  = "mpfi"        # one-hot (get_dummies) column "fuel-system_<value>"
bore                         = 3.47
stroke                       = 2.68
compression_ratio            = 9.0
horsepower                   = 111
peak_rpm                     = 5000
city_mpg                     = 21
highway_mpg                  = 27    
# price                      16500

In [None]:
# Run this cell after the input cell: it translates the raw categorical
# inputs into the numeric codes the model was trained on.
columns_1 = x.columns                                          # feature order used by the model

fuel_type_1 = fuel_type_value[fuel_type]                       # "gas" or "diesel"
aspiration_1 = aspiration_value[aspiration]
num_of_doors_1 = num_of_doors_value[num_of_doors]
engine_location_1 = engine_location_value[engine_location]
drive_wheels_1 = drive_wheels_value[drive_wheels]
num_of_cylinders_1 = num_of_cylinders_value[num_of_cylinders]


In [None]:
columns_1

In [None]:
# Start from an all-zero feature vector (one slot per model column) and write
# the numeric / pre-encoded inputs into positions 0..20, in this fixed order.
# The one-hot slots are left at 0 here.
leading_values = [
    symboling,
    normalized_losses,
    fuel_type_1,
    aspiration_1,
    num_of_doors_1,
    drive_wheels_1,
    engine_location_1,
    wheel_base,
    length,
    width,
    height,
    curb_weight,
    num_of_cylinders_1,
    engine_size,
    bore,
    stroke,
    compression_ratio,
    horsepower,
    peak_rpm,
    city_mpg,
    highway_mpg,
]

array = np.zeros(x.shape[1])
for position, value in enumerate(leading_values):
    array[position] = value


In [None]:
# Switch on the one-hot slot for the chosen make (IndexError if no such column).
make_x = f"make_{make}"    # user input
make_matches = np.where(columns_1 == make_x)[0]
make_index = make_matches[0]
array[make_index] = 1


In [None]:
# Switch on the one-hot slot for the chosen body style (IndexError if no such column).
body_style_x = f"body-style_{body_style}"      # user input
body_style_matches = np.where(columns_1 == body_style_x)[0]
body_style_index = body_style_matches[0]
array[body_style_index] = 1

In [None]:
# Switch on the one-hot slot for the chosen engine type (IndexError if no such column).
engine_type_x = f"engine-type_{engine_type}"      # user input
engine_type_matches = np.where(columns_1 == engine_type_x)[0]
engine_type_index = engine_type_matches[0]
array[engine_type_index] = 1

In [None]:
# Switch on the one-hot slot for the chosen fuel system (IndexError if no such column).
fuel_system_x = f"fuel-system_{fuel_system}"  # user input
fuel_system_matches = np.where(columns_1 == fuel_system_x)[0]
fuel_system_index = fuel_system_matches[0]
array[fuel_system_index] = 1

In [None]:
columns_1

In [None]:
# Predict the price for the single input row (the model expects a 2-D array).
lr_model.predict(array.reshape(1, -1))[0]

In [None]:
# Save the trained model to disk with pickle.
# Only load pickle files that come from a trusted source.

import pickle
with open("lr_model.pkl", "wb") as f:
    f.write(pickle.dumps(lr_model))
    