In [1222]:
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, MinMaxScaler
import numpy as np
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import mean_squared_error

In [1223]:
df_train = pd.read_csv('car_price_train.csv')
df_test = pd.read_csv('car_price_test.csv')

print(df_train.iloc[104, :])
df_train = df_train.drop([104])
df_test.shape

Car_Name            NaN
Year               2.95
Selling_Price      7.74
Present_Price     49998
Kms_Driven          CNG
Fuel_Type        Dealer
Seller_Type      Manual
Transmission          0
Owner               NaN
Name: 104, dtype: object


(61, 9)

In [1224]:
def splitdata(dataframe): 
    feature_columns = ["Car_Name", "Year", "Present_Price", "Kms_Driven", "Fuel_Type", "Seller_Type", "Transmission", "Owner"]
    label_column = "Selling_Price"
    features = dataframe.loc[:, feature_columns]
    label = dataframe.loc[:, label_column]
    return features.values, label.values

X_train, y_train = splitdata(df_train)
X_test, y_test = splitdata(df_test)

In [1225]:
df_train.head()

Unnamed: 0,Car_Name,Year,Selling_Price,Present_Price,Kms_Driven,Fuel_Type,Seller_Type,Transmission,Owner
0,Bajaj Pulsar 150,2008.0,0.25,,,Petrol,,Manual,1.0
1,Bajaj Avenger 220,2017.0,0.75,0.95,3500.0,Petrol,Individual,Manual,0.0
2,Hero CBZ Xtreme,2008.0,0.2,0.787,,Petrol,Individual,Manual,0.0
3,etios g,2015.0,3.95,6.8,36000.0,Petrol,Dealer,Manual,0.0
4,Royal Enfield Thunder 350,2016.0,1.15,1.5,8700.0,Petrol,Individual,Manual,0.0


# missing value

In [1226]:
car_name_impute = SimpleImputer(np.nan, strategy = 'most_frequent')
X_train[:, 0] = car_name_impute.fit_transform(X_train[:, 0].reshape(-1, 1)).ravel()

In [1227]:
present_price_imputer = SimpleImputer(np.nan, strategy = 'most_frequent')
X_train[:, 2] = present_price_imputer.fit_transform(X_train[:, 2].reshape(-1, 1)).ravel()

In [1228]:
kms_driven_imputer = SimpleImputer(np.nan, strategy = 'mean')
X_train[:, 3] = present_price_imputer.fit_transform(X_train[:, 3].reshape(-1, 1)).ravel()

In [1229]:
value_imputer = SimpleImputer(np.nan, strategy = 'most_frequent')
X_train[:, 4:8] = value_imputer.fit_transform(X_train[:, 4:8])

In [1230]:
feature_columns = ["Car_Name", "Year", "Present_Price", "Kms_Driven", "Fuel_Type", "Seller_Type", "Transmission", "Owner"]
new_df = pd.DataFrame.from_records(X_train, columns=feature_columns)
print(new_df.isnull().sum())

Car_Name         0
Year             0
Present_Price    0
Kms_Driven       0
Fuel_Type        0
Seller_Type      0
Transmission     0
Owner            0
dtype: int64


In [1231]:
def normalize(X):
    label_encoder = LabelEncoder()
    minmax_scaler = MinMaxScaler()
    
    #Year
    X[:, 1] = minmax_scaler.fit_transform(X[:, 1].reshape(-1, 1)).ravel()
    # Kms_Driven
    X[:, 3] = minmax_scaler.fit_transform(X[:, 3].reshape(-1, 1)).ravel()
    #Seller_Type
    X[:, 5] = label_encoder.fit_transform(X[:, 5])
    #Transmission
    X[:, -2] = label_encoder.fit_transform(X[:, -2]) 
    #Fuel_Type
    X[:, 4] = label_encoder.fit_transform(X[:, 4])
    return X

X_train = normalize(X_train)
X_test = normalize(X_test)



encoder = OneHotEncoder()

Z = np.concatenate((X_train[:, 0], X_test[:, 0]), axis=0)
onehot = encoder.fit_transform(Z[:].reshape(-1,1)).toarray()
onehot_test = onehot[X_train[:, 0].shape[0]:]
onehot_train = onehot[:X_train[:, 0].shape[0]]

X_test = np.delete(X_test, 0, axis=1)
X_test = np.concatenate((onehot_test, X_test), axis=1)

X_train = np.delete(X_train, 0, axis=1)
X_train = np.concatenate((onehot_train, X_train), axis=1)




In [1232]:
linear = LinearRegression()
linear.fit(X_train, y_train)

predict_train = linear.predict(X_train)
print("Root Mean Square Error Train: {}".format(np.sqrt(mean_squared_error(y_train, predict_train))))
predict_test = linear.predict(X_test)
print("Root Mean Square Error Test: {}".format(np.sqrt(mean_squared_error(y_test, predict_test))))

Root Mean Square Error Train: 1.2613429310843947
Root Mean Square Error Test: 133861093715.39224


In [1233]:
ridge = Ridge(alpha = 0.1)
ridge.fit(X_train, y_train)

predict_train = ridge.predict(X_train)
print("Root Mean Square Error Train: {}".format(np.sqrt(mean_squared_error(y_train, predict_train))))

predict_test = ridge.predict(X_test)
print("Root Mean Square Error Test: {}".format(np.sqrt(mean_squared_error(y_test, predict_test))))

Root Mean Square Error Train: 1.2691238753170988
Root Mean Square Error Test: 1.534703664130626
