In [1]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, SGDRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error

# **Data Importing**

In [2]:
data = pd.read_csv("/kaggle/input/ann-car-sales-price-prediction/car_purchasing.csv",encoding='ISO-8859-1')

In [3]:
data.head()

Unnamed: 0,customer name,customer e-mail,country,gender,age,annual Salary,credit card debt,net worth,car purchase amount
0,Martina Avila,cubilia.Curae.Phasellus@quisaccumsanconvallis.edu,Bulgaria,0,41.85172,62812.09301,11609.38091,238961.2505,35321.45877
1,Harlan Barnes,eu.dolor@diam.co.uk,Belize,0,40.870623,66646.89292,9572.957136,530973.9078,45115.52566
2,Naomi Rodriquez,vulputate.mauris.sagittis@ametconsectetueradip...,Algeria,1,43.152897,53798.55112,11160.35506,638467.1773,42925.70921
3,Jade Cunningham,malesuada@dignissim.com,Cook Islands,1,58.271369,79370.03798,14426.16485,548599.0524,67422.36313
4,Cedric Leach,felis.ullamcorper.viverra@egetmollislectus.net,Brazil,1,57.313749,59729.1513,5358.712177,560304.0671,55915.46248


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 9 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   customer name        500 non-null    object 
 1   customer e-mail      500 non-null    object 
 2   country              500 non-null    object 
 3   gender               500 non-null    int64  
 4   age                  500 non-null    float64
 5   annual Salary        500 non-null    float64
 6   credit card debt     500 non-null    float64
 7   net worth            500 non-null    float64
 8   car purchase amount  500 non-null    float64
dtypes: float64(5), int64(1), object(3)
memory usage: 35.3+ KB


In [5]:
data.describe()

Unnamed: 0,gender,age,annual Salary,credit card debt,net worth,car purchase amount
count,500.0,500.0,500.0,500.0,500.0,500.0
mean,0.506,46.241674,62127.239608,9607.645049,431475.713625,44209.799218
std,0.500465,7.978862,11703.378228,3489.187973,173536.75634,10773.178744
min,0.0,20.0,20000.0,100.0,20000.0,9000.0
25%,0.0,40.949969,54391.977195,7397.515792,299824.1959,37629.89604
50%,1.0,46.049901,62915.497035,9655.035568,426750.12065,43997.78339
75%,1.0,51.612263,70117.862005,11798.867487,557324.478725,51254.709517
max,1.0,70.0,100000.0,20000.0,1000000.0,80000.0


In [6]:
data.duplicated().sum()

0

## **Data Preprocessing**

#### Feature Selection
-- Drop : customer name, customer email, country

In [7]:
data = data.drop(columns=['customer name', 'customer e-mail', 'country', 'gender'])
data.head()

Unnamed: 0,age,annual Salary,credit card debt,net worth,car purchase amount
0,41.85172,62812.09301,11609.38091,238961.2505,35321.45877
1,40.870623,66646.89292,9572.957136,530973.9078,45115.52566
2,43.152897,53798.55112,11160.35506,638467.1773,42925.70921
3,58.271369,79370.03798,14426.16485,548599.0524,67422.36313
4,57.313749,59729.1513,5358.712177,560304.0671,55915.46248


### X, y Split

In [8]:
X = data[data.columns[:-1]]
y = data[data.columns[-1]]
print(X, y)

           age  annual Salary  credit card debt    net worth
0    41.851720    62812.09301      11609.380910  238961.2505
1    40.870623    66646.89292       9572.957136  530973.9078
2    43.152897    53798.55112      11160.355060  638467.1773
3    58.271369    79370.03798      14426.164850  548599.0524
4    57.313749    59729.15130       5358.712177  560304.0671
..         ...            ...               ...          ...
495  41.462515    71942.40291       6995.902524  541670.1016
496  37.642000    56039.49793      12301.456790  360419.0988
497  53.943497    68888.77805      10611.606860  764531.3203
498  59.160509    49811.99062      14013.034510  337826.6382
499  46.731152    61370.67766       9391.341628  462946.4924

[500 rows x 4 columns] 0      35321.45877
1      45115.52566
2      42925.70921
3      67422.36313
4      55915.46248
          ...     
495    48901.44342
496    31491.41457
497    64147.28888
498    45442.15353
499    45107.22566
Name: car purchase amount, Length: 

In [9]:
y = y.values.reshape(-1,1)

In [10]:
r_scaler = RobustScaler()
X = r_scaler.fit_transform(X)
y = r_scaler.fit_transform(y)

In [11]:
X

array([[-0.39374091, -0.0065754 ,  0.44403299, -0.72927636],
       [-0.48575642,  0.23727733, -0.01864846,  0.40475213],
       [-0.27170548, -0.57974136,  0.342013  ,  0.82220126],
       ...,
       [ 0.74032809,  0.37983752,  0.2173358 ,  1.31177021],
       [ 1.22962347, -0.83324446,  0.99015013, -0.34533353],
       [ 0.06389349, -0.09823418, -0.05991204,  0.14056828]])

In [12]:
y

array([[-6.36803185e-01],
       [ 8.20372530e-02],
       [-7.86854207e-02],
       [ 1.71925875e+00],
       [ 8.74704018e-01],
       [ 9.25826579e-01],
       [-1.10622270e+00],
       [ 2.52274959e-01],
       [ 2.94743904e-01],
       [-4.26301423e-01],
       [ 1.10443565e+00],
       [-1.25430923e-01],
       [-1.12278601e+00],
       [ 3.86140502e-01],
       [ 4.04574358e-01],
       [ 6.61989528e-01],
       [-1.60226980e-01],
       [-7.05117543e-03],
       [ 4.66948792e-02],
       [ 7.94854231e-01],
       [ 5.23542609e-01],
       [-4.37046820e-02],
       [ 2.38027233e-01],
       [ 1.93596113e-01],
       [ 7.93125382e-02],
       [ 2.86096195e-02],
       [-5.01748481e-01],
       [ 3.73890510e-01],
       [ 1.05341146e+00],
       [-2.51281972e-04],
       [-1.93861104e+00],
       [ 4.78962403e-02],
       [ 1.64157181e+00],
       [ 6.96646509e-01],
       [-3.07338902e-01],
       [ 5.58859740e-01],
       [-3.68380002e-01],
       [-2.48851541e+00],
       [-5.9

### Train Test Split

In [13]:
X_train,X_test,y_train,y_test = train_test_split(X, y, random_state = 42, test_size= 0.1)
X_test.shape

(50, 4)

## Machine Learning Algorithms

In [14]:
lr_model = LinearRegression()
lr_model.fit(X_train,y_train)
lr_model.score(X_train, y_train)

0.9999999811362869

In [15]:
y_pred = lr_model.predict(X_test)


In [16]:
mse= mean_squared_error(y_test, y_pred)
mse

1.0840999396037733e-08

In [17]:
sgd_model = SGDRegressor(max_iter=3000, learning_rate='adaptive', random_state=42)
sgd_model.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)


In [18]:
sgd_model.score(X_train, y_train)

0.9999999473807598

In [19]:
sgd_model.score(X_test, y_test)

0.9999999456264687

In [20]:
y_pred = sgd_model.predict(X_test)

In [21]:
print(mean_squared_error(y_pred, y_test))

2.999213973610697e-08
