In [1]:
import pandas as pd

In [2]:
# Raw Auto MPG data file in the UCI machine-learning-databases archive.
url = "http://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data"

In [3]:
# Column labels assigned to the parsed data, in file order.
column_names = [
    'MPG',
    'Cylinders',
    'Displacement',
    'Horsepower',
    'Weight',
    'Acceleration',
    'Model Year',
    'Origin',
]

In [4]:
# Load the space-separated data file into a DataFrame with the labels above.
df = pd.read_csv(
    url,
    names=column_names,
    sep=" ",
    skipinitialspace=True,  # skip the extra spaces that follow each delimiter
    comment='\t',           # ignore the rest of each line from the first tab onward
                            # (presumably the trailing car-name field - confirm against the raw file)
    na_values="?",          # "?" entries are parsed as NaN
)

In [5]:
# Summarize the loaded frame: row count, per-column non-null counts, and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 0 to 397
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   MPG           398 non-null    float64
 1   Cylinders     398 non-null    int64  
 2   Displacement  398 non-null    float64
 3   Horsepower    392 non-null    float64
 4   Weight        398 non-null    float64
 5   Acceleration  398 non-null    float64
 6   Model Year    398 non-null    int64  
 7   Origin        398 non-null    int64  
dtypes: float64(5), int64(3)
memory usage: 25.0 KB


In [6]:
# Note: Horsepower has only 392 non-null values out of 398 rows, i.e. 6 values are missing.

In [7]:
# Peek at the first three rows of the loaded frame.
df.head(3)

Unnamed: 0,MPG,Cylinders,Displacement,Horsepower,Weight,Acceleration,Model Year,Origin
0,18.0,8,307.0,130.0,3504.0,12.0,70,1
1,15.0,8,350.0,165.0,3693.0,11.5,70,1
2,18.0,8,318.0,150.0,3436.0,11.0,70,1


### Check for Missing Values

In [8]:
# Element-wise missing-value mask: True where a cell is NaN.
df.isnull()

Unnamed: 0,MPG,Cylinders,Displacement,Horsepower,Weight,Acceleration,Model Year,Origin
0,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...
393,False,False,False,False,False,False,False,False
394,False,False,False,False,False,False,False,False
395,False,False,False,False,False,False,False,False
396,False,False,False,False,False,False,False,False


In [9]:
# Collapse the mask to a single flag: True if any cell in the frame is missing.
df.isnull().values.any()

True

In [10]:
# Count the missing values in each column.
df.isnull().sum()

MPG             0
Cylinders       0
Displacement    0
Horsepower      6
Weight          0
Acceleration    0
Model Year      0
Origin          0
dtype: int64

#### Drop NA rows

In [11]:
# Discard every row containing a missing value, then renumber the rows 0..n-1
# so the index has no gaps left by the removed rows.
df = df.dropna().reset_index(drop=True)

In [12]:
# Re-inspect after dropna(): the 6 rows with a missing value have been removed.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 392 entries, 0 to 391
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   MPG           392 non-null    float64
 1   Cylinders     392 non-null    int64  
 2   Displacement  392 non-null    float64
 3   Horsepower    392 non-null    float64
 4   Weight        392 non-null    float64
 5   Acceleration  392 non-null    float64
 6   Model Year    392 non-null    int64  
 7   Origin        392 non-null    int64  
dtypes: float64(5), int64(3)
memory usage: 24.6 KB


### Train Test Split

In [13]:
import sklearn
import sklearn.model_selection

In [14]:
# Keep 80% of the rows for training and hold out the rest for testing;
# the fixed random_state makes the shuffle reproducible.
df_train, df_test = sklearn.model_selection.train_test_split(
    df,
    train_size=0.8,
    random_state=1,
)

In [15]:
# Show the training partition (row labels are the original indices, now shuffled).
df_train

Unnamed: 0,MPG,Cylinders,Displacement,Horsepower,Weight,Acceleration,Model Year,Origin
334,27.2,4,135.0,84.0,2490.0,15.7,81,1
258,18.6,6,225.0,110.0,3620.0,18.7,78,1
139,29.0,4,98.0,83.0,2219.0,16.5,74,2
310,37.2,4,86.0,65.0,2019.0,16.4,80,3
349,33.0,4,105.0,74.0,2190.0,14.2,81,2
...,...,...,...,...,...,...,...,...
203,28.0,4,97.0,75.0,2155.0,16.4,76,3
255,19.4,6,232.0,90.0,3210.0,17.2,78,1
72,13.0,8,307.0,130.0,4098.0,14.0,72,1
235,30.5,4,98.0,63.0,2051.0,17.0,77,1


In [16]:
# (rows, columns) of the training partition.
df_train.shape

(313, 8)

In [17]:
# Show the held-out test partition.
df_test

Unnamed: 0,MPG,Cylinders,Displacement,Horsepower,Weight,Acceleration,Model Year,Origin
81,23.0,4,120.0,97.0,2506.0,14.5,72,3
165,29.0,4,97.0,75.0,2171.0,16.0,75,3
351,32.4,4,108.0,75.0,2350.0,16.8,81,3
119,19.0,4,121.0,112.0,2868.0,15.5,73,2
379,38.0,4,91.0,67.0,1995.0,16.2,82,3
...,...,...,...,...,...,...,...,...
23,26.0,4,121.0,113.0,2234.0,12.5,70,2
293,35.7,4,98.0,80.0,1915.0,14.4,79,1
13,14.0,8,455.0,225.0,3086.0,10.0,70,1
90,13.0,8,400.0,150.0,4464.0,12.0,73,1


In [18]:
# (rows, columns) of the test partition.
df_test.shape

(79, 8)

In [19]:
# Descriptive statistics (count, mean, std, min, quartiles, max) for each training column.
df_train.describe()

Unnamed: 0,MPG,Cylinders,Displacement,Horsepower,Weight,Acceleration,Model Year,Origin
count,313.0,313.0,313.0,313.0,313.0,313.0,313.0,313.0
mean,23.404153,5.402556,189.51278,102.929712,2961.198083,15.704473,75.929712,1.591054
std,7.666909,1.701506,102.675646,37.919046,848.602146,2.725399,3.675305,0.807923
min,9.0,3.0,68.0,46.0,1613.0,8.5,70.0,1.0
25%,17.5,4.0,104.0,75.0,2219.0,14.0,73.0,1.0
50%,23.0,4.0,140.0,92.0,2755.0,15.5,76.0,1.0
75%,29.0,8.0,260.0,120.0,3574.0,17.3,79.0,2.0
max,46.6,8.0,455.0,230.0,5140.0,24.8,82.0,3.0


In [20]:
# Flip the summary so each feature is a row and each statistic a column,
# which allows lookups of the form train_stats.loc[feature, statistic].
train_stats = df_train.describe().T

In [21]:
# Display the transposed summary: one row per column of df_train, one column per statistic.
train_stats

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
MPG,313.0,23.404153,7.666909,9.0,17.5,23.0,29.0,46.6
Cylinders,313.0,5.402556,1.701506,3.0,4.0,4.0,8.0,8.0
Displacement,313.0,189.51278,102.675646,68.0,104.0,140.0,260.0,455.0
Horsepower,313.0,102.929712,37.919046,46.0,75.0,92.0,120.0,230.0
Weight,313.0,2961.198083,848.602146,1613.0,2219.0,2755.0,3574.0,5140.0
Acceleration,313.0,15.704473,2.725399,8.5,14.0,15.5,17.3,24.8
Model Year,313.0,75.929712,3.675305,70.0,73.0,76.0,79.0,82.0
Origin,313.0,1.591054,0.807923,1.0,1.0,1.0,2.0,3.0


In [22]:
# (number of described columns, number of statistics).
train_stats.shape

(8, 8)

In [23]:
# List the column labels of the cleaned frame.
df.columns

Index(['MPG', 'Cylinders', 'Displacement', 'Horsepower', 'Weight',
       'Acceleration', 'Model Year', 'Origin'],
      dtype='object')

In [24]:
# First five rows of the cleaned (NaN-free) frame.
df.head()

Unnamed: 0,MPG,Cylinders,Displacement,Horsepower,Weight,Acceleration,Model Year,Origin
0,18.0,8,307.0,130.0,3504.0,12.0,70,1
1,15.0,8,350.0,165.0,3693.0,11.5,70,1
2,18.0,8,318.0,150.0,3436.0,11.0,70,1
3,16.0,8,304.0,150.0,3433.0,12.0,70,1
4,17.0,8,302.0,140.0,3449.0,10.5,70,1


In [25]:
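# Not used: the .loc slice below returns a DataFrame (rows 0-4 of those columns),
# not a list of column names.
# numeric_column_names = df.loc[0:4, 'Cylinders':'Acceleration']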

In [26]:
# Feature columns that get standardized below.
numeric_column_names = [
    'Cylinders',
    'Displacement',
    'Horsepower',
    'Weight',
    'Acceleration',
]

In [27]:
# Echo the selected feature names.
numeric_column_names

['Cylinders', 'Displacement', 'Horsepower', 'Weight', 'Acceleration']

In [28]:
# Work on independent copies so the un-normalized train/test frames stay intact.
df_train_norm = df_train.copy()
df_test_norm = df_test.copy()

In [29]:
# Standardize each selected feature as (value - mean) / std, using the mean and
# std computed on the TRAINING split for both frames, so the test split is
# scaled with exactly the same shift and scale as the training split.
for col_name in numeric_column_names:
    mean = train_stats.loc[col_name, 'mean']
    std = train_stats.loc[col_name, 'std']
    # Replace the whole column (df[col] = ...) instead of df.loc[:, col] = ...:
    # .loc tries to write into the existing array, and storing floats in an
    # int64 column such as Cylinders is deprecated, then rejected, by newer
    # pandas. Whole-column assignment swaps in the new float64 column.
    df_train_norm[col_name] = (df_train_norm[col_name] - mean) / std
    df_test_norm[col_name] = (df_test_norm[col_name] - mean) / std

In [30]:
# Last five rows of the normalized training frame.
df_train_norm.tail()

Unnamed: 0,MPG,Cylinders,Displacement,Horsepower,Weight,Acceleration,Model Year,Origin
203,28.0,-0.824303,-0.90102,-0.736562,-0.950031,0.255202,76,3
255,19.4,0.351127,0.4138,-0.340982,0.29319,0.548737,78,1
72,13.0,1.526556,1.144256,0.713897,1.339617,-0.625403,72,1
235,30.5,-0.824303,-0.89128,-1.053025,-1.072585,0.475353,77,1
37,14.0,1.526556,1.563051,1.636916,1.47042,-1.35924,71,1


In [31]:
# First five rows of the normalized training frame.
df_train_norm.head()

Unnamed: 0,MPG,Cylinders,Displacement,Horsepower,Weight,Acceleration,Model Year,Origin
334,27.2,-0.824303,-0.530922,-0.499214,-0.555264,-0.001641,81,1
258,18.6,0.351127,0.345625,0.186457,0.776338,1.099115,78,1
139,29.0,-0.824303,-0.89128,-0.525586,-0.874613,0.291894,74,2
310,37.2,-0.824303,-1.008153,-1.000281,-1.110294,0.255202,80,3
349,33.0,-0.824303,-0.823104,-0.762934,-0.908786,-0.552019,81,2


In [32]:
# Inspect dtypes after normalization (the scaled columns are floating point).
df_train_norm.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 313 entries, 334 to 37
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   MPG           313 non-null    float64
 1   Cylinders     313 non-null    float64
 2   Displacement  313 non-null    float64
 3   Horsepower    313 non-null    float64
 4   Weight        313 non-null    float64
 5   Acceleration  313 non-null    float64
 6   Model Year    313 non-null    int64  
 7   Origin        313 non-null    int64  
dtypes: float64(6), int64(2)
memory usage: 22.0 KB


### Let's work on the Model Year column
#### We are going to split the model year range into 4 parts:
$$
\text{bucket} =
\begin{cases}
0, & \text{year} < 73 \\
1, & 73 \le \text{year} < 76 \\
2, & 76 \le \text{year} < 79 \\
3, & \text{year} \ge 79
\end{cases}
$$

In [33]:
!pip install torch
# import torch

Defaulting to user installation because normal site-packages is not writeable


In [37]:
import torch

ModuleNotFoundError: No module named 'torch'

In [38]:
 boundaries = torch.tensor([73,76,79])

NameError: name 'torch' is not defined