In [5]:
import pandas as pd
import numpy as np


In [6]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split

In [7]:
# Location of the UCI Auto MPG data file. Fetched over HTTPS so the download
# is integrity-protected in transit (the original used plain HTTP).
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data"

# Column labels applied when reading the file (passed as `names=` to
# read_csv). Order must match the order of the fields in each data row.
column_names = [
    'mpg',
    'cylinders',
    'displacement',
    'horsepower',
    'weight',
    'acceleration',
    'model_year',
    'origin',
    'car_name',
]

# Load the whitespace-delimited data file into a DataFrame.
#  - sep=r'\s+' splits on runs of whitespace; it replaces the
#    `delim_whitespace=True` flag, which pandas deprecated in 2.2.
#  - names=... supplies the column labels.
#  - na_values='?' turns the '?' placeholder into NaN so missing entries
#    show up in df.isnull().
df = pd.read_csv(
    url,
    sep=r'\s+',
    names=column_names,
    na_values='?',
)


In [8]:
# Preview the first five rows to sanity-check the parse and column labels.
df.head(n=5)


Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,car_name
0,18.0,8,307.0,130.0,3504.0,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,165.0,3693.0,11.5,70,1,buick skylark 320
2,18.0,8,318.0,150.0,3436.0,11.0,70,1,plymouth satellite
3,16.0,8,304.0,150.0,3433.0,12.0,70,1,amc rebel sst
4,17.0,8,302.0,140.0,3449.0,10.5,70,1,ford torino


In [9]:
# Count missing values per column (isna is the same check as isnull).
df.isna().sum()


mpg             0
cylinders       0
displacement    0
horsepower      6
weight          0
acceleration    0
model_year      0
origin          0
car_name        0
dtype: int64

In [10]:
# Replace missing horsepower values with the column mean.
# NOTE(review): the mean is computed over every row, before the train/test
# split done later in the notebook, so held-out rows influence the imputed
# value. Consider imputing after the split using the training-set mean only.
horsepower_mean = df['horsepower'].mean()
df['horsepower'] = df['horsepower'].fillna(horsepower_mean)


In [11]:
# Target is fuel efficiency; features are every remaining column except the
# free-text car name.
# NOTE(review): `origin` looks like a category code rather than a quantity —
# confirm whether it should be one-hot encoded instead of used as a number.
y = df['mpg']
X = df.drop(columns=['mpg', 'car_name'])


In [12]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [13]:
# Linear regression estimator, constructed with scikit-learn's default settings.
model = LinearRegression()


In [14]:
# Fit on the training split only. fit() returns the estimator itself, which
# is why the cell displays the model's repr.
model.fit(X=X_train, y=y_train)


LinearRegression()

In [15]:
# Fitted intercept term of the regression.
model.intercept_


-20.782175730121402

In [16]:
# Fitted coefficients: one value per feature column of X.
model.coef_


array([-0.15417994,  0.01399743, -0.01179845, -0.00677523,  0.07488864,
        0.79647938,  1.31331307])

In [17]:
# Show each feature name next to its fitted coefficient.
feature_coefs = {'Feature': X.columns, 'Coefficient': model.coef_}
pd.DataFrame(feature_coefs)


Unnamed: 0,Feature,Coefficient
0,cylinders,-0.15418
1,displacement,0.013997
2,horsepower,-0.011798
3,weight,-0.006775
4,acceleration,0.074889
5,model_year,0.796479
6,origin,1.313313


In [18]:
# Predict MPG for every row of the held-out test split.
y_pred = model.predict(X=X_test)


In [22]:
# Predict MPG for one hand-entered car. The model was fitted on a DataFrame,
# so the row is wrapped in a DataFrame carrying the same column labels:
# passing a bare nested list makes scikit-learn warn that X has no valid
# feature names and silently relies on the values being in training order.
model.predict(
    pd.DataFrame([[8, 307, 130, 3504, 12, 70, 1]], columns=X.columns)
)





array([14.97294014])

In [23]:
# Feature column labels, in the order the model was trained on.
X.columns



Index(['cylinders', 'displacement', 'horsepower', 'weight', 'acceleration',
       'model_year', 'origin'],
      dtype='object')

In [24]:
# Predict MPG for one hand-entered car, with values listed in the order of
# X.columns. Wrapping the row in a DataFrame with those labels avoids
# scikit-learn's "X does not have valid feature names" warning that a bare
# nested list triggers on a model fitted with named columns.
model.predict(
    pd.DataFrame([[8, 307, 130, 3504, 12, 70, 1]], columns=X.columns)
)





array([14.97294014])

In [25]:
# One-row frame for a single car, labelled with the training feature names so
# predict() receives the columns it was fitted on. The values equal the
# feature values of the first row shown by df.head() (actual mpg 18.0).
car_values = [8, 307, 130, 3504, 12, 70, 1]
test_car = pd.DataFrame([car_values], columns=X.columns)

model.predict(test_car)


array([14.97294014])

In [26]:
# Fitted intercept term of the regression (same value as displayed earlier).
model.intercept_


-20.782175730121402

In [27]:
# Coefficient table indexed by feature name, with a single "Coefficient" column.
coef_df = pd.Series(model.coef_, index=X.columns, name="Coefficient").to_frame()
coef_df


Unnamed: 0,Coefficient
cylinders,-0.15418
displacement,0.013997
horsepower,-0.011798
weight,-0.006775
acceleration,0.074889
model_year,0.796479
origin,1.313313


In [28]:
# Predictions for the held-out test split, used by the evaluation cells below.
y_pred = model.predict(X=X_test)


In [30]:
# Side-by-side preview of the first few held-out targets and their predictions.
# The target is converted to a plain array so the preview gets a fresh 0..n-1
# index instead of the shuffled row labels from the split.
n_preview = 5
pd.DataFrame({
    "Actual MPG": y_test.to_numpy()[:n_preview],
    "Predicted MPG": y_pred[:n_preview],
})



Unnamed: 0,Actual MPG,Predicted MPG
0,33.0,32.863457
1,28.0,29.581302
2,19.0,21.351261
3,13.0,16.805687
4,14.0,12.501362


In [31]:
# Coefficient of determination (R^2) of the predictions on the held-out test
# split. r2_score is imported with the other scikit-learn imports at the top
# of the notebook rather than in this cell.
r2_score(y_test, y_pred)


0.8475731044779433