# Sklearn

In [1]:
#Importing pandas and loading data
import pandas as pd
df = pd.read_csv('auto_mpg.csv')
df

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,name
0,18.0,8,307.0,130.0,3504,12.0,70,usa,chevrolet chevelle malibu
1,15.0,8,350.0,165.0,3693,11.5,70,usa,buick skylark 320
2,18.0,8,318.0,150.0,3436,11.0,70,usa,plymouth satellite
3,16.0,8,304.0,150.0,3433,12.0,70,usa,amc rebel sst
4,17.0,8,302.0,140.0,3449,10.5,70,usa,ford torino
...,...,...,...,...,...,...,...,...,...
393,27.0,4,140.0,86.0,2790,15.6,82,usa,ford mustang gl
394,44.0,4,97.0,52.0,2130,24.6,82,europe,vw pickup
395,32.0,4,135.0,84.0,2295,11.6,82,usa,dodge rampage
396,28.0,4,120.0,79.0,2625,18.6,82,usa,ford ranger


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 0 to 397
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   mpg           398 non-null    float64
 1   cylinders     398 non-null    int64  
 2   displacement  398 non-null    float64
 3   horsepower    392 non-null    float64
 4   weight        398 non-null    int64  
 5   acceleration  398 non-null    float64
 6   model_year    398 non-null    int64  
 7   origin        398 non-null    object 
 8   name          398 non-null    object 
dtypes: float64(4), int64(3), object(2)
memory usage: 28.1+ KB


In [2]:
#Dropping null values
df.dropna(inplace = True)
df.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 392 entries, 0 to 397
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   mpg           392 non-null    float64
 1   cylinders     392 non-null    int64  
 2   displacement  392 non-null    float64
 3   horsepower    392 non-null    float64
 4   weight        392 non-null    int64  
 5   acceleration  392 non-null    float64
 6   model_year    392 non-null    int64  
 7   origin        392 non-null    object 
 8   name          392 non-null    object 
dtypes: float64(4), int64(3), object(2)
memory usage: 30.6+ KB


In [5]:
#Creating matrix of predictors
X = df.iloc[:, 1:8]
#Setting target
y = df.iloc[:, 0]
print(X)
print(y)


     cylinders  displacement  horsepower  weight  acceleration  model_year  \
0            8         307.0       130.0    3504          12.0          70   
1            8         350.0       165.0    3693          11.5          70   
2            8         318.0       150.0    3436          11.0          70   
3            8         304.0       150.0    3433          12.0          70   
4            8         302.0       140.0    3449          10.5          70   
..         ...           ...         ...     ...           ...         ...   
393          4         140.0        86.0    2790          15.6          82   
394          4          97.0        52.0    2130          24.6          82   
395          4         135.0        84.0    2295          11.6          82   
396          4         120.0        79.0    2625          18.6          82   
397          4         119.0        82.0    2720          19.4          82   

     origin  
0       usa  
1       usa  
2       usa  
3      

In [6]:
X = pd.get_dummies(X)
X

Unnamed: 0,cylinders,displacement,horsepower,weight,acceleration,model_year,origin_europe,origin_japan,origin_usa
0,8,307.0,130.0,3504,12.0,70,0,0,1
1,8,350.0,165.0,3693,11.5,70,0,0,1
2,8,318.0,150.0,3436,11.0,70,0,0,1
3,8,304.0,150.0,3433,12.0,70,0,0,1
4,8,302.0,140.0,3449,10.5,70,0,0,1
...,...,...,...,...,...,...,...,...,...
393,4,140.0,86.0,2790,15.6,82,0,0,1
394,4,97.0,52.0,2130,24.6,82,1,0,0
395,4,135.0,84.0,2295,11.6,82,0,0,1
396,4,120.0,79.0,2625,18.6,82,0,0,1


In [7]:
#train test split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)


In [8]:
#Applying standard scaler on the data
from sklearn.preprocessing import StandardScaler
scale = StandardScaler()
scale.fit_transform(X_train)
scale.transform(X_test);


In [9]:
#Importing and fitting the model on training set
from sklearn.linear_model import LinearRegression
reg = LinearRegression()


In [10]:
#Fitting the model on training data :
reg.fit(X_train, y_train)


LinearRegression()

In [11]:
#Checking the coefficient(slope) and intercepts.
m = reg.coef_
c = reg.intercept_
m,c


(array([-0.38946904,  0.02158376, -0.01237154, -0.00700083,  0.12954429,
         0.76774449,  0.83388251,  0.71982383, -1.55370633]),
 -16.20312300058094)

In [12]:
#Predicting the target: mpg against the predictors in the training data set
#Predicted data stored in y_pred_train
y_pred_train = reg.predict(X_train)


In [13]:
# Prediction Accuracy in terms of how close is the predicted value of target: mpg
# to the real value in training data set
from sklearn.metrics import r2_score
r2_S = r2_score(y_train, y_pred_train)
r2_S


0.8194239716903474

In [14]:
# Predicting the target: mpg against the predictors in the testing data set
# Predicted data stored in y_pred_test
y_pred_test = reg.predict(X_test)


In [15]:
# Prediction Accuracy in terms of how close the predicted value of target: mpg
# to the real value in testing data set
r2_S = r2_score(y_test, y_pred_test)
r2_S


0.8387519287083123