<a href="https://colab.research.google.com/github/oprostep/2023-spring/blob/main/linear_models_for_regression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [28]:
# start by importing all possibly useful libraries
# source: https://www.kaggle.com/code/patrickjellison/polynomial-regression/notebook

import pandas as pd
import numpy as np
import math
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import precision_score, recall_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn import metrics
from sklearn import preprocessing
from sklearn.svm import SVR

from sklearn.metrics import r2_score
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import mean_squared_error
from statistics import mean

import xgboost as xgb

In [2]:
# Load the auto-mpg dataset (CSV file hosted in the course repository)

targetUrl = "https://raw.githubusercontent.com/oprostep/2023-spring/main/auto-mpg.csv"

# The file is comma-separated, which is what read_csv assumes by default.
df = pd.read_csv(targetUrl)
df.head(5)

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,car name
0,18.0,8,307.0,130,3504,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,165,3693,11.5,70,1,buick skylark 320
2,18.0,8,318.0,150,3436,11.0,70,1,plymouth satellite
3,16.0,8,304.0,150,3433,12.0,70,1,amc rebel sst
4,17.0,8,302.0,140,3449,10.5,70,1,ford torino


In [3]:
# Horsepower column has some non-numeric rows (shown as '?' in the raw data).
#
# pd.to_numeric with errors='coerce' turns every entry that cannot be parsed
# as a number into NaN, so the NaN mask selects exactly the problem rows.
# Unlike calling str.isnumeric() on each value, this does not raise when the
# column already holds numbers (e.g. the cell is re-run after cleaning) and it
# does not flag parseable values such as decimals.

horsepower_parsed = pd.to_numeric(df['horsepower'], errors='coerce')
df_horsepower_non_numeric = df[horsepower_parsed.isna()]
df_horsepower_non_numeric

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,car name
32,25.0,4,98.0,?,2046,19.0,71,1,ford pinto
126,21.0,6,200.0,?,2875,17.0,74,1,ford maverick
330,40.9,4,85.0,?,1835,17.3,80,2,renault lecar deluxe
336,23.6,4,140.0,?,2905,14.3,80,1,ford mustang cobra
354,34.5,4,100.0,?,2320,15.8,81,2,renault 18i
374,23.0,4,151.0,?,3035,20.5,82,1,amc concord dl


In [6]:
# Fill it with median values
#
# '?' placeholders become NaN so the column can be converted to a numeric
# dtype; the gaps are then filled with the median of the remaining values.

df['horsepower'] = pd.to_numeric(df['horsepower'].replace('?', np.nan))
df['horsepower'] = df['horsepower'].fillna(df['horsepower'].median())

# remove categorical labels
#
# errors='ignore' keeps this cell re-runnable: on a second execution the
# columns are already gone and a plain drop would raise KeyError.

df = df.drop(columns=['origin', 'car name'], errors='ignore')
df.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year
0,18.0,8,307.0,130.0,3504,12.0,70
1,15.0,8,350.0,165.0,3693,11.5,70
2,18.0,8,318.0,150.0,3436,11.0,70
3,16.0,8,304.0,150.0,3433,12.0,70
4,17.0,8,302.0,140.0,3449,10.5,70


In [7]:
# Variable X and y
#
# 'Unnamed: 0' holds the running row number (0, 1, 2, ...) that came with the
# CSV; it is not a property of the car, so it is excluded from the features
# together with the target. errors='ignore' covers the case where that column
# is not present in df.
X = df.drop(columns=['mpg', 'Unnamed: 0'], errors='ignore')
y = df['mpg']

# Show the first few target values.
y.head()

0    18.0
1    15.0
2    18.0
3    16.0
4    17.0
Name: mpg, dtype: float64

In [15]:
# Build and Train the Model

model = LinearRegression()

# Hold out 20% of the rows for testing; the fixed random_state makes the
# split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 4)
model.fit(X_train, y_train)

# Quick sanity check: first few fitted values next to their true labels.
y_hat_train = model.predict(X_train)
print("predictions:", y_hat_train[:5])
print("labels:", list(y_train[:5]))

# Evaluate on the held-out rows. The square root of the mean squared error is
# the RMSE (same units as mpg), so it is labelled as such.
y_hat_test = model.predict(X_test)
print("RMSE:", math.sqrt(mean_squared_error(y_test, y_hat_test)))
score = r2_score(y_test, y_hat_test)
print("R Squared:", score)

predictions: [19.63333917  6.24020066 30.48363183 19.99384556 28.69897459]
labels: [18.0, 12.0, 29.5, 15.0, 26.0]
MSE: 3.3113804324811587
R Squared: 0.8295169384211406


In [20]:
# Polynomial regression: for each degree from 1 to 5, expand the features with
# PolynomialFeatures, fit a LinearRegression on the expanded training rows and
# report the fit on the held-out rows. Degree 1 corresponds to the plain
# linear model; compare the test scores across degrees to spot overfitting.
for degree in range(1, 6):
    print("degree:", degree)

    polynomial_model = PolynomialFeatures(degree = degree)
    poly_X = polynomial_model.fit_transform(X)

    # Same test_size / random_state for every degree, so all degrees are
    # evaluated on the same held-out rows.
    X_train, X_test, y_train, y_test = train_test_split(poly_X, y, test_size = 0.2, random_state = 4)

    # polynomial_model is intentionally NOT fitted again on X_train: X_train
    # already contains the expanded features, and refitting the transformer on
    # them would leave it expecting the expanded column count as its input.
    regression_model = LinearRegression()
    regression_model.fit(X_train, y_train)

    # Quick sanity check: first few fitted values next to their true labels.
    y_hat_train = regression_model.predict(X_train)
    print("predictions:", y_hat_train[:5])
    print("labels:", list(y_train[:5]))

    # The square root of the mean squared error is the RMSE, so label it so.
    y_hat_test = regression_model.predict(X_test)
    print("RMSE:", math.sqrt(mean_squared_error(y_test, y_hat_test)))
    score = r2_score(y_test, y_hat_test)
    print("R Squared:", score)
    print("\n")

predictions: [19.63333917  6.24020066 30.48363183 19.99384556 28.69897459]
labels: [18.0, 12.0, 29.5, 15.0, 26.0]
MSE: 3.3113804324811498
R Squared: 0.8295169384211415


predictions: [17.6963724  13.65860251 30.7200584  16.80472661 28.71593859]
labels: [18.0, 12.0, 29.5, 15.0, 26.0]
MSE: 2.639675484442832
R Squared: 0.891666154946217


predictions: [18.05261682 12.71652304 28.4677931  17.08074813 27.36238277]
labels: [18.0, 12.0, 29.5, 15.0, 26.0]
MSE: 4.74009303350084
R Squared: 0.6506692543104808


predictions: [18.10277544 16.72219522 25.73394984 15.5897303  26.97140903]
labels: [18.0, 12.0, 29.5, 15.0, 26.0]
MSE: 15.297604568303726
R Squared: -2.6383957923275285


predictions: [19.16646793 11.37763083 35.08049317 18.26428604 27.35569758]
labels: [18.0, 12.0, 29.5, 15.0, 26.0]
MSE: 6836.569049742741
R Squared: -726672.7776974542


