In [None]:
import pandas as pd
import numpy as np

In [None]:
# URL of the raw car-price CSV; later cells download it with wget and read it as data.csv.
data = 'https://raw.githubusercontent.com/alexeygrigorev/mlbookcamp-code/refs/heads/master/chapter-02-car-price/data.csv'

In [None]:
!wget $data

In [None]:
# Load the downloaded file; wget stores it under the URL's basename, data.csv.
df = pd.read_csv('data.csv')

In [None]:
df.head()

In [None]:
# Normalise column names: lower-case them and replace spaces with underscores so
# every column can be accessed as an attribute (e.g. df.msrp).
df.columns = df.columns.str.lower().str.replace(' ', '_')

In [None]:
# Names of all columns whose dtype is object; these get the same text
# normalisation as the column names.
strings = [name for name, dtype in df.dtypes.items() if dtype == 'object']
strings

In [None]:
# Apply the column-name convention to the values too: lower-case every
# object-dtype column and replace spaces with underscores.
for col in strings:
    df[col] = df[col].str.lower().str.replace(' ','_')

In [None]:
df.head()

In [None]:
df.dtypes

#### Exploratory Data Analysis

In [None]:
# For every column print its name, the first five distinct values and the
# number of distinct values, one item per line.
for col in df.columns:
    print(col, df[col].unique()[:5], df[col].nunique(), sep='\n')

In [None]:
df

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

In [None]:
# Histogram of the price column (msrp) over its full range.
sns.histplot(df['msrp'], bins=50)

In [None]:
# Same histogram restricted to prices below 100000, so the bulk of the
# distribution is not squashed by the long tail.
sns.histplot(df.loc[df['msrp'] < 100000, 'msrp'], bins=50)

In [None]:
# log1p(x) computes log(1 + x); shown here on a few values spanning several orders of magnitude.
np.log1p([1, 10, 100, 1000])

In [None]:
# The same quantity written out by hand: log applied to each value plus one.
np.log([1 + 1, 10 + 1, 100 + 1, 1000 + 1])

In [None]:
# Log-transform the prices with log(1 + price).
price_logs = np.log1p(df['msrp'])

In [None]:
price_logs

In [None]:
# Histogram of the log-transformed prices.
sns.histplot(price_logs, bins = 50)

In [None]:
df.isnull()

In [None]:
# Number of missing values in each column.
df.isnull().sum()

#### Setting up the validation framework

In [None]:
# Total number of rows; the split sizes below are derived from it.
n = len(df)
print(n)

In [None]:
# Validation and test each take 20% of the rows (rounded down); training takes
# whatever is left, so the three sizes always add up to n.
n_val = n_test = int(n * 0.2)
n_train = n - (n_val + n_test)
print(n_val, n_test, n_train)

In [None]:
n, n_val + n_test + n_train

In [None]:
# First attempt at a sequential (unshuffled) split, in val / test / train order.
# The same three names are reassigned by the following cells.
df_val = df.iloc[:n_val]
df_test = df.iloc[n_val:n_val + n_test]
df_train = df.iloc[n_val + n_test:]

In [None]:
# Sequential (unshuffled) split in train / val / test order.
# The test slice must run from the end of the validation rows to the end of the
# frame. An end bound of n_val + n_test is smaller than the start bound
# n_train + n_val whenever n_train > n_test, which leaves df_test empty.
df_train = df.iloc[:n_train]
df_val = df.iloc[n_train:n_train + n_val]
df_test = df.iloc[n_train + n_val:]

In [None]:
# Row positions 0..n-1 in random order; the fixed seed makes the shuffle
# (and therefore the split) reproducible.
idx = np.arange(n)
np.random.seed(2)
np.random.shuffle(idx)

In [None]:
# Cut the shuffled positions at the train/val and val/test boundaries and pick
# the corresponding rows for each of the three partitions.
df_train, df_val, df_test = (
    df.iloc[part] for part in np.split(idx, [n_train, n_train + n_val])
)

In [None]:
# Replace the shuffled original row labels with a fresh 0..len-1 index in each partition.
df_train, df_val, df_test = (
    frame.reset_index(drop=True) for frame in (df_train, df_val, df_test)
)

In [None]:
len(df_train), len(df_val), len(df_test)

In [None]:
# Targets: log(1 + price) for each partition, as plain NumPy arrays.
y_train = np.log1p(df_train['msrp'].values)
y_val = np.log1p(df_val['msrp'].values)
y_test = np.log1p(df_test['msrp'].values)

In [None]:
# The target now lives in y_train / y_val / y_test, so drop it from the frames
# (presumably so it cannot be used as a feature by accident).
# NOTE: not idempotent; running this cell a second time raises KeyError.
del df_train['msrp']
del df_val['msrp']
del df_test['msrp']

In [None]:
len(y_train)

## 2.5 Linear regression

In [None]:
df_train.iloc[10]

In [None]:
xi = [453, 11, 86]

In [None]:
def g(xi):
    """Placeholder model: ignores the features ``xi`` and always predicts 10000."""
    constant_prediction = 10000
    return constant_prediction

In [None]:
g(xi)

In [None]:
# Toy example: one feature vector and hand-written model parameters.
xi = [453, 11, 86]
w0 = 7.17  # bias term: the prediction when every feature is 0
w = [0.01, 0.04, 0.002]  # one weight per entry of xi

In [None]:
def linear_regression(xi):
    """Predict for one feature vector using the notebook-level ``w0`` and ``w``.

    Starts from the bias ``w0`` and adds ``w[j] * xi[j]`` for every position
    ``j`` of ``xi``, in order.
    """
    total = w0
    for position in range(len(xi)):
        contribution = w[position] * xi[position]
        total = total + contribution
    return total

In [None]:
price = linear_regression(xi)
price

In [None]:
np.expm1(price)

#### Linear regression vector form

In [None]:
def dot(xi, w):
    """Return the dot product of ``xi`` and ``w``.

    Products are accumulated left to right starting from ``0.0``. Only the first
    ``len(xi)`` entries of ``w`` are read; a shorter ``w`` raises ``IndexError``.
    """
    total = 0.0
    for position in range(len(xi)):
        total = total + xi[position] * w[position]
    return total

In [None]:
def linear_regression(xi):
    """Predict for one feature vector: the bias ``w0`` plus ``dot(xi, w)``.

    ``w0`` and ``w`` are read from the notebook namespace.
    """
    weighted_sum = dot(xi, w)
    return w0 + weighted_sum

In [None]:
price = linear_regression(xi)
price

In [None]:
# Fold the bias into the weight list as its first element.
w_new = [w0, *w]
w_new

In [None]:
def linear_regression(xi):
    """Predict for one feature vector using the combined weights ``w_new``.

    A constant 1 is put in front of the features so that it multiplies the
    bias stored as the first element of ``w_new``.
    """
    return dot([1] + xi, w_new)

In [None]:
price = linear_regression(xi)
price

In [None]:
# Re-state the toy feature vector, bias and weights used in the examples above.
xi = [453, 11, 86]
w0 = 7.17
w = [0.01, 0.04, 0.002]

In [None]:
# Three example rows; each starts with a constant 1 that multiplies the bias
# stored as the first element of w_new.
x1 = [1, 148, 24, 1385]
x2 = [1, 132, 25, 2031]
x10 = [1, 453, 11, 86]

# Stack the rows into a 2-D array, one row per example.
X = np.array([x1, x2, x10])
X

In [None]:
X.dot(w_new)

In [None]:
def linear_regression(X):
    """Predict for every row of ``X`` at once.

    Returns ``X.dot(w_new)``, where ``w_new`` is read from the notebook
    namespace.
    """
    predictions = X.dot(w_new)
    return predictions

In [None]:
linear_regression(X)

## 2.7 Training a linear regression model

In [None]:
def train_linear_regression(X, y):
    """Stub for the training routine; does nothing yet and returns ``None``."""
    return None

In [None]:
# Nine example rows with three numeric features each, built directly as a 2-D array.
X = np.array([
    [148, 24, 1385],
    [132, 25, 2031],
    [453, 11, 86],
    [158, 24, 185],
    [172, 25, 201],
    [413, 11, 86],
    [38, 54, 185],
    [142, 25, 431],
    [453, 31, 86],
])
X