In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load the CSV from the Kaggle input directory, decoding the file as ISO-8859-1.
df = pd.read_csv("/kaggle/input/chapter-2-carpricedata/data.csv", encoding="ISO-8859-1")

In [None]:
# Preview the first rows of the loaded data.
df.head()

In [None]:
# Normalise the headers: lowercase everything first, then turn spaces
# into underscores so every column name is snake_case.
lowercase_names = df.columns.str.lower()
df.columns = lowercase_names.str.replace(' ', '_')
df.columns

In [None]:
# Next, make the values inside the columns consistent as well (lowercase and snake case).
# Step 1: inspect the dtypes to find which columns hold strings (reported as 'object').
df.dtypes

In [None]:
# Boolean mask over the columns: True where the dtype is 'object' (i.e. strings).
is_string_col = df.dtypes == 'object'
# Only the names of those columns are needed, so keep the index of the masked dtypes.
strings = list(df.dtypes[is_string_col].index)
strings

In [None]:
# Demonstration on a single column: lowercase the values and replace spaces with underscores.
# The result is only displayed here; df itself is not modified by this cell.
df['make'].str.lower().str.replace(' ', '_')

In [None]:
# Apply the same clean-up to every string column: lowercase the values,
# then swap spaces for underscores, writing the result back into df.
for name in strings:
    lowered = df[name].str.lower()
    df[name] = lowered.str.replace(' ', '_')

df.head()

In [None]:
# Re-check the column dtypes after normalising the string columns.
df.dtypes

**Data analysis**

In [None]:
# Quick per-column overview: the column name, its first five distinct
# values, and how many distinct values it has in total.
for name in df.columns:
    column = df[name]
    print(name)
    print(column.unique()[:5])
    # number of unique values in this column
    print(column.nunique())
    print()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

In [None]:
# Histogram of prices, restricted to cars with msrp below 100000.
below_100k = df.msrp[df.msrp < 100000]
sns.histplot(below_100k, bins=50)

In [None]:
 np.log([0+ 1, 1+ 1, 10 + 1, 1000 + 1, 100000+ 1])

In [None]:
# np.log1p(x) computes log(1 + x), i.e. the log of each value plus one, in a single call.
np.log1p([0,1,10,1000,100000])

In [None]:
# Log-transform the prices with log(1 + msrp) to compress the long tail
# of the price distribution.
price_logs = np.log1p(df['msrp'])
price_logs

In [None]:
# Histogram of the log-transformed prices. The intent is for the long tail to disappear and
# the shape to look much closer to a normal (bell-shaped) distribution; inspect the plot to confirm.
sns.histplot(price_logs, bins=50)

In [None]:
# Count the missing values in each column.
# Keep these in mind for when we train the model.
df.isna().sum()

**Setting up the Validation Framework**

In [None]:
# Split sizes: 20% of the rows each for validation and test; the training
# set takes whatever remains, so the three sizes always add up to n.
n = len(df)
n_val = n_test = int(n * 0.2)
n_train = n - (n_val + n_test)
print(n, n_val + n_test + n_train)

In [None]:
# Build an array of row positions 0, 1, ..., n-1 (n entries in total).
idx = np.arange(n)
idx

In [None]:
# Next, the array of row positions will be shuffled.
# To make the results reproducible, fix the random seed first.
np.random.seed(2)


In [None]:
# Shuffle the row positions so the split is not sequential.
# np.random.shuffle works in place, so shuffling an already-shuffled idx would
# produce a different permutation every time this cell is re-run. Rebuilding
# idx and re-seeding here keeps the cell idempotent: it always yields the same
# order, and Restart & Run All gives the same result as before.
idx = np.arange(n)
np.random.seed(2)
np.random.shuffle(idx)
idx
# The resulting order may differ from the video because of a different NumPy version.

In [None]:
# Cut the shuffled positions into three consecutive chunks: the first n_train
# for training, the next n_val for validation, and the remainder for test.
train_rows, val_rows, test_rows = np.split(idx, [n_train, n_train + n_val])

# iloc selects rows by integer position, which is exactly what the chunks hold.
# Because idx was shuffled, none of the splits is a sequential slice of df.
df_train = df.iloc[train_rows]
df_val = df.iloc[val_rows]
df_test = df.iloc[test_rows]
df_train

In [None]:
# Rows were picked in shuffled order, so each split still carries the old,
# scrambled row labels. Replace them with a fresh 0..len-1 index and discard
# the old labels (drop=True) rather than keeping them as a column.
df_train, df_val, df_test = (
    part.reset_index(drop=True) for part in (df_train, df_val, df_test)
)

# These frames are the feature matrices X (the msrp target is removed from them further down).


In [None]:
# Targets y: the log1p-transformed msrp of each split, as plain NumPy arrays.
y_train, y_val, y_test = (
    np.log1p(part.msrp.values) for part in (df_train, df_val, df_test)
)


In [None]:
# Remove msrp from every split: the target variable must not stay in the
# feature frames that are used to train and evaluate the model.
for frame in (df_train, df_test, df_val):
    del frame['msrp']