In [None]:
import seaborn as sns
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings

In [None]:
# Load the cleaned cars dataset and preview its first five rows.
df = pd.read_csv(filepath_or_buffer='../data/cars_clean.csv')
display(df.head(n=5))

Sanity Checks

In [None]:
# (rows, columns) of the loaded frame.
df.shape

In [None]:
# Summary statistics from DataFrame.describe(), used as a quick sanity check.
df.describe()

In [None]:
# Pairwise correlations among the size-related columns.
df[[
    'length',
    'width',
    'curb-weight',
    'engine-size',
]].corr()

In [None]:
# Correlation matrix restricted to numeric columns (numeric_only=True).
df.corr(numeric_only=True)

In [None]:
# Price is the response variable in every model in this notebook, so it goes on
# the y-axis; horsepower is the candidate predictor on the x-axis. With the axes
# the other way round, the fitted line would regress horsepower on price.
sns.regplot(x='horsepower', y='price', data=df)

In [None]:
# Price distribution for each body style.
sns.boxplot(data=df, x='body-style', y='price')

In [None]:
# Number of cars per body style.
sns.countplot(data=df, x='body-style')

In [None]:
# Horizontal box plot: price distribution for each make.
sns.boxplot(data=df, x='price', y='make')

In [None]:
# Number of rows for each distinct drive-wheels value.
df['drive-wheels'].value_counts()

In [None]:
# Count the rows per drive-wheels category as a one-column frame.
# Naming the Series before `to_frame()` sets the column label directly. The
# previous `rename(columns={'drive-wheels': ...})` relied on the column name
# that `value_counts()` produces, which differs between pandas versions
# ('drive-wheels' before 2.0, 'count' from 2.0 on), so it could silently do
# nothing.
data_drive_wheels_counts = (
    df['drive-wheels']
    .value_counts()
    .rename('value_counts')
    .to_frame()
)
data_drive_wheels_counts.index.name = 'drive-wheels'
data_drive_wheels_counts

In [None]:
# Mean price per make.
# A bare `group_1` expression in the middle of the cell was dropped: only a
# cell's last expression is rendered, so it had no effect.
group_1 = df[['make', 'price']]
grouped_mean_1 = group_1.groupby(['make'], as_index=False).mean()
grouped_mean_1

In [None]:
# Mean price for every (drive-wheels, engine-location) combination.
group_2 = df[['drive-wheels', 'engine-location', 'price']]
grouped_mean_2 = group_2.groupby(
    by=['drive-wheels', 'engine-location'],
    as_index=False,
).mean()
grouped_mean_2

In [None]:
# Reshape the group means: drive-wheels as rows, engine-location as columns.
# NaN cells produced by the pivot are replaced with 0.
grouped_pivot_1 = (
    grouped_mean_2
    .pivot(index='drive-wheels', columns='engine-location')
    .fillna(0)
)
grouped_pivot_1

In [None]:
# Same group means, transposed layout: engine-location as rows, drive-wheels
# as columns. NaN cells produced by the pivot are replaced with 0.
grouped_pivot_2 = (
    grouped_mean_2
    .pivot(index='engine-location', columns='drive-wheels')
    .fillna(0)
)
grouped_pivot_2

In [None]:
# The p-value is the probability of obtaining results at
# least as extreme as the observed results, assuming the null hypothesis is true.

# P-Value < 0.001: Strong evidence that the correlation coefficient is statistically significant
# P-Value < 0.05: Moderate evidence that the correlation coefficient is statistically significant
# P-Value < 0.1: Weak evidence that the correlation coefficient is statistically significant
# P-Value > 0.1: No evidence that the correlation coefficient is statistically significant

In [None]:
# Install packages via the %pip magic (unpinned versions).
# scipy is imported by the next cell; altair and plotly are not imported in any
# cell visible in this part of the notebook.
%pip install scipy
%pip install altair
%pip install plotly

In [None]:
# Pearson correlation between engine size and price; the cell shows the p-value.
from scipy import stats
import pandas as pd

p_coef, p_val = stats.pearsonr(x=df['engine-size'], y=df['price'])
p_val

Analysis of Variance (ANOVA)
- Statistical Method to Test whether there are Significant Differences between the Means of Two or More Groups
- ANOVA returns two parameters
    1.   F-test score
    2.   P-Value
- If price is strongly correlated with the variable being analyzed, we expect ANOVA to return a large F-test score and a small p-value.



In [None]:
# Per-row drive-wheels / engine-location / price columns used as ANOVA input.
group_2

In [None]:
# Average price for each drive-wheels category.
group_3 = (
    group_2[['drive-wheels', 'price']]
    .groupby(by=['drive-wheels'], as_index=False)
    .mean()
)
group_3

In [None]:
# Price entry of the 4wd row in the per-category means.
is_4wd = group_3['drive-wheels'] == '4wd'
group_3_4wd = group_3.loc[is_4wd, 'price']
display(group_3_4wd)

In [None]:
# One-way ANOVA: does mean price differ across drive-wheels categories?
# The test needs the individual price observations of each category, so the
# samples come from `group_2` (the per-row columns of `df`). `group_3` holds only
# one mean per category, which leaves no within-group variance and makes the
# F statistic undefined.
prices_by_drive = group_2.groupby('drive-wheels')['price']

f_val, p_val = stats.f_oneway(
    prices_by_drive.get_group('fwd'),
    prices_by_drive.get_group('4wd'),
    prices_by_drive.get_group('rwd'),
)

f_val, p_val

Linear Regression Model
y = a + b * x

In [None]:
# Linear regression estimator; the same `lm` instance is refit by later cells.
from sklearn.linear_model import LinearRegression
lm = LinearRegression()

In [None]:
# Simple linear regression: price explained by engine size alone.
X = df.loc[:, ['engine-size']]   # 2-D frame holding the single predictor
Y = df.loc[:, 'price']           # 1-D target
lm.fit(X, Y)

price = -7983 + 167 * engine-size

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import r2_score, mean_squared_error

# Hold out 30% of the rows to estimate out-of-sample performance.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=1)

# Fit a separate estimator on the training split. Refitting the shared `lm`
# here would overwrite its full-data fit, so the later `lm.score(X, Y)` cell
# would score a model trained on only the training rows.
lm_holdout = LinearRegression()
lm_holdout.fit(X_train, Y_train)
y_pred = lm_holdout.predict(X_test)

# Evaluate the model using appropriate regression metrics
print("R2 Score:", r2_score(Y_test, y_pred))
print("Mean Squared Error:", mean_squared_error(Y_test, y_pred))

In [None]:
# Scatter of price against engine size with a fitted regression line;
# the lower y-limit is set to zero.
sns.regplot(data=df, x='engine-size', y='price')
plt.ylim(bottom=0)

In [None]:
# Residuals of the price ~ engine-size regression.
sns.residplot(data=df, x='engine-size', y='price')

In [None]:
# Coefficient of determination (R^2) of the current `lm` fit, evaluated on the
# full engine-size / price data.
lm.score(X, Y)

Multiple Linear Regression

In [None]:
# Refit `lm` on two predictors: curb weight and engine size.
z = df.loc[:, ['curb-weight', 'engine-size']]
lm.fit(z, Y)

In [None]:
# R^2 of the current `lm` fit on the predictors in `z`.
lm.score(z, Y)

In [None]:
# Intercept term of the current `lm` fit.
lm.intercept_

In [None]:
# Show the fitted coefficients explicitly: only a cell's last expression is
# rendered automatically, so a bare `lm.coef_` ahead of `df.info()` was
# silently discarded.
display(lm.coef_)
df.info()

Multiple Linear Regression Model

price = -14230 + 112 * engine-size + 5 * curb-weight

In [None]:
# Three-predictor model, plus the Pearson p-value for horsepower vs. price.
z = df.loc[:, ['curb-weight', 'engine-size', 'horsepower']]
lm.fit(z, Y)

p_coef, p_val = stats.pearsonr(x=df['horsepower'], y=df['price'])
p_val

In [None]:
# R^2 of the current `lm` fit on the predictors in `z`.
lm.score(z, Y)


In [None]:
# Report the fitted parameters of the current `lm`; the output label
# previously misspelled "Intercept".
print('Coef: ', lm.coef_, " Intercept:", lm.intercept_)

Higher P-Value, Lower R^2 coefficient of determination

In [None]:
# NOTE(review): the original selection listed 'engine-size' twice. A duplicated
# predictor is perfectly collinear with itself, so the fit cannot assign it a
# unique coefficient. The duplicate is dropped here; if a different third
# predictor was intended, add it to this list.
z = df[['curb-weight', 'engine-size']]
lm.fit(z, Y)
p_coef, p_val = stats.pearsonr(df['engine-size'], df['price'])
p_val

In [None]:
# R^2 of the current `lm` fit on the predictors in `z`.
lm.score(z, Y)

In [None]:
# Report the fitted parameters of the current `lm`; the output label
# previously misspelled "Intercept".
print('Coef: ', lm.coef_, " Intercept:", lm.intercept_)

Fit Polynomial Model (cubic, degree 3)

In [None]:
def PlotPoly(model, x, y, Name, x_range=(1400, 5000), num_points=1000):
  """Plot observed prices together with a fitted polynomial curve.

  Args:
    model: Callable mapping an array of x values to predicted values
      (for example an ``np.poly1d`` instance).
    x: Observed predictor values, drawn as points.
    y: Observed prices, drawn as points.
    Name: Predictor name; used for the x-axis label and in the title.
    x_range: ``(start, stop)`` of the interval over which the curve is
      evaluated. Defaults to the previously hard-coded ``(1400, 5000)``.
    num_points: Number of evenly spaced points used to draw the curve.
  """
  start, stop = x_range
  x_new = np.linspace(start, stop, num_points)
  y_new = model(x_new)

  plt.plot(x, y, '.', x_new, y_new, '-')
  # The title follows `Name`; it used to read "Engine Size" for every predictor.
  plt.title(f'Polynomial Fit for Price ~ {Name}')
  ax = plt.gca()
  ax.set_facecolor((0.898, 0.898, 0.898))
  plt.xlabel(Name)
  plt.ylabel('Price')
  plt.show()

In [None]:
# Fit a third-degree polynomial of price on curb weight, print it, and plot it.
x = df['curb-weight']
y = df['price']

f = np.polyfit(x, y, deg=3)   # fitted coefficients
p = np.poly1d(f)              # callable polynomial built from them
print(p)

PlotPoly(p, x, y, 'curb-weight')