In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

In [None]:
# Load the cleaned cars dataset; the path is relative to the notebook's working directory
cars_csv = '../data/cars_clean.csv'
df = pd.read_csv(cars_csv)

In [None]:
#sanity checks
# First/last rows, dimensions, element count, dtypes and summary statistics.
print(df.head())
print(df.tail())
print(df.shape)
print(df.size)
# DataFrame.info() writes its report to stdout itself and returns None,
# so wrapping it in print() would append a stray "None" line to the output.
df.info()
print(df.describe())

In [None]:
#correlations
# Pairwise Pearson correlation matrix over the numeric columns only
df.corr(method='pearson', numeric_only=True)

In [None]:
#find the correlation among a subset of columns
# The column list is named once so the selection is easy to read and adjust
subset_columns = ['length', 'width', 'curb-weight', 'engine-size', 'wheel-base']
df[subset_columns].corr(numeric_only=True)

In [None]:
#seaborn to make visuals

# Scatter plot of price against engine-size with a fitted regression line;
# the y axis is anchored at 0 via the Axes that regplot returns.
engine_ax = sns.regplot(data=df, x='engine-size', y='price')
engine_ax.set_ylim(bottom=0)

In [None]:
# Scatter plot of price against highway-mpg with a fitted regression line
sns.regplot(data=df, x='highway-mpg', y='price')

In [None]:
# Summary statistics restricted to the object-typed columns
object_dtypes = ['object']
df.describe(include=object_dtypes)

In [None]:
# Price distribution for each drive-wheels category
sns.boxplot(data=df, x='drive-wheels', y='price')

In [None]:
# Price distribution for each body-style category
sns.boxplot(data=df, x='body-style', y='price')

In [None]:
# Price distribution for each engine-location category
sns.boxplot(data=df, x='engine-location', y='price')

In [None]:
# Horizontal box plot: one box per make (y axis), price along the x axis
sns.boxplot(data=df, y='make', x='price')
#rotate label if too long
plt.xticks(rotation=90)
plt.yticks(rotation=0)

In [None]:
#descriptive statistical analysis
# Number of rows in each drive-wheels category
drive_wheels = df['drive-wheels']
drive_wheels.value_counts()

In [None]:
#convert to dataframe

# Name the counts column directly when building the frame. The Series returned
# by value_counts() is named 'count' on pandas >= 2.0 (and after the source
# column on older versions), so renaming a 'drive-wheels' column afterwards
# silently does nothing on current pandas.
drive_wheels_counts = df['drive-wheels'].value_counts().to_frame(name='drive_wheels_counts')
drive_wheels_counts.index.name = 'drive-wheels'
drive_wheels_counts

In [None]:
#grouping data

# Keep only the two columns needed for the per-make average
group_1 = df[['make', 'price']]

# Mean price per make; as_index=False keeps 'make' as an ordinary column
grouped_mean_1 = group_1.groupby(by=['make'], as_index=False).mean()
grouped_mean_1

In [None]:
# create another group
# Columns needed to average price by drive-wheels and engine-location
group_2 = df[['drive-wheels', 'engine-location', 'price']]

# Mean price for every (drive-wheels, engine-location) pair present in the data
grouping_keys = ['drive-wheels', 'engine-location']
grouped_mean_2 = group_2.groupby(by=grouping_keys, as_index=False).mean()
grouped_mean_2

In [None]:
#Pivoting the table
# Rows are drive-wheels, columns are engine-location; missing cells are filled with 0
grouped_pivot_1 = (
    grouped_mean_2
    .pivot(index='drive-wheels', columns='engine-location')
    .fillna(0)
)
grouped_pivot_1

In [None]:
#restructure table
# Same pivot with the axes swapped: rows are engine-location, columns are drive-wheels
grouped_pivot_2 = (
    grouped_mean_2
    .pivot(index='engine-location', columns='drive-wheels')
    .fillna(0)
)
grouped_pivot_2

In [None]:
#p-values
#the probability, assuming the null hypothesis is true, of obtaining a test statistic at least as extreme as the one we observed

# p-value < 0.001: strong evidence that the correlation coefficient is statistically significant
# (the p-value is not the probability that the correlation is significant; a smaller p-value means stronger evidence against the null hypothesis of no correlation)
# p-value < 0.05: moderate evidence that the correlation coefficient is statistically significant
# p-value < 0.1: weak evidence that the correlation coefficient is statistically significant
# p-value > 0.1: no evidence that the correlation coefficient is statistically significant

In [None]:
#import the stats library
from scipy import stats

In [None]:
#compute the Pearson correlation coefficient and its p-value
pearson_result = stats.pearsonr(df['engine-size'], df['price'])
p_coef, p_value = pearson_result
print('The Pearson Correlation Coefficient is', p_coef, 'with a P-value of P =', p_value)

In [None]:
#analysis of variance (ANOVA)
#Analysis of Variance (ANOVA) is a statistical method used to test whether there are significant differences between the means of two or more groups.
#ANOVA returns two parameters
# F-test score
# P-value
# If price is strongly correlated with the variable that we are analyzing, we expect ANOVA to return a large F-test score and a small p-value


In [None]:
#check group 2
# Preview the first rows through the notebook's rich display rather than
# print()-ing the whole frame as plain text.
group_2.head()

In [None]:
#make another group
group_3 = df[['body-style', 'price']]

# Group prices by drive-wheels; the groupby object itself is kept because
# later cells pull individual groups out of it.
drive_wheel_prices = group_2[['drive-wheels', 'price']]
grouped_drive_wheels = drive_wheel_prices.groupby(['drive-wheels'], as_index=False)
mean_price_by_drive_wheels = grouped_drive_wheels.mean()
mean_price_by_drive_wheels

In [None]:
# Prices of the rows in the '4wd' group; the key is passed as a 1-tuple
# because the grouping was built from a one-element list of columns.
four_wd_group = grouped_drive_wheels.get_group(('4wd',))
four_wd_group['price']

In [None]:
# Perform ANOVA on the grouped data
# Collect one price sample per drive-wheels category, then run a one-way ANOVA across them
price_samples = [
    grouped_drive_wheels.get_group((wheel_kind,))['price']
    for wheel_kind in ('4wd', 'fwd', 'rwd')
]
f_val, p_val = stats.f_oneway(*price_samples)

print("ANOVA results: F-value =", f_val, ", P-value =", p_val)

In [None]:
## linear regression model
#y = a + b * x

In [None]:
#import libraries
from sklearn.linear_model import LinearRegression

In [None]:
#create the linear regression object
# fit_intercept=True is the estimator's default, spelled out here for readability
lm = LinearRegression(fit_intercept=True)

In [None]:
#set up our input parameters for the Linear Regression Model
# X is selected with a list so it stays a one-column DataFrame; Y is the price Series
feature_columns = ['engine-size']
X = df[feature_columns]
Y = df['price']

In [None]:
# fit the linear model
# Fits price (Y) on engine-size (X); the fitted estimator is the cell's output
lm.fit(X, y=Y)

In [None]:
#print the intercept
intercept = lm.intercept_
print('The intercept is', intercept)

In [None]:
# Raw fitted coefficient array of the current model
coefficients = lm.coef_
print(coefficients)

In [None]:
# Raw fitted intercept of the current model
intercept_value = lm.intercept_
print(intercept_value)

In [None]:
#print the slope, i.e. the fitted coefficient array of the model
slope = lm.coef_
print('The slope of the coefficient is', slope)

In [None]:
# Our linear regression model is y = 166.86001569x -7963.338906281042
# price = -7963.338 + 166.86 * engine-size

In [None]:
#visual check
# Scatter of price against engine-size with seaborn's regression line;
# the y axis is anchored at 0 via the Axes that regplot returns.
check_ax = sns.regplot(data=df, x='engine-size', y='price')
check_ax.set_ylim(bottom=0)

In [None]:
#create a residual plot
# Residuals of a linear fit of price on engine-size, using the same
# data=/column-name calling style as the other seaborn plots in this notebook
sns.residplot(data=df, x='engine-size', y='price')

In [None]:
#coefficient of determination
#compute the coefficient of determination

In [None]:
#compute the coefficient of determination
# score() is evaluated on the same X and Y the model was fitted on
r_squared = lm.score(X, Y)
print('The R-square is: ', r_squared)

In [None]:
# multiple linear regression

In [None]:
#define other parameters
# Two predictor columns for the multiple regression
multi_feature_columns = ['engine-size', 'curb-weight']
z = df[multi_feature_columns]

In [None]:
#create the model
# NOTE: this refits the existing `lm` estimator, so its earlier single-feature
# fit on X is replaced; cells that read lm.coef_ / lm.intercept_ or call
# lm.score(X, Y) reflect whichever fit ran last.
lm.fit(z, y=Y)

In [None]:
#compute the coefficient of determination
# score() is evaluated on the two-feature frame z and the same target Y
r_squared_multi = lm.score(z, Y)
print('The R-square is: ', r_squared_multi)