In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# Reading the first wine dataset
wine1 = pd.read_csv(filepath_or_buffer="winemag-data_first150k.csv")

In [None]:
# Preview the first 10 rows of wine1
wine1.head(n=10)

In [None]:
# Load the second wine dataset from its CSV file
wine2 = pd.read_csv(filepath_or_buffer="winemag-data-130k-v2.csv")

In [None]:
# Preview the first 10 rows of wine2
wine2.head(n=10)

In [None]:
# Count missing values per column in wine1 and list them from fewest to most
wine1_na_counts = wine1.isna().sum()
print(wine1_na_counts.sort_values())

In [None]:
# Drop rows of wine1 with a missing country, province, points or price.
# points and price are the feature and response of the regression, so neither
# may contain NaN when the model is fitted.
wine1 = wine1.dropna(subset=["country", "province", "points", "price"])
# Re-print the per-column missing counts to confirm those columns are now clean
print(wine1.isna().sum().sort_values())

In [None]:
# scikit-learn expects the feature and the response in two separate variables
# (X and y) with compatible shapes, so pull each column out of wine1 as an array
X1_points, y1_price = wine1["points"].values, wine1["price"].values

In [None]:
# Show the Python type of the feature array and of the response array
print(*(type(arr) for arr in (X1_points, y1_price)))

In [None]:
# Show the shape of the feature array and of the response array
print(np.shape(X1_points), np.shape(y1_price))

In [None]:
# y may stay a 1-dimensional array, but scikit-learn only accepts the features
# as a 2-dimensional array. Reshape X to (-1, 1): a single column, with the
# number of rows inferred from the data.
X1_points = np.reshape(X1_points, (-1, 1))
print(np.shape(X1_points))

In [None]:
# Scatter plot of price against points for wine1
plt.scatter(X1_points, y1_price)
# Label the axes so the figure can be read on its own
plt.xlabel("Points")
plt.ylabel("Price")
plt.ylim(0, 1000)  # y-axis limited to 0-1000; anything above that is not drawn
plt.show()

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
# Instantiate an ordinary least squares model; the intercept is fitted
# (fit_intercept=True is scikit-learn's default, spelled out here)
model = LinearRegression(fit_intercept=True)

In [None]:
# Fit the model on wine1: points as the feature, price as the response
model.fit(X=X1_points, y=y1_price)

In [None]:
# Read the fitted slope(s) and intercept off the model, then report them
coefficients, intercept = model.coef_, model.intercept_
print('coefficients: ', coefficients)
print('intercept: ', intercept)

In [None]:
# Intercept is about -422 and slope is about 5.18 --> same results as with statsmodels
# Fitted equation: price = 5.18 * points - 422

In [None]:
# Count missing values per column in wine2 and list them from fewest to most
wine2_na_counts = wine2.isna().sum()
print(wine2_na_counts.sort_values())

In [None]:
# Drop rows of wine2 with a missing variety, country, province, points or price.
# points and price are passed to the model for prediction and evaluation, so
# neither may contain NaN.
wine2 = wine2.dropna(subset=["variety", "country", "province", "points", "price"])
# Re-print the per-column missing counts to confirm those columns are now clean
print(wine2.isna().sum().sort_values())

In [None]:
# Pull the feature (points) and the response (price) out of wine2 as arrays
X2_points, y2_price = wine2["points"].values, wine2["price"].values

In [None]:
# Show the Python type of the wine2 feature array and response array
print(*(type(arr) for arr in (X2_points, y2_price)))

In [None]:
# scikit-learn needs the features as a 2-dimensional array: reshape X to a
# single column and report the resulting shapes of X and y
X2_points = np.reshape(X2_points, (-1, 1))
print(np.shape(X2_points), np.shape(y2_price))

In [None]:
# Predict wine2 prices from wine2 points with the fitted model
y2_predict = model.predict(X=X2_points)

In [None]:
# Overlay the model's predicted prices (purple line) on the actual wine2
# prices (blue dots)
fig, ax = plt.subplots()
ax.scatter(X2_points, y2_price, color="blue")
ax.plot(X2_points, y2_predict, color="purple")
ax.set_ylabel("Price")
ax.set_xlabel("Points")
ax.set_ylim(0, 1000)  # y-axis limited to 0-1000; anything above that is not drawn
plt.show()

In [None]:
# Evaluate model performance
from sklearn.metrics import mean_squared_error

In [None]:
# R-squared of the model, scored on the same wine1 arrays it was fitted on
rsquared = model.score(X=X1_points, y=y1_price)
print("rsquared: ", rsquared)

# R-squared: the proportion of the variance in the response variable that is predictable from the explanatory variable
# r-squared is 0.211
# About 21% of the variance in price is explained by points
# Similar to what we got using statsmodels

In [None]:
# Calculate RMSE of the predicted wine2 prices against the actual wine2 prices.
# The square root of the MSE is taken explicitly: the `squared=False` argument
# of mean_squared_error was deprecated in scikit-learn 1.4 and later removed,
# so relying on it fails on current releases.
rmse = np.sqrt(mean_squared_error(y2_price, y2_predict))
print("rmse: ",rmse)

# RMSE: measure of the typical size of the residuals, i.e. how much the predictions are typically wrong
# rmse of 37.3 means that the difference between the predicted price and the actual price is typically about 37.3
# The model's typical prediction error is around $37 per wine (the error is in price units, not per point)
# Close to what we have calculated in my previous video