In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats
%matplotlib inline

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import RANSACRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.preprocessing import StandardScaler  
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures

import statsmodels.api as sm
import statsmodels.formula.api as smf

Other example datasets that `sns.load_dataset` can load are listed at:
https://github.com/mwaskom/seaborn-data

In [None]:
# Load the "iris" example dataset through seaborn into a DataFrame.
df = sns.load_dataset("iris")

In [None]:
# Preview the first five rows.
df.head(5)

In [None]:
# List the column labels of the iris frame.
df.columns

In [None]:
# Summary statistics of the iris frame.
df.describe()

In [None]:
# Print the frame summary (column dtypes and non-null counts).
df.info()

In [None]:
# Column means of the measurements. `numeric_only=True` skips the string
# `species` column, which otherwise raises TypeError on pandas >= 2.0.
df.mean(numeric_only=True)

In [None]:
# Column standard deviations of the measurements. `numeric_only=True` skips
# the string `species` column, which otherwise raises TypeError on pandas >= 2.0.
df.std(numeric_only=True)

In [None]:
# Coefficient of variation (std / mean) per numeric column. The string
# `species` column is excluded explicitly; without `numeric_only=True`
# both reductions raise TypeError on pandas >= 2.0.
df.std(numeric_only=True) / df.mean(numeric_only=True)

In [None]:
# Standardised (z-score) sepal lengths.
stats.zscore(df["sepal_length"])

In [None]:
# Pairwise relationships between the measurements, coloured by species.
sns.pairplot(
    data=df,
    hue="species",
    height=2,
    aspect=1,
)

In [None]:
# Histogram of every numeric column, then render the figure.
df.hist()
plt.show()

In [None]:
# Petal width against petal length.
plt.figure(figsize=(10, 8))
plt.scatter(df["petal_length"], df["petal_width"]);

# ML: Linear Regression

In [None]:
# Ordinary least-squares estimator; the intercept term is fitted as well.
LRModel = LinearRegression(fit_intercept=True)

In [None]:
# Display the estimator and its parameters.
LRModel

In [None]:
# RANSAC draws random subsets while fitting, so fix the seed to make the
# inlier mask, coefficients and scores reproducible across runs.
RSCModel = RANSACRegressor(random_state=0)

In [None]:
# Display the RANSAC estimator and its parameters.
RSCModel

In [None]:
# Feature matrix: petal length as a single-column 2-D array.
X = df[["petal_length"]].to_numpy()

In [None]:
# Target vector: petal width as a 1-D array.
y = df["petal_width"].to_numpy()

In [None]:
# Fit petal width on petal length.
LRModel.fit(X, y)

In [None]:
# Fitted slope(s).
LRModel.coef_

In [None]:
# Fitted intercept.
LRModel.intercept_

In [None]:
# 50 evenly spaced points on [0, 8] used to draw the fitted line.
x_fit = np.linspace(0, 8, num=50)

In [None]:
# Column-vector view of the grid, the 2-D shape scikit-learn expects.
X_fit = x_fit[:, np.newaxis]

In [None]:
# Inspect the reshaped grid.
X_fit

In [None]:
# Model predictions over the grid.
y_fit = LRModel.predict(X_fit)

In [None]:
# Inspect the predicted values.
y_fit

In [None]:
# Observed petal measurements with the fitted regression line on top.
plt.figure(figsize=(10, 8))
plt.scatter(df["petal_length"].to_numpy(), df["petal_width"].to_numpy())
plt.plot(x_fit, y_fit);

# Boston Housing Project

In [None]:
# Read the whitespace-separated, headerless housing file.
# `sep=r"\s+"` replaces `delim_whitespace=True`, which is deprecated since pandas 2.2.
bost = pd.read_csv("housing.data", sep=r"\s+", header=None)

In [None]:
# Assign readable names to the 14 columns, in file order.
bost.columns = [
    "CrimePerCapita", "ResidZone", "IndusAcre", "CHARiver", "NOXConcent",
    "RoomAvg", "Age", "Dist_Center", "Axs_HW", "Tax",
    "Educ_ratio", "Blck", "LowStatus", "MedValue",
]

In [None]:
# Preview the first rows of the housing frame.
bost.head()

In [None]:
# Summary statistics of the housing frame.
bost.describe()

In [None]:
# Pairwise plots for a small subset of columns, including MedValue.
pair_columns = ["Educ_ratio", "Blck", "LowStatus", "MedValue"]
sns.pairplot(bost[pair_columns])

In [None]:
# Pairwise correlation matrix of all columns.
bost.corr()

In [None]:
# Annotated heatmap of the full correlation matrix.
plt.figure(figsize=(10, 8))
sns.heatmap(data=bost.corr(), annot=True)

In [None]:
# Single-feature regression of MedValue: ordinary least squares vs. RANSAC.
# `ind` is the positional index of the feature column to use.
# Author's notes on the column indices:
# strong diff : 0 1 7 8 10
# weak diff : 2 5 6 9 11 12
# exclude : 3 4
ind = 5
X = bost.iloc[:, ind].values.reshape(-1, 1)
y = bost.MedValue.values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
LRModel.fit(X_train, y_train)
RSCModel.fit(X_train, y_train)
inliers = RSCModel.inlier_mask_
outliers = np.logical_not(inliers)

# Grid spanning the feature's full range. linspace (rather than arange with
# step 1) still yields a drawable line when the feature's range is below 1.
r_range = np.linspace(bost.iloc[:, ind].min(), bost.iloc[:, ind].max(), 100)
LR_r_y = LRModel.predict(r_range.reshape(-1, 1))
RCS_r_y = RSCModel.predict(r_range.reshape(-1, 1))
LR_train_y = LRModel.predict(X_train)
RSC_train_y = RSCModel.predict(X_train)
LR_test_y = LRModel.predict(X_test)
RSC_test_y = RSCModel.predict(X_test)

print("Coef LR: ", LRModel.coef_)
print("Intercept LR: ", LRModel.intercept_)
print("Coef RSC: ", RSCModel.estimator_.coef_)
print("Intercept RSC: ", RSCModel.estimator_.intercept_)
print("MSE LR Train: ", mean_squared_error(y_train, LR_train_y))
print("MSE RSC Train: ", mean_squared_error(y_train, RSC_train_y))
print("MSE LR Test: ", mean_squared_error(y_test, LR_test_y))
print("MSE RSC Test: ", mean_squared_error(y_test, RSC_test_y))
print("R² LR Train: ", r2_score(y_train, LR_train_y))
print("R² RSC Train: ", r2_score(y_train, RSC_train_y))
print("R² LR Test: ", r2_score(y_test, LR_test_y))
print("R² RSC Test: ", r2_score(y_test, RSC_test_y))

# Training points split by the RANSAC inlier mask, with both fitted lines.
plt.figure(figsize=(12, 10))
plt.scatter(X_train[inliers], y_train[inliers], c="black", marker="*", label="Inliers")
plt.scatter(X_train[outliers], y_train[outliers], c="yellow", marker="*", label="Outliers")
# The original plotted undefined names (RSC_y / LR_y); use the grid predictions.
plt.plot(r_range, RCS_r_y, color="red", label="RANSAC fit")
plt.plot(r_range, LR_r_y, color="blue", label="LR fit")
# x holds the selected feature, y holds the target (labels were swapped before).
plt.xlabel(bost.columns[ind])
plt.ylabel("Medium Price")
plt.legend()


def _plot_residuals(train_pred, train_true, test_pred, test_true, xlabel):
    """Scatter residuals (prediction - target) against predictions for both splits."""
    plt.figure(figsize=(12, 8))
    plt.scatter(train_pred, train_pred - train_true, c='blue', marker='o', label='Training data')
    plt.scatter(test_pred, test_pred - test_true, c='orange', marker='*', label='Test data')
    plt.xlabel(xlabel)
    plt.ylabel('Residuals')
    plt.legend(loc='upper left')
    plt.show()


_plot_residuals(LR_train_y, y_train, LR_test_y, y_test, 'LM Predicted values')
_plot_residuals(RSC_train_y, y_train, RSC_test_y, y_test, 'RSC Predicted values')

In [None]:
# Design matrix: every column except the last, with a constant column added.
bost_features = bost.iloc[:, :-1]
bost_const = sm.add_constant(bost_features)
# Target: the last column, kept 2-D (n, 1).
bost_target = bost.iloc[:, -1:].to_numpy()

In [None]:
# Inspect the target array.
bost_target

In [None]:
# statsmodels OLS on all features (array interface), then show the fit summary.
model = sm.OLS(endog=bost_target, exog=bost_const)
LR = model.fit()
LR.summary()

In [None]:
# Same regression through the statsmodels formula interface.
# NOTE(review): NOXConcent does not appear among the formula terms, unlike the
# array-based fit on all feature columns — confirm the omission is intentional.
fmodel = smf.ols(formula = 'MedValue ~ CrimePerCapita + ResidZone + IndusAcre + CHARiver + RoomAvg + Age + Dist_Center + Axs_HW + Tax + Educ_ratio + Blck + LowStatus',
                data=bost)
LRf = fmodel.fit()
LRf.summary()

In [None]:
# R² of the formula model's predictions on the full housing frame.
r2_score(bost_target, LRf.predict(bost))

In [None]:
# Show floats with four decimals from here on, then display the correlation matrix.
pd.set_option("display.float_format", "{:,.4f}".format)
bost_corr = bost.corr()
bost_corr

In [None]:
# Zero out every correlation whose magnitude is at most 0.7 (in place),
# so that only the strong pairs stand out in the heatmap.
weak_pairs = bost_corr.abs() <= 0.7
bost_corr[weak_pairs] = 0

plt.figure(figsize=(16, 10))
sns.heatmap(bost_corr, annot=True, cmap='YlGnBu')
plt.show()

In [None]:
# Eigen-decomposition of the correlation matrix; list the eigenvalues in ascending order.
corr_matrix = bost.corr()
eigenvalues, eigenvectors = np.linalg.eig(corr_matrix)
pd.Series(eigenvalues).sort_values()

In [None]:
# Absolute components of eigenvector column 8, largest first.
# NOTE(review): the index 8 is hard-coded — presumably picked from the
# eigenvalue listing; verify it still points at the intended eigenvalue.
np.abs(pd.Series(eigenvectors[:,8])).sort_values(ascending=False)

In [None]:
# Names of the columns at positions 2, 8 and 9.
print(bost.columns[2], bost.columns[8], bost.columns[9])

In [None]:
# scikit-learn linear regression on all feature columns (unscaled); show raw coefficients.
# Note: this rebinds `model`, previously the statsmodels OLS object.
model = LinearRegression().fit(bost.iloc[:, :-1], bost_target)
model.coef_

In [None]:
# Pipeline that standardises the features before the linear regression.
# NOTE(review): `model` is passed in as-is, so fitting the pipeline presumably
# refits that same estimator object and overwrites its coefficients — confirm.
scaler = StandardScaler()  
Stand_coef_linear_reg = make_pipeline(scaler, model)

In [None]:
# Fit the scaling + regression pipeline and show the coefficients of its second step.
Stand_coef_linear_reg.fit(bost.iloc[:, :-1], bost_target)
_step_name, regression_step = Stand_coef_linear_reg.steps[1]
regression_step.coef_

In [None]:
# Feature (Dist_Center) and target (NOXConcent) as 1-D arrays.
X_boston = bost['Dist_Center'].to_numpy()
y_boston = bost['NOXConcent'].to_numpy()

In [None]:
# Straight-line fit of NOXConcent on Dist_Center, plotted over the data.
X_boston_col = X_boston.reshape(-1, 1)
lr = LinearRegression()
lr.fit(X_boston_col, y_boston)
model_pred = lr.predict(X_boston_col)

plt.figure(figsize=(12, 8))
plt.scatter(X_boston, y_boston);
plt.plot(X_boston, model_pred);

linear_r2 = r2_score(y_boston, model_pred)
print("R^2 score = {:.2f}".format(linear_r2))


In [None]:
# Cubic polynomial expansion of the single feature, then a linear fit on the expanded terms.
poly_reg = PolynomialFeatures(degree=3)
X_poly_b = poly_reg.fit_transform(X_boston[:, np.newaxis])
lin_reg_2 = LinearRegression()
lin_reg_2.fit(X_poly_b, y_boston)

In [None]:
# Unit-spaced grid from the feature's minimum up to (not including) its maximum, as a column vector.
X_fit = np.arange(X_boston.min(), X_boston.max(), 1).reshape(-1, 1)

In [None]:
# Expand the grid with the transformer that was already fitted on the training
# feature. Use `transform`, not `fit_transform`: prediction inputs must never
# refit a preprocessing step. X_fit is already a column vector, so no reshape.
y_pred = lin_reg_2.predict(poly_reg.transform(X_fit))

In [None]:
# Data with the cubic fit drawn over the grid, plus the R² on the fitted data.
plt.figure(figsize=(10, 8));
plt.scatter(X_boston, y_boston);
plt.plot(X_fit, y_pred);

poly_r2 = r2_score(y_boston, lin_reg_2.predict(X_poly_b))
print("R^2 score = {:.2f}".format(poly_r2))

# Seaborn Visualisation

In [None]:
# Joint distribution of LowStatus and MedValue with a regression fit.
sns.jointplot(
    data=bost,
    x='LowStatus',
    y='MedValue',
    kind='reg',
    height=10,
)

In [None]:
# Regression plot of the most recently selected feature against the target.
# seaborn >= 0.12 accepts x/y by keyword only, and X is a 2-D column
# (built with reshape(-1, 1)), so flatten it to the 1-D vector regplot expects.
sns.regplot(x=X.ravel(), y=y)