In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Use seaborn's 'whitegrid' style for the figures drawn in this notebook.
sns.set_style('whitegrid')
# IPython magic: render matplotlib figures inline in the cell output.
%matplotlib inline

In [3]:
# Read the Boston housing data from the input directory (path is relative to the notebook).
boston_csv_path = '../input/boston.csv'
boston = pd.read_csv(boston_csv_path)

In [4]:
# Preview the first rows of the loaded DataFrame.
boston.head()

In [5]:
# For a DataFrame, keys() returns the column labels.
boston.keys()

In [6]:
# Print a concise summary of the DataFrame: column dtypes, non-null counts and memory usage.
boston.info()

In [7]:
boston.describe()  # Summary statistics (count, mean, std, quartiles, ...) for the numerical columns

In [8]:
# Pairwise scatter plots (with per-variable distributions on the diagonal) for a
# handful of features plus the target, MEDV.
sns.set_palette("GnBu_d")
sns.set_style('whitegrid')
# seaborn renamed `size` to `height`; the old name is deprecated and only
# accepted with a warning, so use the current keyword.
sns.pairplot(boston, height=2, vars=['CRIM', 'NOX', 'RAD', 'DIS', 'MEDV'])

From the pairplot, we can see that MEDV, i.e. the median value of owner-occupied homes, is low in areas where the crime rate (CRIM) is high and where the nitric oxides concentration (NOX) is high.

In [9]:
# Let's check the distribution of the MEDV values.
# `distplot` is deprecated in seaborn; `histplot` with a KDE overlay and a
# density-normalised y-axis draws the same histogram + density curve.
sns.histplot(boston['MEDV'], bins=20, kde=True, stat='density')

In [10]:
# Heatmap of the pairwise correlations between the numeric columns.
# plt.subplots returns a (figure, axes) pair; unpack it and draw on that axes explicitly.
fig, ax = plt.subplots(figsize=(14, 6))
# Restrict to numeric columns before correlating: since pandas 2.0,
# DataFrame.corr() raises on non-numeric (e.g. text) columns instead of
# silently dropping them. NOTE(review): confirm whether the CSV has any.
corr_matrix = boston.select_dtypes(include='number').corr()
# `linewidths` is heatmap's documented parameter for the cell separators; the
# matplotlib alias `lw` is only forwarded through **kwargs.
sns.heatmap(corr_matrix, cmap='magma', linecolor='white', linewidths=1, ax=ax)

In [11]:
# Pairwise correlation table. Only numeric columns are correlated: since
# pandas 2.0, DataFrame.corr() raises on non-numeric columns rather than
# dropping them silently.
boston.select_dtypes(include='number').corr()

From the correlation table and heatmap we can see that a few variables are highly correlated, such as RAD and TAX, with a correlation value of 0.91. RAD is also strongly negatively correlated with TRACT, and so is TAX. Let's plot them and see.

In [12]:
# RAD against TAX: scatter plot with the marginal distribution of each variable.
sns.jointplot(data=boston, x='RAD', y='TAX', kind='scatter')

In [13]:
# RAD against TRACT: scatter plot with the marginal distribution of each variable.
sns.jointplot(data=boston, x='RAD', y='TRACT', kind='scatter')

In [14]:
# TAX against TRACT: scatter plot with the marginal distribution of each variable.
sns.jointplot(data=boston, x='TAX', y='TRACT', kind='scatter')

Considering the three plots above, we could keep only one of RAD and TAX as a feature; we will revisit this when we train and test our model.

In [15]:
# Training the linear model: start by listing the column labels of the DataFrame.
boston.columns

In [17]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

# Feature matrix (selected predictor columns) and target vector (MEDV).
X = boston[['CRIM', 'ZN', 'INDUS',
       'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'PTRATIO']]
y = boston['MEDV']

# Hold out 30% of the rows for testing. The split is random, so pin
# random_state: otherwise the fitted coefficients and every metric computed
# from this split change on each run and cannot be reproduced.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42)

# Fit an ordinary least-squares linear regression on the training portion.
lm = LinearRegression()
lm.fit(X_train, y_train)

In [18]:
# Show the intercept term of the fitted linear model.
model_intercept = lm.intercept_
print(model_intercept)

In [19]:
# One row per feature, holding the coefficient the fitted model assigned to it.
coeff_df = pd.DataFrame({'Coefficient': lm.coef_}, index=X.columns)
coeff_df

From the coefficients, we can see that, holding the other features constant, every one-unit increase in nitric oxides concentration (NOX) is associated with a decrease of approximately $1,700 in MEDV. This could be one reason for the low prices in those regions.

In [20]:
# Predict MEDV for the held-out test rows.
Predictions = lm.predict(X_test)
# Let's check through a scatter plot how they are aligned: actual values on the
# x-axis, predicted values on the y-axis. The axes are labelled so the figure
# can be read on its own.
fig, ax = plt.subplots()
ax.scatter(y_test, Predictions)
ax.set(xlabel='Actual MEDV', ylabel='Predicted MEDV',
       title='Predicted vs. actual MEDV (test set)');

In [21]:
# Residual histogram: distribution of (actual - predicted) on the test set.
# `distplot` is deprecated in seaborn; `histplot` with a KDE overlay and a
# density-normalised y-axis draws the same histogram + density curve.
sns.histplot(y_test - Predictions, bins=50, kde=True, stat='density')

In [22]:
# Regression evaluation metrics on the held-out test set.
from sklearn import metrics

mae_value = metrics.mean_absolute_error(y_test, Predictions)
mse_value = metrics.mean_squared_error(y_test, Predictions)
rmse_value = np.sqrt(mse_value)  # RMSE is the square root of the MSE

for metric_label, metric_value in (('MAE:', mae_value),
                                   ('MSE:', mse_value),
                                   ('RMSE:', rmse_value)):
    print(metric_label, metric_value)