Example of different data with the same mean, median, and mode

In [1]:
import matplotlib.pyplot as plt
import pandas as pd 
import numpy as np 
import os 
import plotly.express as ps

In [2]:
# Build the path to housing.csv inside the current working directory, then
# show the resolved path as the cell output.
pwd = os.getcwd()
filepath = os.path.join(pwd, "housing.csv")
filepath

'c:\\Users\\Admin\\Videos\\work\\Portfolio\\8.Data_pipeline\\4.Data_modeling\\1.Chapter_2\\housing.csv'

In [3]:
# Load the housing CSV located at `filepath` into a DataFrame.
housing_data = pd.read_csv(filepath)

In [4]:
# Bin median_income into five labelled buckets (1-5). The last bin edge is
# infinity, so every income above 6.0 falls into category 5.
housing_data['income_category'] = pd.cut(
    housing_data['median_income'],
    bins=[0., 1.5, 3.0, 4.5, 6., np.inf],
    labels=[1, 2, 3, 4, 5],
)

In [5]:
# Separate the regression target (median_house_value) from the other columns.
y = housing_data["median_house_value"]
X = housing_data.drop(columns=["median_house_value"])

In [6]:
from sklearn.model_selection import train_test_split

In [7]:
# Purely random 67/33 train/test split with a fixed seed.
# NOTE(review): X_train / X_test / y_train / y_test are not referenced again in
# the visible part of this notebook (the models are trained on the stratified
# split) - confirm whether this cell is still needed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [8]:
from sklearn.model_selection import StratifiedShuffleSplit

In [9]:
# Single 80/20 shuffle split, stratified on income_category, with a fixed seed.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
# split() yields positional row numbers, so rows are selected with .iloc.
# .loc would treat the numbers as index labels, which is only equivalent while
# the frame still has its default 0..n-1 index. With n_splits=1 there is exactly
# one (train, test) pair, so next() replaces the one-iteration for loop.
train_index, test_index = next(split.split(housing_data, housing_data["income_category"]))
strat_train_set = housing_data.iloc[train_index]
strat_test_set = housing_data.iloc[test_index]

In [10]:
# Target and feature columns of the stratified training split. The labels are
# copied so they are a separate object from strat_train_set.
housing_labels = strat_train_set["median_house_value"].copy()
housing = strat_train_set.drop(columns=["median_house_value"])

In [11]:
def feature_engineering(data):
    """Return a new DataFrame with three per-household ratio columns added.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns ``total_bedrooms``, ``population``,
        ``total_rooms`` and ``households``.

    Returns
    -------
    pandas.DataFrame
        All columns of ``data`` plus ``bedrooms_per_household``,
        ``population_per_household`` and ``rooms_per_household`` (in that
        order). The input frame itself is not modified.
    """
    households = data["households"]
    # assign() builds a new frame instead of writing columns into `data`, so the
    # caller's DataFrame (e.g. a train/test slice) is left untouched.
    return data.assign(
        bedrooms_per_household=data["total_bedrooms"] / households,
        population_per_household=data["population"] / households,
        rooms_per_household=data["total_rooms"] / households,
    )

In [12]:
def data_transformations(data):
    """Turn a housing DataFrame into a model-ready numeric array.

    Steps: split off the ``median_house_value`` label column if present, add
    the per-household ratio features via ``feature_engineering``, median-impute
    and standardise the numeric columns, and one-hot encode every non-numeric
    column.

    Parameters
    ----------
    data : pandas.DataFrame
        Housing rows containing the columns ``feature_engineering`` needs.
        The frame passed in is not modified.

    Returns
    -------
    output : numpy.ndarray
        Scaled numeric columns followed by the one-hot encoded columns.
    labels : pandas.Series or None
        The ``median_house_value`` column, or ``None`` when it is absent.
    features : list of str
        One name per column of ``output``, in the same order. Encoded columns
        are named ``"<column>_<category>"``.

    Notes
    -----
    Every call fits a new imputer, scaler and encoder on the frame it is
    given, so two calls (e.g. on a training and a test split) each use their
    own statistics and category sets.
    """
    from sklearn.impute import SimpleImputer
    from sklearn.preprocessing import OneHotEncoder, StandardScaler

    ### Separate Labels if they Exist ###
    labels = data["median_house_value"] if "median_house_value" in data.columns else None
    # errors="ignore" makes drop() return a fresh frame whether or not the label
    # column exists, so nothing below can write into the caller's DataFrame.
    data = data.drop(columns=["median_house_value"], errors="ignore")

    ### Feature Engineering ###
    feature_engineered_data = feature_engineering(data)

    housing_num = feature_engineered_data.select_dtypes(include=[np.number])
    housing_cat = feature_engineered_data.select_dtypes(exclude=[np.number])

    ### Imputing Data ###
    imputed = SimpleImputer(strategy="median").fit_transform(housing_num)

    ### Scaling Numerical Data ###
    housing_scaled = StandardScaler().fit_transform(imputed)

    # Feature names follow the output layout: numeric columns first, then the
    # encoded categorical columns. Non-numeric columns are never listed under
    # their raw name because they do not appear un-encoded in the output.
    features = list(housing_num.columns)

    if housing_cat.shape[1] == 0:
        # Nothing to encode: the scaled numeric block is the whole output.
        return housing_scaled, labels, features

    ### Encoding Categorical Data ###
    try:
        # Newer scikit-learn releases name this argument `sparse_output`.
        cat_encoder = OneHotEncoder(sparse_output=False)
    except TypeError:
        # Older releases only know the original `sparse` spelling.
        cat_encoder = OneHotEncoder(sparse=False)
    housing_cat_1hot = cat_encoder.fit_transform(housing_cat)

    # categories_ holds one array per encoded column, in column order; name every
    # one-hot column so the list stays aligned with the array columns.
    for column, categories in zip(housing_cat.columns, cat_encoder.categories_):
        features.extend(f"{column}_{category}" for category in categories)

    ### Concatenating all Data ###
    output = np.hstack([housing_scaled, housing_cat_1hot])

    return output, labels, features

In [13]:
# Preprocess the stratified training split: returns the feature array, the
# median_house_value labels and the list of feature names.
train_data, train_labels, features= data_transformations(strat_train_set)



In [14]:
# Preprocess the stratified test split with the same function.
# NOTE(review): data_transformations fits its imputer, scaler and encoder on
# whatever frame it receives, so the test split is transformed with statistics
# learned from the test split itself rather than from the training split.
# This call also overwrites the `features` list returned for the training data.
test_data, test_labels, features = data_transformations(strat_test_set)



Linear Regression

In [15]:
from sklearn.linear_model import LinearRegression

In [16]:
# Fit a linear regression on the preprocessed training data.
lin_reg = LinearRegression()
lin_reg.fit(train_data, train_labels)

In [17]:
# Spot-check the linear model on the first five rows of the test split.
original_values = test_labels[:5]
Predicted_values = lin_reg.predict(test_data[:5])

In [18]:
# Put the actual and predicted house values side by side for the sampled rows.
comparision_dataframe = pd.DataFrame(
    {
        "original_values": original_values,
        "Predicted_values": Predicted_values,
    }
)

In [19]:
# Display actual vs. predicted values.
comparision_dataframe

Unnamed: 0,original_values,Predicted_values
5241,500001.0,430593.091392
17352,162500.0,285374.202481
3505,204600.0,232712.80035
7777,159700.0,194717.728868
14155,184000.0,248240.731


In [20]:
# Residual per row (actual minus predicted): a positive value means the model
# predicted less than the actual house value.
comparision_dataframe["differences"] = comparision_dataframe["original_values"] - comparision_dataframe["Predicted_values"]

In [21]:
# Display the table again, now including the residual column.
comparision_dataframe

Unnamed: 0,original_values,Predicted_values,differences
5241,500001.0,430593.091392,69407.908608
17352,162500.0,285374.202481,-122874.202481
3505,204600.0,232712.80035,-28112.80035
7777,159700.0,194717.728868,-35017.728868
14155,184000.0,248240.731,-64240.731


Checking the error: root mean squared error (RMSE)

In [22]:
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error


In [23]:
# Mean squared error of the linear model and its square root (RMSE).
# NOTE(review): both are computed on the five spot-check rows only
# (original_values / Predicted_values), not on the whole test split.
lin_mes= mean_squared_error(original_values, Predicted_values)
lin_rms = np.sqrt(lin_mes)
lin_mes

5211793962.13275

In [24]:
# RMSE of the linear model on the five spot-check rows.
lin_rms

72192.75560700499

In [25]:
# Mean absolute error of the linear model on the same five spot-check rows.
lin_mae = mean_absolute_error(original_values, Predicted_values)
lin_mae

63930.674261317545

Decision Tree

In [26]:
from sklearn.tree import DecisionTreeRegressor

In [27]:
# Fit a decision tree regressor on the same training data, with a fixed
# random_state.
tree_reg = DecisionTreeRegressor(random_state=42)
tree_reg.fit(train_data, train_labels)
# Probabilistic model and deterministic model

In [28]:
# RMSE of the tree on the data it was trained on. A result of 0.0 means the
# tree reproduces every training label exactly, so this figure says nothing
# about performance on unseen data.
train_predictions = tree_reg.predict(train_data)
tree_mse = mean_squared_error(train_labels, train_predictions)
tree_rmse =np.sqrt(tree_mse)
tree_rmse

0.0

In [29]:
from sklearn.model_selection import cross_val_score

In [30]:
# 10-fold cross-validation of the decision tree on the training data. The
# scorer returns negated MSE values, so flip the sign before taking the root.
scores = cross_val_score(
    tree_reg,
    train_data,
    train_labels,
    scoring="neg_mean_squared_error",
    cv=10,
)
tree_rmse_score = np.sqrt(-scores)

In [31]:
def display_scores(scores):
    """Print the scores, then their mean and standard deviation, one per line.

    ``scores`` must provide ``mean()`` and ``std()`` methods (for example a
    NumPy array). Nothing is returned.
    """
    print("Score", scores)
    for label, statistic in (("Mean", scores.mean), ("Standard Deviation", scores.std)):
        print(label, statistic())
# Summarise the decision tree's per-fold cross-validated RMSE.
display_scores(tree_rmse_score)

Score [70824.86460222 69265.43911105 70860.89128052 73143.6242732
 68189.79683417 74845.58694955 75712.26299821 70925.68914136
 69652.74404875 72834.24516312]
Mean 71625.51444021543
Standard Deviation 2319.858754337931


In [32]:
from sklearn.ensemble import RandomForestRegressor

In [33]:
# Fit a random forest of 100 trees on the training data, with a fixed
# random_state.
forest_reg = RandomForestRegressor(n_estimators=100, random_state=42)
forest_reg.fit(train_data, train_labels)

In [34]:
# RMSE of the random forest on its own training data.
# Note: this reuses the name train_predictions, replacing the decision tree's
# training predictions stored earlier.
train_predictions = forest_reg.predict(train_data)
forest_mse = mean_squared_error(train_labels, train_predictions)
forest_rmse = np.sqrt(forest_mse)
forest_rmse

18796.98816862067

The random forest looks like a better model; confirm with cross-validation.

In [35]:
# 10-fold cross-validation of the random forest. Scores come back as negated
# MSE, so negate them before taking the square root to get per-fold RMSE.
forest_scores = cross_val_score(
    forest_reg,
    train_data,
    train_labels,
    scoring="neg_mean_squared_error",
    cv=10,
)
forest_rmse_score = np.sqrt(-forest_scores)
display_scores(forest_rmse_score)

Score [51625.50923449 49595.77026479 46903.21986648 52054.44580805
 48038.01373657 51150.37967136 53115.15634704 50223.15213256
 48384.96672942 54583.30969038]
Mean 50567.3923481122
Standard Deviation 2279.0119694027167


In [36]:
# 10-fold cross-validation of the linear regression, summarised with
# describe() on the per-fold RMSE values.
line_scores = cross_val_score(
    lin_reg,
    train_data,
    train_labels,
    scoring="neg_mean_squared_error",
    cv=10,
)
pd.Series(np.sqrt(-line_scores)).describe()

count       10.000000
mean     68414.805622
std       2279.550831
min      64628.721920
25%      66812.103554
50%      68717.588588
75%      69608.531770
max      71719.029644
dtype: float64

Fine-tuning the model: tuning hyperparameters
One way to do this is grid search.