# <center> Ames Housing Prices

### Import Necessary Modules

In [None]:
import numpy as np 
import pandas as pd 
import seaborn as sns 
from plotnine import *    # https://plotnine.readthedocs.io/en/stable/api.html#geoms
%matplotlib inline 
import math

### Importing Housing Dataset

In [None]:
# URL of the housing dataset (a CSV file hosted on GitHub):
url = "https://github.com/DrSaadLa/MLLabs/raw/main/data/housing.csv"

In [None]:
# Load the CSV found at `url` into the DataFrame `housing`:
housing = pd.read_csv(url)

In [None]:
# Preview the first five rows of the dataset:
housing.head(n=5)

In [None]:
# Preview the last five rows of the dataset:
housing.tail(n=5)

### Explore Dataset
The dataset __housing__ contains :

- 5000 instances (observations)

- 6 quantitative (numerical) variables

- 1 categorical variable: Address

- No Missing Values

- No Duplicated Instances


In [None]:
# Print the DataFrame summary (columns, non-null counts, dtypes) to check for missing values:
housing.info()

In [None]:
# Count the rows that are exact repeats of an earlier row:
duplicate_rows = housing.duplicated()
duplicate_rows.sum()

In [None]:
# Descriptive statistics of the numerical variables, shown with one row per variable:
housing.describe().T

### Delete the variable "Address" 

In [None]:
# Remove the 'Address' column from `housing` in place:
housing.drop(columns='Address', inplace=True)

### Rename The  Variables

In [None]:
# Shorter column names: drop the "Avg." / "Avg. Area" prefixes (old name -> new name).
shorter_column_names = {
    'Avg. Area Income': 'Area Income',
    'Avg. Area House Age': 'House Age',
    'Avg. Area Number of Rooms': 'Number of Rooms',
    'Avg. Area Number of Bedrooms': 'Number of Bedrooms',
}

# Apply the renaming to `housing` in place:
housing.rename(columns=shorter_column_names, inplace=True)

### Shapiro's Test on the target variable : Price 

- **The Shapiro-Wilk test tests the null hypothesis that the data was drawn from a normal distribution.**
- **Since the p-value = 0.95 > 0.05, we do not have enough evidence to reject the hypothesis that the house prices are normally distributed.**

In [None]:
# import necessary module to evaluate the Shapiro's Test 
from scipy import stats   

In [None]:
# Shapiro-Wilk normality test on the target variable:
house_prices = housing['Price']
shapiro_test = stats.shapiro(house_prices)

# Display the test result:
shapiro_test

 ### Histogram of Price

In [None]:
# Histogram of the target variable: 20 bins, red bars with green outlines.
(
    ggplot(housing, aes(x='Price'))
    + geom_histogram(bins=20, fill="red", color="green", alpha=0.75)
    + labs(title="Histogram Of Houses Prices", x="Price", y="Frequency")
    + theme_minimal()
)

### Pearson's Correlation Coefficients Between The Quantitative Variables

- **The linear relationship between the house price and the Number of Bedrooms is negligible (r = 0.17).**

- **There is a weak positive linear relationship between the house price and the Number of Rooms (r = 0.34).**

- **There is a moderate positive linear relationship between the house price and Area Population (r = 0.41), House Age (r = 0.45) and Area Income (r = 0.64).**

In [None]:
# Pairwise Pearson correlation coefficients between the columns of `housing`:
correlation_matrix = housing.corr(method="pearson")

In [None]:
# Heatmap of the correlation matrix, with each coefficient written in its cell:
sns.heatmap(data=correlation_matrix, annot=True)

In [None]:
# Grid of pairwise plots for the variables in `housing`:
sns.pairplot(data=housing)

### Features and Target 

In [None]:
# Features: the five predictor columns (names as set by the renaming step).
feature_columns = [
    'Area Income',
    'House Age',
    'Number of Rooms',
    'Number of Bedrooms',
    'Area Population',
]
X = housing[feature_columns]

# Target: the house price.
y = housing['Price']

### Splitting the Data, Training and Assessing the Model With Different Seeds

In [None]:
from sklearn.model_selection import train_test_split               # splitting data 
from sklearn.linear_model import LinearRegression                  # training model 
from sklearn.metrics import r2_score , mean_squared_error          # assessment model

### Feature Scaling (Min-Max Normalization)

In [None]:
from sklearn.preprocessing import MinMaxScaler

In [None]:
?MinMaxScaler

In [None]:
## Instantiate the scaler object (default settings):
scaler = MinMaxScaler()

In [None]:
## Fitting:
# X_train does not exist yet at this point of the notebook (the first split
# otherwise happens inside the seed loop further down), so create a 70/30
# train/test split here before fitting the scaler.
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, random_state=50)

# Learn the per-feature minimum and maximum from the training features only.
scaler.fit(X_train)

In [None]:
## Transform: overwrite X_train with its scaled version returned by the fitted scaler
X_train = scaler.transform(X_train)

In [None]:
# Alternative: fit_transform() performs the fit and the transform in one call.
# A separate MinMaxScaler instance is used here so that `scaler` is not re-fitted
# on the current X_train; the displayed result is the same as calling
# scaler.fit_transform(X_train).
MinMaxScaler().fit_transform(X_train)

In [None]:
# Sanity check: largest value of X_train (should be 1 after min-max scaling with the default range).
X_train.max()

In [None]:
# Sanity check: smallest value of X_train (should be 0 after min-max scaling with the default range).
X_train.min()

In [None]:
# Create a linear regression object:
mulin = LinearRegression()

In [None]:
# Random seeds for the repeated train/test splits, and the test-set scores
# collected for each of them.
seed = [50, 100, 150, 250, 1000, 2000, 5000, 8000, 12000, 21000]
r2_score_seed = []
RMSE_seed = []

for random_seed in seed:
    # 70% of the observations go to the training set; only the seed changes between runs.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=0.7, random_state=random_seed
    )

    # Create a fresh multiple linear regression model and fit it on the training set
    # (fit() returns the estimator itself).
    mulin = LinearRegression().fit(X_train, y_train)

    # Predict the target on the test set.
    y_hat = mulin.predict(X_test)

    # Model assessment on the test set: R^2 and RMSE (square root of the MSE).
    test_mse = mean_squared_error(y_test, y_hat)
    r2_score_seed.append(r2_score(y_test, y_hat))
    RMSE_seed.append(np.sqrt(test_mse))

# Averages of both metrics over all seeds.
mean_r2_seed = np.array(r2_score_seed).mean()
mean_rmse_seed = np.array(RMSE_seed).mean()

# Show the individual scores followed by their means.
print("The Coefficient of Determination of 10 models are : \n", r2_score_seed)
print("\n \n The Mean of The Coefficient of Determination of 10 models is :", mean_r2_seed)
print("\n \n The Root Mean Square Error of 10 models are : \n", RMSE_seed)
print("\n \n The Mean of The Root Mean Square Error of 10 models is : ", mean_rmse_seed)
    
    

### Splitting the Data, Training and Assessing the Model With Different Train/Test Split Sizes

In [None]:
# Ten evenly spaced training-set fractions from 0.63 to 0.9, and the test-set
# scores collected for each of them.
size = np.linspace(0.63, 0.9, 10)
r2_score_size = []
RMSE_size = []

for train_fraction in size:
    # The seed is fixed; only the share of data used for training changes.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=train_fraction, random_state=4433
    )

    # Create a fresh multiple linear regression model and fit it on the training set
    # (fit() returns the estimator itself).
    mulin = LinearRegression().fit(X_train, y_train)

    # Predict the target on the test set.
    y_hat = mulin.predict(X_test)

    # Model assessment on the test set: R^2 and RMSE (square root of the MSE).
    test_mse = mean_squared_error(y_test, y_hat)
    r2_score_size.append(r2_score(y_test, y_hat))
    RMSE_size.append(np.sqrt(test_mse))

# Averages of both metrics over all split sizes.
mean_r2_size = np.array(r2_score_size).mean()
mean_rmse_size = np.array(RMSE_size).mean()

# Show the individual scores followed by their means.
print("The Coefficient of Determination of 10 models are : \n", r2_score_size)
print("\n \n The Mean of The Coefficient of Determination of 10 models is :", mean_r2_size)
print("\n \n The Root Mean Square Error of 10 models are : \n", RMSE_size)
print("\n \n The Mean of The Root Mean Square Error of 10 models is : ", mean_rmse_size)

### K-fold Cross Validation 

In [None]:
# Import cross_val_score class :
from sklearn.model_selection import cross_val_score    
from sklearn.metrics import make_scorer

In [None]:
# Wrap r2_score in a scorer object that cross_val_score accepts:
r2_score_cv = make_scorer(r2_score)

# 10-fold cross-validation of the linear model on the full data set (one R^2 per fold):
cv_10_fold = cross_val_score(mulin, X, y, cv=10, scoring=r2_score_cv)

# Show the per-fold scores followed by their mean:
mean_r2_cv = cv_10_fold.mean()
print("The Coefficient of Determination of 10 models are : \n", cv_10_fold)
print("\n \n The Mean of The Coefficient of Determination of 10 models is :", mean_r2_cv)

In [None]:
# Scorer built from mean_squared_error: despite its name, RMSE_cv yields the MSE
# of each fold; the square root is taken afterwards to obtain the RMSE.
RMSE_cv = make_scorer(mean_squared_error)

# 10-fold cross-validation of the linear model on the full data set (one MSE per fold):
cv_10_fold = cross_val_score(mulin, X, y, cv=10, scoring=RMSE_cv)

# Convert each fold's MSE to an RMSE, then show the values and their mean:
rmse_per_fold = np.sqrt(np.array(cv_10_fold))
print("The Root Mean Square Error of 10 models are : \n", rmse_per_fold)
print("\n \n The Mean of The Root Mean Square Error of 10 models is : ", rmse_per_fold.mean())


In [None]:
# Preview the first five rows of `housing` in its current state:
housing.head(n=5)