In [None]:
import numpy as np 
import pandas as pd
import seaborn as sns
import math
import matplotlib.pyplot as plt
import plotly.offline as py
import plotly.express as px
from pandas.plotting import scatter_matrix
sns.set(style='darkgrid')
from sklearn.preprocessing import FunctionTransformer
from sklearn.compose import ColumnTransformer

In [None]:
# Read the housing CSV from the Kaggle input directory.
data = pd.read_csv(r'/kaggle/input/california-housing-prices/housing.csv')
# Explore a copy so `data` keeps the rows exactly as loaded.
df = data.copy()
df.head()

# Exploratory Data Analysis

In [None]:
df.info();

* all columns are numbers except *ocean_proximity*
* total_bedrooms has null values 
* no of columns = 10
* no of rows = 20640

In [None]:
def data_info(data):
    """Summarise every column of ``data`` in a single table.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``data`` holding the column name, its dtype,
        its number of nulls, its number of distinct non-null values, and the
        number of fully duplicated rows in ``data`` (a frame-level figure that
        is repeated on every row).
    """
    # The duplicated-row count does not depend on the column, so compute it
    # once instead of rescanning the whole frame for every column.
    duplicated_rows = data.duplicated().sum()

    cols, dtype, nulls, uniques = [], [], [], []
    for col in data.columns:
        cols.append(col)
        dtype.append(data[col].dtype)
        nulls.append(data[col].isnull().sum())
        uniques.append(data[col].nunique())

    return pd.DataFrame({
        'Column': cols,
        'DType': dtype,
        'no of Nulls': nulls,
        'no of Uniques': uniques,
        'Duplicated rows': [duplicated_rows] * len(cols),
    })

In [None]:
data_info(df)

In [None]:
df.describe().T

In [None]:
df.skew(numeric_only=True)

In [None]:
df[df.columns[df.dtypes=='float64']].skew().sort_values(ascending=False)


* right skewed cols : population, total_rooms, total_bedrooms, households, median_income, median_house_value
* no left skewed cols

In [None]:
# A notebook cell only displays its last expression, so the class count is
# printed explicitly; as a bare first line its value was silently discarded.
print(f"Number of classes: {df['ocean_proximity'].nunique()}")
df.groupby(df['ocean_proximity']).count()



* 5 classes in the categorical column
* ISLAND class has the min no of entries and <1H OCEAN has the max 

# Data Visualization

In [None]:
sns.countplot(x= df['ocean_proximity'], palette= 'mako');

* The majority class is '<1H OCEAN'
* The minority class is 'ISLAND'

In [None]:
# Take wedge sizes and labels from the same value_counts() result.
# value_counts() is sorted by frequency while unique() follows order of first
# appearance, so pairing the two attaches the wrong label to each wedge.
proximity_share = df['ocean_proximity'].value_counts(normalize=True)
plt.pie(proximity_share, autopct='%1.0f%%', labels=proximity_share.index);

In [None]:
# DataFrame.hist() opens its own figure, so the size has to be passed to it;
# a separate plt.figure() call only leaves an empty figure behind.
df.hist(figsize=(20, 10))
plt.tight_layout();

* The histograms confirm the skewness of the columns noted above

In [None]:
def columns_histplot(data):
    """Plot a histogram with a KDE curve for every column of ``data``.

    Panels are arranged three per row. Up to nine columns use a 3x3 grid on a
    15x10 figure; beyond nine columns extra rows are added (and the figure
    height grows with them) so that no panel index falls outside the grid.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose columns are plotted; each column is handed to
        ``sns.histplot``.
    """
    panels_per_row = 3
    n_columns = len(data.columns)
    # Never shrink below three rows so the layout for <= 9 columns is unchanged.
    n_rows = max(3, math.ceil(n_columns / panels_per_row))

    # Height scales with the row count; three rows give the original 15x10.
    plt.figure(figsize=(15, 10 * n_rows / 3))
    for position, column in enumerate(data.columns, start=1):
        plt.subplot(n_rows, panels_per_row, position)
        sns.histplot(data[column], bins=10, kde=True)
        plt.title(f'HistPlot of {column}', fontsize=14, color='darkblue')
        plt.xticks(rotation=45)
        plt.ylabel('Frequency')

    plt.tight_layout()
    plt.show()

In [None]:
df_numerical = df.select_dtypes(include=('number'))
columns_histplot(df_numerical)

In [None]:
def columns_boxplot(data):
    """Draw one horizontal box plot per column of ``data``, stacked vertically.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose columns are plotted; each column gets its own panel in a
        single 20x30 figure.
    """
    column_names = data.columns
    n_panels = len(column_names)

    plt.figure(figsize=(20,30))
    for position, name in enumerate(column_names, start=1):
        # One panel per column, laid out top to bottom.
        plt.subplot(n_panels, 1, position)
        sns.boxplot(x=data[name])
        plt.title(f'BoxPlot of {name}', fontsize=22, color ='darkblue')

    plt.tight_layout()
    plt.show()
        

In [None]:
columns_boxplot(df_numerical)

There are outliers in : 
1. total_rooms
1. total_bedrooms
1. population
1. households
1. median_income
1. median_house_value

In [None]:
def columns_barplot(df, data, category):
    """Draw a bar plot of every column of ``data`` against one categorical column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that provides the ``category`` column used on the x axis.
    data : pandas.DataFrame
        Frame whose columns are plotted on the y axis, one panel per column.
    category : str
        Name of the column of ``df`` to group the bars by.
    """
    value_columns = data.columns
    n_panels = len(value_columns)

    plt.figure(figsize=(10,30))
    for position, name in enumerate(value_columns, start=1):
        # One panel per value column, laid out top to bottom.
        plt.subplot(n_panels, 1, position)
        sns.barplot(x= df[category], y=data[name])
        plt.title(f'BarPlot of {name} related to the {category}', fontsize=14, color ='darkblue')

    plt.tight_layout()
    plt.show()

In [None]:
columns_barplot(df, df_numerical, 'ocean_proximity')

In [None]:
corr = df_numerical.corr()
sns.heatmap(corr, annot = True,cmap='rocket',linewidths=2 );

* total_rooms is highly correlated with total_bedrooms, population, households
* median_income is moderately correlated with median_house_value
* longitude has strong negative correlation with latitude

In [None]:
nullmap = df.isnull()
sns.heatmap(nullmap, annot=False, cbar= False, yticklabels= False);

# Data Preprocessing and cleaning

In [None]:
df = data.copy()

****Fill Nulls****

In [None]:
df.isnull().sum()

In [None]:
def columns_fillna(data):
    """Fill missing values in ``data`` in place and return the same frame.

    Numeric columns are filled with their median and object columns with
    their most frequent value (the first mode). Columns of any other dtype
    are left untouched, as is an object column with no non-null value at all,
    because it has no mode to fill with.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to fill; it is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, after filling.
    """
    # Classify the columns once; filling does not change which group a column
    # belongs to, so there is no need to re-select dtypes on every iteration.
    numeric_cols = set(data.select_dtypes(include=['number']).columns)
    object_cols = set(data.select_dtypes(include=['object']).columns)

    for col in data.columns:
        if col in numeric_cols:
            data[col] = data[col].fillna(data[col].median())
        elif col in object_cols:
            modes = data[col].mode()
            # mode() is empty for an all-null column; indexing it would raise.
            if not modes.empty:
                data[col] = data[col].fillna(modes.iloc[0])

    return data

In [None]:
df = columns_fillna(df)
df.isnull().sum()

****Detect Outliers****

In [None]:
def columns_outlier(data, iqr_factor=1.5):
    """Return a copy of ``data`` without rows that are IQR outliers.

    Numeric columns are processed in column order. For each one, rows whose
    value lies below ``Q1 - iqr_factor * IQR`` or above
    ``Q3 + iqr_factor * IQR`` are removed, and the quartiles of the next
    column are computed on the rows that are still left. Non-numeric columns
    are never used for filtering. Missing values are not treated as outliers.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to filter; it is not modified.
    iqr_factor : float, default 1.5
        Multiple of the inter-quartile range that defines the fences.

    Returns
    -------
    pandas.DataFrame
        The surviving rows with a fresh 0..n-1 index.
    """
    numeric_cols = data.select_dtypes(include=['number']).columns

    kept = data
    for col in numeric_cols:
        q1, q3 = kept[col].quantile([0.25, 0.75])
        iqr = q3 - q1
        lower = q1 - iqr_factor * iqr
        upper = q3 + iqr_factor * iqr
        outlier = (kept[col] < lower) | (kept[col] > upper)
        # Boolean masking removes exactly the flagged rows and always yields a
        # new frame, so the caller's frame and index are never touched.
        kept = kept[~outlier]

    return kept.reset_index(drop=True)
columns_outlier(df).info()

In [None]:
df.info()

In [None]:
df = columns_outlier(df)
df.info()

****Skewness****

In [None]:
numeric_features = df.select_dtypes(include=[np.number])
df_skew =numeric_features.skew().sort_values(ascending=False)
df_skew

There are no left-skewed columns,
but there are 3 right-skewed columns.

In [None]:
right_skewed = df_skew[df_skew>0.50]
right_skewed

In [None]:
def logTrans(data,features):
    """Log-transform the given columns of ``data`` in place and return it.

    For every feature the function plots the distribution before and after
    applying ``np.log1p``, prints the skewness before and after, and then
    overwrites the column with the transformed values.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the columns to transform; it is modified in place.
    features : iterable of str
        Names of the columns of ``data`` to transform.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, with the listed columns transformed.
    """
    for feature in features:
        # np.log1p on the Series keeps the frame's own index, so writing the
        # result back cannot misalign rows when the index is not 0..n-1.
        transformed = np.log1p(data[feature])

        plt.figure(figsize=(15,6))
        plt.subplot(1,2,1)
        plt.title(f"Distribution of {feature} before Transformation", fontsize=15)
        sns.histplot(data[feature], kde=True, color="red")

        plt.subplot(1,2,2)
        plt.title(f"Distribution of {feature} after Transformation", fontsize=15)
        sns.histplot(transformed, bins=20, kde=True, legend=False)
        plt.xlabel(feature)
        plt.show()

        print(f"Skewness was {round(data[feature].skew(),2)} before & is {round(transformed.skew(),2)} after Log transformation.")
        data[feature] = transformed
    return data


In [None]:
df = logTrans(df,right_skewed.index)

In [None]:
df.head()

In [None]:
# Recompute from the transformed frame: `numeric_features` was extracted
# before the log transform, so it would still report the old skewness.
df.select_dtypes(include=[np.number]).skew()