In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats

%matplotlib qt5
sns.set_style('whitegrid')
sns.set_context('paper')
plt.rcParams["axes.edgecolor"] = "black"
plt.rcParams["axes.linewidth"] = 0.5

In [2]:
from typing import Optional
import numbers

def auto_opt_pd_dtypes(df_: pd.DataFrame, inplace=False) -> Optional[pd.DataFrame]:
    """ Automatically downcast Number dtypes for minimal possible,
        will not touch other (datetime, str, object, etc)
        :param df_: dataframe
        :param inplace: if False, will return a copy of input dataset
        :return: `None` if `inplace=True` or dataframe if `inplace=False`
    """
    df_temp = df_ if inplace else df_.copy()
    print(df_temp.info())

    for col in df_temp.columns:
        # integers
        if issubclass(df_temp[col].dtypes.type, numbers.Integral):
            # unsigned integers
            if df_temp[col].min() >= 0:
                df_temp[col] = pd.to_numeric(df_temp[col], downcast='unsigned')
            # signed integers
            else:
                df_temp[col] = pd.to_numeric(df_temp[col], downcast='integer')
        # other real numbers
        elif issubclass(df_temp[col].dtypes.type, numbers.Real):
            df_temp[col] = pd.to_numeric(df_temp[col], downcast='float')

        elif issubclass(df_temp[col].dtypes.type, np.object_):
            df_temp[col] = pd.Categorical(df_temp[col])

    print(df_temp.info())
    if not inplace:
        return df_temp

In [3]:
df = pd.read_csv('sample_data/california_housing_train.csv', header='infer', delimiter=',')
df = auto_opt_pd_dtypes(df)

df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  float64
 3   total_rooms         20640 non-null  float64
 4   total_bedrooms      20433 non-null  float64
 5   population          20640 non-null  float64
 6   households          20640 non-null  float64
 7   median_income       20640 non-null  float64
 8   median_house_value  20640 non-null  float64
 9   ocean_proximity     20640 non-null  object 
dtypes: float64(9), object(1)
memory usage: 1.6+ MB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype   
---  ------              --------------  -----   
 0   longitude           20640 non-nu

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.230003,37.880001,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.220001,37.860001,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.239998,37.849998,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.849998,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.849998,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [4]:
df.describe()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
count,20640.0,20640.0,20640.0,20640.0,20433.0,20640.0,20640.0,20640.0,20640.0
mean,-119.56971,35.631863,28.639486,2635.763184,537.870544,1425.476685,499.539673,3.870671,206855.8125
std,2.003532,2.135952,12.585557,2181.615234,421.385071,1132.462158,382.329773,1.899822,115395.617188
min,-124.349998,32.540001,1.0,2.0,1.0,3.0,1.0,0.4999,14999.0
25%,-121.800003,33.93,18.0,1447.75,296.0,787.0,280.0,2.5634,119600.0
50%,-118.489998,34.259998,29.0,2127.0,435.0,1166.0,409.0,3.5348,179700.0
75%,-118.010002,37.709999,37.0,3148.0,647.0,1725.0,605.0,4.74325,264725.0
max,-114.309998,41.950001,52.0,39320.0,6445.0,35682.0,6082.0,15.0001,500001.0


In [5]:
fig, axs = plt.subplots(ncols=5, nrows=2, figsize=(15, 8))

index = 0
axs = axs.flatten()
for k,v in df.items():
    sns.boxplot(y=k, data=df, ax=axs[index])
    index += 1
plt.tight_layout()
plt.show()

In [6]:
df_num = df.iloc[:, :-1]
for k,v in df_num.items():
    q1 = v.quantile(0.25)
    q3 = v.quantile(0.75)
    iqr = q3 - q1
    v_outliers = v[(v <= q1 - 1.5 * iqr) | (v >= q3 + 1.5 * iqr)]
    print(f'outliers in {k}: {len(v_outliers)/len(v)*100:.2f}%')

outliers in longitude: 0.00%
outliers in latitude: 0.00%
outliers in housing_median_age: 0.00%
outliers in total_rooms: 6.24%
outliers in total_bedrooms: 6.16%
outliers in population: 5.80%
outliers in households: 5.91%
outliers in median_income: 3.30%
outliers in median_house_value: 5.19%


In [7]:
df = df[df['median_house_value'] <= 300_000]

In [8]:
fig, axs = plt.subplots(ncols=5, nrows=2, figsize=(15, 8))

index = 0
axs = axs.flatten()
for k,v in df.items():
    sns.histplot(x=k, data=df, ax=axs[index], kde=True)
    index += 1
plt.tight_layout()
plt.show()

In [12]:
sns.heatmap(df.iloc[:, :-1].corr(), annot=True)
plt.tight_layout()
plt.show()

In [10]:
from sklearn.decomposition import PCA

num_data = df.iloc[:, :-1]
pca_input = num_data.dropna()
pca = PCA(n_components=2)
pca_data = pca.fit_transform(pca_input)

plt.scatter(x=pca_data[:, 0], y=pca_data[:, 1])
plt.tight_layout()

In [16]:
sns.pairplot(num_data)
plt.tight_layout()
plt.show()

AttributeError: Axes.set() got an unexpected keyword argument 'figsize'

In [26]:
from sklearn.preprocessing import MinMaxScaler

minmaxscaler = MinMaxScaler()
cols_sel = [col for col in num_data.columns if col != 'median_house_value']
x = pd.DataFrame(data=minmaxscaler.fit_transform(num_data[cols_sel]), columns=cols_sel)
y = df['median_house_value']

In [27]:
fig, axs = plt.subplots(ncols=4, nrows=2, figsize=(15, 8))

index = 0
axs = axs.flatten()
for k,v in x.items():
    sns.regplot(x=x[k], y=y, ax=axs[index])
    index += 1
plt.tight_layout()
plt.show()