Investigation of the statistical relationship between price and binary (two-valued) features that may contribute to the rental and purchase price of a flat.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import numpy as np
from scipy.stats import mannwhitneyu
from scipy.stats import ttest_ind
from scipy.stats import ks_2samp
import cliffs_delta
import re

# For every processed dataset: compare the "price" distribution of rows that
# have a binary feature against rows that do not, save one distribution plot
# per feature, and write two reports per file (raw statistics and a simplified
# 0-1 scoring table with a per-feature total).
path = "./../data/processed/"
# Only CSV files are datasets; sorting makes the processing order deterministic.
files = sorted(f for f in os.listdir(path) if f.lower().endswith(".csv"))
for file in files:
    data = pd.read_csv(path + file)
    print(file)
    column_list = data.columns

    stats_contrast = []
    stats_contrast_group = []  # Easier to compare; table contains from 0 to 1 values

    # Extraction of binary features (columns with exactly two distinct values)
    two_side_columns = [col for col in column_list if data[col].nunique() == 2]

    # Computing Statistical Values
    for col in two_side_columns:
        # Element-wise comparison is intentional: the column may be bool or 0/1.
        feature = data[data[col] == True]["price"]  # noqa: E712
        no_feature = data[data[col] == False]["price"]  # noqa: E712

        # A two-valued column that is not boolean-like (e.g. two strings)
        # yields an empty group; the tests below are meaningless for it.
        if feature.empty or no_feature.empty:
            continue

        # Mann-Whitney U
        stat, p_MWU = mannwhitneyu(feature, no_feature, alternative="two-sided")
        # NaN p-values compare False and therefore score 0.
        e1 = 1 if p_MWU < 0.05 else 0

        # T-test (Welch's variant: unequal variances)
        t_stat, p_ttest = ttest_ind(feature, no_feature, equal_var=False)

        # Same bands as before, checked from the top so a NaN statistic
        # falls through to 0 instead of being scored as the strongest effect.
        abs_t = abs(t_stat)
        if abs_t >= 2.58:
            e21 = 1
        elif abs_t >= 1.96:
            e21 = 0.99
        elif abs_t >= 1:
            e21 = 0.95
        else:
            e21 = 0

        e22 = 1 if p_ttest < 0.05 else 0

        # Kolmogorov-Smirnov Test
        ks_stat, p_ks = ks_2samp(feature, no_feature)
        e3 = 1 if p_ks < 0.05 else 0

        # Cliff's Delta
        delta, res = cliffs_delta.cliffs_delta(feature.to_list(), no_feature.to_list())

        # Magnitude bands 0.147 / 0.33 / 0.474 (the last one was mistyped as
        # 0.0474 in the final branch); a NaN delta scores 0.
        abs_delta = abs(delta)
        if abs_delta >= 0.474:
            e4 = 1
        elif abs_delta >= 0.33:
            e4 = 0.75
        elif abs_delta >= 0.147:
            e4 = 0.50
        else:
            e4 = 0

        stats_contrast.append(
            {
                "feature": col,
                "p-value(MWU)": p_MWU,
                "stat": stat,
                "p-value(t-test)": p_ttest,
                "t_stat": t_stat,
                "p-value(KST)": p_ks,
                "D-statistic": ks_stat,
                "Cliff's Delta": delta,
                "Size Cliff's": res,
            }
        )
        stats_contrast_group.append(
            {
                "feature": col,
                "p-value(MWU)": e1,
                "p-value(t-test)": e22,
                "t_stat": e21,
                "p-value(KST)": e3,
                "D-statistic": ks_stat,
                "Cliff's Delta": e4,
            }
        )

        # Distribution graphs
        # The column is relabelled in place for readable axis ticks; it is not
        # used for statistics again after this point.
        data[col] = data[col].map({False: "No", True: "Yes"})
        plt.figure(figsize=(10, 6))
        sns.violinplot(
            data=data,
            x=col,
            y="price",
            hue=col,
            inner="box",
            palette="coolwarm",
            split=True,
            gap=-0.24,
        )
        sns.stripplot(
            data=data,
            x=col,
            y="price",
            hue=col,
            palette="dark",
            alpha=0.1,
            jitter=True,
            dodge=True,
        )

        # Drop the first three characters of the column name and split the
        # remaining camelCase words with spaces for the plot labels.
        factor = re.sub(r"(?<=[a-z])(?=[A-Z])", " ", col[3:])

        plt.title("Price Distribution by " + factor + " Availability", fontsize=15)
        plt.xlabel("Presence of " + factor)
        plt.ylabel("Price [zł]")

        plt.grid(True, which="both", linestyle="--")
        plt.minorticks_on()
        plt.tight_layout()

        plt.savefig(
            "./../plots/data_two_sides/" + file[0] + "_" + col[3:] + "_distribution"
        )
        plt.show()
        # Release the figure so memory does not grow with the number of features.
        plt.close()

    # Build the report tables once per file, after all features are processed.
    # Doing it here also guarantees they never carry results over from a
    # previous file when the current one has no usable binary columns.
    data_stats = pd.DataFrame(stats_contrast)
    data_stats_range = pd.DataFrame(stats_contrast_group)

    # Total score per feature across all numeric score columns.
    data_stats_range["Sum_of_rows"] = data_stats_range.select_dtypes(
        include=[np.number]
    ).sum(axis=1)

    data_stats.to_csv("./../reports/" + file[0] + "_stats_" + file[2:], index=False)
    data_stats_range.to_csv(
        "./../reports/" + file[0] + "_stats_easy_" + file[2:], index=False
    )

    print("=================Stats for " + file + "=====================")
    print(data_stats)
    print("---------------------")
    print(data_stats_range)

Selecting the features with the largest difference in price

In [None]:
import pandas as pd

# Rank the features of each simplified report by their total score and print
# only that score column (the last column of each table).
path = "./../reports/"

# The first CSV column becomes the index, so it labels each printed row.
data_rent = pd.read_csv(
    path + "1_stats_easy_analysis_rent.csv", index_col=0
).sort_values("Sum_of_rows", ascending=False)
data_purchase = pd.read_csv(
    path + "2_stats_easy_analysis_purchase.csv", index_col=0
).sort_values("Sum_of_rows", ascending=False)

rent_score_column = data_rent.columns[-1]
purchase_score_column = data_purchase.columns[-1]

print("Rent \n", data_rent[[rent_score_column]])

print("Purchase \n", data_purchase[[purchase_score_column]])

We can observe a strong correlation between price and the presence of an elevator, a storage room and security for rent, and the presence of security, a storage room and a parking space for purchase. However, a strong correlation with price is observed for all features.