In [1]:
import pandas as pd, numpy as np,  scipy.stats as st, matplotlib.pyplot as plt, requests, json, getpass, statsmodels.api as sm
from api_keys import geoapify_key

In [2]:
data_df = pd.read_csv("Clean_Data\cleaned_perth_data.csv")
data_df.drop_duplicates(subset=['ADDRESS'],inplace=True)
data_Lat = data_df.groupby("SUBURB")["LATITUDE"].mean()
data_Long = data_df.groupby("SUBURB")["LONGITUDE"].mean()
data_Sub = data_Long.index
Clean_data_df = pd.DataFrame({"Suburb": data_Sub,
                              "Latitude": data_Lat,
                              "Longitude": data_Long})
Clean_data_df["Distance_to_Beach"] = ""

In [3]:
base_url = "https://api.geoapify.com/v2/places"

limit = 20
categories = "beach"
apiKey = geoapify_key

for index, row in Clean_data_df.iterrows():
    latitude = Clean_data_df["Latitude"][index]
    longitude = Clean_data_df["Longitude"][index]

    # Add filter and bias parameters with the current city's latitude and longitude to the params dictionary
    params = {"bias": f"proximity:{longitude},{latitude}",
              "categories": categories,
              "limit": limit,
              "apiKey": apiKey,
              }

    beach_distance = requests.get(base_url, params = params)
    beach_distance = beach_distance.json()
    try:
        Clean_data_df.loc[index, "Distance_to_Beach"] = int(beach_distance["features"][0]["properties"]["distance"])
    except (KeyError, IndexError):
        Clean_data_df.loc[index, "Distance_to_Beach"] = "No beach is found"
    print(f"nearest {Clean_data_df.loc[index, 'Suburb']} - nearest beach is located by : {Clean_data_df.loc[index, 'Distance_to_Beach']}")

Clean_data_df

nearest Alexander Heights - nearest beach is located by : 10907
nearest Alfred Cove - nearest beach is located by : 2983
nearest Alkimos - nearest beach is located by : 2268
nearest Anketell - nearest beach is located by : 9941
nearest Applecross - nearest beach is located by : 1876
nearest Ardross - nearest beach is located by : 1368
nearest Armadale - nearest beach is located by : 4080
nearest Ascot - nearest beach is located by : 615
nearest Ashby - nearest beach is located by : 7217
nearest Ashfield - nearest beach is located by : 643
nearest Attadale - nearest beach is located by : 1535
nearest Atwell - nearest beach is located by : 9530
nearest Aubin Grove - nearest beach is located by : 8425
nearest Aveley - nearest beach is located by : 11525
nearest Balcatta - nearest beach is located by : 6766
nearest Baldivis - nearest beach is located by : 6505
nearest Balga - nearest beach is located by : 8357
nearest Ballajura - nearest beach is located by : 9618
nearest Banjup - nearest 

Unnamed: 0_level_0,Suburb,Latitude,Longitude,Distance_to_Beach
SUBURB,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Alexander Heights,Alexander Heights,-31.827133,115.864632,10907
Alfred Cove,Alfred Cove,-32.033900,115.815981,2983
Alkimos,Alkimos,-31.615747,115.687668,2268
Anketell,Anketell,-32.215547,115.877233,9941
Applecross,Applecross,-32.013490,115.838477,1876
...,...,...,...,...
Wooroloo,Wooroloo,-31.801048,116.311440,7844
Wungong,Wungong,-32.181590,116.014520,7872
Yanchep,Yanchep,-31.544042,115.638093,1452
Yangebup,Yangebup,-32.122827,115.811484,4693


In [4]:
Clean_data_df["Price"] = data_df.groupby("SUBURB")["PRICE"].mean()
Clean_data_df["Land_Area"] = data_df.groupby("SUBURB")["LAND_AREA"].mean()
Clean_data_df["Distance_to_Beach"] = Clean_data_df["Distance_to_Beach"].astype("float64")
Clean_data_df.to_csv("Cleaned_Data.csv")
Clean_data_df

Unnamed: 0_level_0,Suburb,Latitude,Longitude,Distance_to_Beach,Price,Land_Area
SUBURB,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Alexander Heights,Alexander Heights,-31.827133,115.864632,10907.0,4.490973e+05,625.787611
Alfred Cove,Alfred Cove,-32.033900,115.815981,2983.0,8.493350e+05,577.790000
Alkimos,Alkimos,-31.615747,115.687668,2268.0,4.142292e+05,10647.875000
Anketell,Anketell,-32.215547,115.877233,9941.0,1.005267e+06,20896.400000
Applecross,Applecross,-32.013490,115.838477,1876.0,1.396130e+06,703.917808
...,...,...,...,...,...,...
Wooroloo,Wooroloo,-31.801048,116.311440,7844.0,4.116774e+05,24271.387097
Wungong,Wungong,-32.181590,116.014520,7872.0,6.704650e+05,13261.222222
Yanchep,Yanchep,-31.544042,115.638093,1452.0,3.906562e+05,583.000000
Yangebup,Yangebup,-32.122827,115.811484,4693.0,4.702661e+05,665.807339


In [5]:
WA_IncomeProfile_df = pd.read_csv("raw_data\WA_LGA_Income and Rent.csv")
WA_IncomeProfile_df = WA_IncomeProfile_df.drop(["Median_age_persons", "Med_mort_repaymnt_monthly", "Avge_numbr_prsons_per_bedroom",
                                          "Average_household_size", "Median_rent_weekly"], axis = 1)

In [6]:
Suburb_codes = pd.read_csv("raw_data\WA_LGA_Suburb_codes.csv")
Cleaned_Data = pd.read_csv("Cleaned_Data.csv")
Suburb_profile = pd.merge(WA_IncomeProfile_df, Suburb_codes, on = "LGA_CODE_2021", how = "inner")
Suburb_HH_income = pd.merge(Cleaned_Data, Suburb_profile, how = "inner", on = "Suburb").drop("SUBURB", axis = 1)
Suburb_HH_income.rename(columns={"Med_tot_psnl_incom_weekly": "Personal Income",
                                 "Med_tot_family_inc_weekly": "Family Income",
                                 "Med_tot_hh_incom_weekly": "Household Income"}, inplace = True)
Suburb_HH_income.head(10)

Unnamed: 0,Suburb,Latitude,Longitude,Distance_to_Beach,Price,Land_Area,LGA_CODE_2021,Personal Income,Family Income,Household Income,Local government area
0,Alexander Heights,-31.827133,115.864632,10907.0,449097.3,625.787611,58760,801,2148,1891,Wanneroo
1,Alfred Cove,-32.0339,115.815981,2983.0,849335.0,577.79,55320,896,2654,2091,Melville
2,Alkimos,-31.615747,115.687668,2268.0,414229.2,10647.875,58760,801,2148,1891,Wanneroo
3,Anketell,-32.215547,115.877233,9941.0,1005267.0,20896.4,54830,792,1996,1712,Kwinana
4,Applecross,-32.01349,115.838477,1876.0,1396130.0,703.917808,55320,896,2654,2091,Melville
5,Ardross,-32.026771,115.838075,1368.0,1066473.0,547.149533,55320,896,2654,2091,Melville
6,Armadale,-32.148528,116.004218,4080.0,249043.9,859.368421,50210,801,2061,1768,Armadale
7,Ascot,-31.934462,115.928163,615.0,806406.0,686.008621,50490,880,2086,1641,Belmont
8,Ashby,-31.733524,115.799137,7217.0,483774.5,516.159509,58760,801,2148,1891,Wanneroo
9,Ashfield,-31.917406,115.937882,643.0,493561.6,663.849315,50350,855,2278,1735,Bassendean


In [7]:
Variables = np.asarray(pd.DataFrame({"Land": data_df["LAND_AREA"],
                                     "Bedrooms": data_df["BEDROOMS"],
                                     "Bathrooms": data_df["BATHROOMS"]}))
model = sm.OLS(data_df["PRICE"], Variables)
results = model.fit()
print(results.summary())

                                 OLS Regression Results                                
Dep. Variable:                  PRICE   R-squared (uncentered):                   0.796
Model:                            OLS   Adj. R-squared (uncentered):              0.796
Method:                 Least Squares   F-statistic:                          3.948e+04
Date:                Tue, 05 Sep 2023   Prob (F-statistic):                        0.00
Time:                        20:21:46   Log-Likelihood:                     -4.2988e+05
No. Observations:               30429   AIC:                                  8.598e+05
Df Residuals:                   30426   BIC:                                  8.598e+05
Df Model:                           3                                                  
Covariance Type:            nonrobust                                                  
                 coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------

In [8]:
critical_value = st.chi2.ppf(q = 0.95, df = 30498)
critical_value

30905.369404966397

In [9]:
Price_Area = st.ttest_ind(data_df["PRICE"], data_df["LAND_AREA"], equal_var=False)
Price_Area

TtestResult(statistic=310.16255594828897, pvalue=0.0, df=30566.70625247856)

In [10]:
Price_Bedrooms = st.ttest_ind(data_df["PRICE"], data_df["BEDROOMS"], equal_var=False)
Price_Bedrooms

TtestResult(statistic=311.81797176391666, pvalue=0.0, df=30428.000000268188)

In [11]:
Price_Bathrooms = st.ttest_ind(data_df["PRICE"], data_df["BATHROOMS"], equal_var=False)
Price_Bathrooms

TtestResult(statistic=311.81886999735764, pvalue=0.0, df=30428.000000165935)

In [12]:
Price_SchoolDistance = st.ttest_ind(data_df["PRICE"], data_df["NEAREST_SCH_DIST"], equal_var=False)
Price_SchoolDistance

TtestResult(statistic=311.81888701110336, pvalue=0.0, df=30428.000001379256)

In [13]:
Price_SchoolRank = st.ttest_ind(data_df["PRICE"], data_df["NEAREST_SCH_RANK"], equal_var=False, nan_policy= "omit")
Price_SchoolRank

TtestResult(statistic=311.78438716461386, pvalue=0.0, df=30428.00116419359)

In [14]:
Price_Beach = st.ttest_ind(Clean_data_df["Price"], Clean_data_df["Distance_to_Beach"], equal_var=False)
Price_Beach

TtestResult(statistic=39.57761869223503, pvalue=6.271207876284252e-125, df=318.12949066982463)

In [15]:
Income_Var = np.asarray(pd.DataFrame({"Personal_Income": Suburb_HH_income["Personal Income"],
                                     "Family_Income": Suburb_HH_income["Family Income"],
                                     "Household_Income": Suburb_HH_income["Household Income"]}))
Income_model = sm.OLS(Suburb_HH_income["Price"], Income_Var)
results_Income = Income_model.fit()
print(results_Income.summary())

                                 OLS Regression Results                                
Dep. Variable:                  Price   R-squared (uncentered):                   0.921
Model:                            OLS   Adj. R-squared (uncentered):              0.920
Method:                 Least Squares   F-statistic:                              1325.
Date:                Tue, 05 Sep 2023   Prob (F-statistic):                   6.86e-188
Time:                        20:32:58   Log-Likelihood:                         -4718.1
No. Observations:                 345   AIC:                                      9442.
Df Residuals:                     342   BIC:                                      9454.
Df Model:                           3                                                  
Covariance Type:            nonrobust                                                  
                 coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------

In [16]:
critical_value_2 = st.chi2.ppf(q = 0.95, df = 342)
critical_value_2

386.12517545484934