# In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
import numpy as np
from sklearn.decomposition import PCA
import statsmodels.api as sm
from matplotlib.patches import Circle

# Load the real-estate dataset into a pandas DataFrame.
data = pd.read_csv('Real estate.csv')

# Drop the 'No' column, as it is non-informative.
data = data.drop(columns=['No'])

# First rows, to eyeball the column layout.
print(data.head())

# (number of rows, number of columns)
print(data.shape)

# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() would emit a stray "None" line after the report.
data.info()

# Count of null values in each column.
print(data.isnull().sum())

# Descriptive statistics of all columns (mean, min, max, quantiles, ...).
print(data.describe())

# Pairwise scatterplots of columns, with KDE plots on the diagonal.
sns.pairplot(data, diag_kind='kde')
# Render the pair plot now; otherwise its figure stays current and the
# heatmap below would be drawn into one of its panels.
plt.show()

# Correlation heatmap of columns, drawn on its own fresh figure so it can
# never land on axes left over from the pair plot.
plt.figure()
correlation_matrix = data.corr()
sns.heatmap(correlation_matrix, annot=True, cmap="coolwarm")
plt.title("Correlation Heatmap of Variables")
plt.show()

# Split the frame into the dependent variable (house price of unit area)
# and the independent variables (every remaining column).
target_column = 'Y house price of unit area'
y = data[target_column]
X = data.drop(columns=[target_column])

# Prepend the constant column that acts as the intercept term.
X = sm.add_constant(X)

# First linear regression model, fitted on all variables.
model = sm.OLS(y, X).fit()
print(model.summary())

# Choosing the variables based on p-values for the updated model.
X = data[['X1 transaction date', 'X2 house age', 'X3 distance to the nearest MRT station', 'X4 number of convenience stores', 'X5 latitude']]

# sm.OLS fits exactly the columns it is given, so the intercept has to be
# added explicitly, just as for the first model. The constant goes into a
# separate design matrix so that X itself keeps only the real predictors
# for the code further down that reuses it.
X_design = sm.add_constant(X)

model = sm.OLS(y, X_design).fit()
print(model.summary())

from sklearn.preprocessing import StandardScaler

# Standardizing the data for PCA (zero mean, unit variance per column).
Xstd = StandardScaler().fit_transform(X)

pca = PCA()
pca.fit(Xstd)

# The explained variance ratio of each component.
print(pca.explained_variance_ratio_)

# The cumulative variance of components (1, 1+2, 1+2+3 etc.).
cumulative_variance = np.cumsum(pca.explained_variance_ratio_)
print(cumulative_variance)

# Component loadings: one row per principal component.
print(pca.components_)

# Transforming the data based on PCA weights.
X_pca = pca.transform(Xstd)

# The component scores are mean-centred, so a regression without an
# intercept would be forced through the origin and could not account for
# the mean of y. The constant is added to a separate design matrix so that
# X_pca keeps holding only the raw scores for the plots below.
X_pca_design = sm.add_constant(X_pca)
model = sm.OLS(y, X_pca_design).fit()
print(model.summary())

# For every original variable, compute its correlation with the first and
# the second principal component; each (corr_pc1, corr_pc2) pair is the tip
# of that variable's arrow in the correlation circle.
ccircle = [
    (
        np.corrcoef(X[name], X_pca[:, 0])[0, 1],
        np.corrcoef(X[name], X_pca[:, 1])[0, 1],
    )
    for name in X.columns
]

# Euclidean length of each arrow, i.e. its distance from the origin.
eucl_dist = [np.sqrt(pc1 ** 2 + pc2 ** 2) for pc1, pc2 in ccircle]
    

# Plotting the PCA correlation circle.

# The seaborn style sheets were renamed to 'seaborn-v0_8-*' in matplotlib 3.6
# and the old names were removed afterwards, so use whichever name this
# installation provides and fall back to the default style otherwise.
style_name = next(
    (name for name in ('seaborn-v0_8-whitegrid', 'seaborn-whitegrid')
     if name in plt.style.available),
    'default',
)

# Arrow colours encode arrow length, rescaled to [0, 1] for the colormap.
# The bounds are computed once instead of on every loop iteration; a zero
# range (all arrows equally long) is replaced by 1 to avoid dividing by zero.
dist_arr = np.asarray(eucl_dist)
dist_min = dist_arr.min() if dist_arr.size else 0.0
dist_range = (dist_arr.max() - dist_min) if dist_arr.size else 0.0
if dist_range == 0:
    dist_range = 1.0

with plt.style.context(style_name):
    fig, axs = plt.subplots(figsize=(6, 6))
    for (corr_pc1, corr_pc2), dist, label in zip(ccircle, eucl_dist, X.columns):
        arrow_col = plt.cm.cividis((dist - dist_min) / dist_range)
        axs.arrow(0, 0,  # Arrows start at the origin
                  corr_pc1,  # correlation with PC1
                  corr_pc2,  # correlation with PC2
                  lw=2,  # line width
                  length_includes_head=True,
                  color=arrow_col,
                  fc=arrow_col,
                  head_width=0.05,
                  head_length=0.05)
        # Label each arrow at its midpoint with the variable name.
        axs.text(corr_pc1 / 2, corr_pc2 / 2, label)
    # Draw the unit circle, for clarity.
    circle = Circle((0, 0), 1, facecolor='none', edgecolor='k', linewidth=1, alpha=0.5)
    axs.add_patch(circle)
    axs.set_xlabel("PCA 1")
    axs.set_ylabel("PCA 2")
plt.tight_layout()
plt.show()

# Scatter plot of the observations projected onto the first two principal
# components.
sns.scatterplot(x=X_pca[:, 0], y=X_pca[:, 1])
plt.xlabel("PCA 1")
plt.ylabel("PCA 2")
# Without an explicit show() the figure is never displayed when this runs
# as a plain script.
plt.show()