In [None]:
# import libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Lasso
import scipy.stats as stats
from scipy.stats import chi2_contingency


In [None]:
# Drop features that are not relevant for modelling: the CSV index column,
# the data source, and the precomputed median/aggregate helper columns.
df = df.drop(
    columns=[
        'Unnamed: 0',
        'source',
        'land_surface',
        'facades_number',
        'swimming_pool_has',
        'postcode_median_price',
        'building_state_median_price',
        'property_subtype_median_price',
        'building_property_subtype_median_facades',
        'property_subtype_median_facades',
    ]
)


In [None]:
# Temporary helper column: price per square metre, used only to detect outliers.
df['price_per_sqmtr'] = df['price'].div(df['area'])

In [None]:
# postcode_stats maps each postcode to its number of properties,
# most frequent postcode first (value_counts sorts descending by default).
postcode_stats = df['postcode'].value_counts()

In [None]:
# Pool rare postcodes: any postcode with 10 or fewer data points is relabelled
# as the catch-all location "9999". This greatly reduces the number of
# categories, so the later one-hot encoding produces far fewer dummy columns.
postcode_value_less_than_10 = postcode_stats[postcode_stats.le(10)]

# The index of the filtered counts holds the rare postcode labels.
df['postcode'] = df['postcode'].map(
    lambda code: '9999' if code in postcode_value_less_than_10.index else code
)

In [None]:

def remove_pps_outliers(df):
    """Drop rows whose price per square metre is an outlier within its postcode.

    For each postcode group the mean and the population standard deviation
    (ddof=0) of ``price_per_sqmtr`` are computed, and only rows lying within
    two standard deviations of the mean are kept. Both bounds are inclusive,
    so a group whose values are all identical (standard deviation of zero,
    e.g. a single-row group) is kept instead of being wiped out.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``postcode`` and ``price_per_sqmtr``.

    Returns
    -------
    pandas.DataFrame
        The surviving rows, ordered by postcode group, with a fresh
        0..n-1 index. Rows with a missing ``price_per_sqmtr`` never satisfy
        the bounds and are therefore dropped. An input without any group
        yields an empty frame that still has the input's columns.
    """
    kept_groups = []
    for _, subdf in df.groupby('postcode'):
        pps = subdf['price_per_sqmtr']
        m = pps.mean()
        sd = pps.std(ddof=0)  # population std, matching np.std's default
        # Series.between is inclusive on both ends by default.
        kept_groups.append(subdf[pps.between(m - 2 * sd, m + 2 * sd)])

    if not kept_groups:
        # Nothing to concatenate; keep the schema so later column drops still work.
        return df.iloc[0:0].reset_index(drop=True)

    # Concatenate once instead of growing a frame inside the loop.
    return pd.concat(kept_groups, ignore_index=True)

In [None]:
# Filter out the per-postcode price-per-square-metre outliers, then discard
# the helper column: it was only needed for the outlier detection.
df = remove_pps_outliers(df).drop(columns=['price_per_sqmtr'])


In [None]:

class ChiSquare:
    """Chi-square test of independence between two columns of a DataFrame.

    Used for feature screening: a feature column is tested against a target
    column and labelled relevant when the p-value falls below ``alpha``.

    Both columns are cast to ``str`` before the contingency table is built,
    so every distinct value (including each distinct number) is treated as
    its own category.

    After ``TestIndependence`` has run, the outcome of the most recent test
    is available on the instance:

    * ``p``, ``chi2``, ``dof`` - p-value, test statistic, degrees of freedom
    * ``dfObserved`` / ``dfExpected`` - observed and expected frequency tables
    * ``result`` - the human-readable verdict for the tested feature
    """

    def __init__(self, dataframe):
        """Store the frame to test; all result attributes start as ``None``."""
        self.df = dataframe
        self.p = None  # p-value of the most recent test
        self.chi2 = None  # chi-square test statistic
        self.dof = None  # degrees of freedom

        self.dfObserved = None  # observed contingency table
        self.dfExpected = None  # expected frequencies under independence

        self.result = None  # verdict message of the most recent test

    def _print_chisquare_result(self, colX, alpha, verbose=False):
        """Build the verdict for ``colX`` from ``self.p`` and record it.

        The message is stored on ``self.result`` and returned. It is printed
        only when ``verbose`` is true, so the default stays silent.
        """
        if self.p < alpha:
            result = "{0} is IMPORTANT for Prediction".format(colX)
        else:
            result = "{0} is NOT an important predictor. (Discard {0} from model)".format(colX)

        # Keep the verdict instead of discarding it, so callers can inspect it.
        self.result = result
        if verbose:
            print(result)
        return result

    def TestIndependence(self, colX, colY, alpha=0.05, verbose=False):
        """Run the chi-square independence test of ``colX`` against ``colY``.

        Parameters
        ----------
        colX : str
            Name of the feature column.
        colY : str
            Name of the target column.
        alpha : float, default 0.05
            Significance level used for the verdict.
        verbose : bool, default False
            When true, print the verdict in addition to storing it.

        Updates ``p``, ``chi2``, ``dof``, ``dfObserved``, ``dfExpected`` and
        ``result`` on the instance.
        """
        X = self.df[colX].astype(str)
        Y = self.df[colY].astype(str)

        # Rows are target categories, columns are feature categories.
        self.dfObserved = pd.crosstab(Y, X)
        chi2, p, dof, expected = stats.chi2_contingency(self.dfObserved.values)
        self.p = p
        self.chi2 = chi2
        self.dof = dof

        self.dfExpected = pd.DataFrame(
            expected,
            columns=self.dfObserved.columns,
            index=self.dfObserved.index,
        )

        self._print_chisquare_result(colX, alpha, verbose)

# Build the chi-square tester on the cleaned frame.
cT = ChiSquare(df)

# Feature selection: test every candidate predictor for independence
# from the target column "price".
testColumns = [
    'postcode',
    'house_is',
    'property_subtype',
    'rooms_number',
    'area',
    'equipped_kitchen_has',
    'furnished',
    'open_fire',
    'terrace',
    'garden',
    'region',
    'building_state_agg',
]
for var in testColumns:
    cT.TestIndependence(var, "price")


In [None]:
# Drop the features that the chi-square screening marked as irrelevant.
df = df.drop(columns=['property_subtype', 'furnished', 'garden'])


In [None]:

# One-hot encode the categorical columns. get_dummies is applied to the whole
# frame, so every object/category column is expanded (not only postcode), and
# the empty prefix makes each dummy column carry the bare category label.
dummies = pd.get_dummies(df, prefix='', prefix_sep='')

# Drop one dummy column per encoded feature as the reference level:
# '9999' is the pooled rare-postcode label; 'to_renovate' and 'B' are
# presumably levels of building_state_agg and region - TODO confirm.
df = dummies.drop(columns=['9999', 'to_renovate', 'B'])


In [None]:

# Separate the predictors from the target, then hold out 20% of the rows
# for testing; the fixed random_state keeps the split reproducible.
X = df.drop(columns=['price'])
y = df['price']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=10
)
