In [15]:

#Import Packages
import pandas as pd
import numpy as np
import glob
from pandasql import sqldf
import datetime
import requests
import bs4

In [16]:

#Global
pysqldf = lambda q: sqldf(q, globals())
now = datetime.datetime.now()
pd.set_option('display.float_format', '{:.4f}'.format)

In [17]:

#import
df_private = pd.read_excel('Data/TransactionDatabase.xlsx')
df_rental = pd.read_excel('Data/RentalDatabase.xlsx')
df_developer = pd.read_excel('Data/DeveloperDatabase.xlsx')
df_pipeline = pd.read_excel('Data/Pipeline.xlsx')
df_regionmapping = pd.read_excel('Data/Region District.xlsx')
df_private.drop(df_private.filter(regex="Unnamed"),axis=1, inplace=True)
df_rental.drop(df_rental.filter(regex="Unnamed"),axis=1, inplace=True)
df_developer.drop(df_developer.filter(regex="Unnamed"),axis=1, inplace=True)
df_pipeline.drop(df_pipeline.filter(regex="Unnamed"),axis=1, inplace=True)
df_regionmapping.drop(df_pipeline.filter(regex="Unnamed"),axis=1, inplace=True)

In [18]:

#Transform Data
df_propertydetails = df_private[['project','TenureType','BuildYear','LeaseYear']].drop_duplicates(subset = ['project'],keep = 'first')
# #Check distinct
# q = """ select count(project) as cnt, project from df_propertydetails group by project having cnt > 1 """
# queryresult = pysqldf(q)
# queryresult
#join region details
df_pipeline = df_pipeline.merge(df_regionmapping[['Postal District','Region']],how='left', left_on='district', right_on='Postal District')
df_rental = df_rental.merge(df_regionmapping[['Postal District','Region']],how='left', left_on='district', right_on='Postal District')
#Join property details
df_rental = df_rental.merge(df_propertydetails,how='left', left_on='project', right_on='project')
df_rental['TenureType'] = df_rental['TenureType'].fillna('NA')

#per unit
df_private['priceperunit'] = df_private['price']/df_private['noOfUnits']
df_private['areaperunit'] = df_private['area']/df_private['noOfUnits']

#getYear
df_private['year']= pd.to_datetime(df_private['contractDate']).dt.year


In [19]:

#Scrap TOP information
def findTOP (project):
    text= project + " Singapore top date"
    url = 'https://google.com/search?q=' + text
    # Fetch the URL data using requests.get(url),
    # store it in a variable, request_result.
    request_result=requests.get( url )  
    # Creating soup from the fetched request
    soup = bs4.BeautifulSoup(request_result.text,
                            "html.parser")
    # with open("output.html", "w", encoding = 'utf-8') as file:
    # # prettify the soup object and convert it into a string  
    #     file.write(str(soup.prettify()))
    element = soup.find_all("td",{'class': 'sjsZvd s5aIid OE1use'}) 
    ele = 0
    for i in element:
        try:
            if int(i.text[-19:14][-4:])>ele and int(i.text[-19:14][-4:]) > 2000:
                    ele= int(i.text[-19:14][-4:])  
        except ValueError:
            try:
                if int('20'+i.text[-26:14][-2:]) > ele and int('20'+i.text[-26:14][-2:]) < 2030 and int('20'+i.text[-26:14][-2:]) > 2000:
                    ele = int('20'+i.text[-26:14][-2:])
            except: 
                ele = ele                   
        finally:
            ele = ele
    try:
        return ele
    except:
        return "NA"

#Get Unique projects from developer list and pipeline
maxdate = df_developer['refPeriod'].max()
df_developer_latest = df_developer[df_developer['refPeriod'] == maxdate]
df_project = df_developer_latest[['project','district','street']].append(df_pipeline[['project','district','street']]).drop_duplicates()
df_project['TOP'] = df_project.apply(lambda x: findTOP(x['project']),axis = 1)

In [34]:
#Infer capital gain from new sale data
#Identify different type of sales
#	The type of sale
#1 – New Sale
#2 – Sub Sale
#3 – Resale

#Join average cost based on:
# resale: price 3 years ago - ssd applicable if sold <= 3 years of ownership (2 years average)
# subsale: price CY and PY - before top (2 years average)
def resalecost(project,year,area,floorRange):
   # df_years = df_private.loc[(df_private['year']<= year-3 ) & (df_private['year'] >= year-5) & (df_private['project'] == project) & (df_private['area'] == area) & (df_private['floorRange'] == floorRange)]
   df_new = df_private.loc[(df_private['typeOfSale'] == 1)& (df_private['project'] == project) & (df_private['area'] == area) & (df_private['floorRange'] == floorRange)]
   cost = df_new["priceperunit"].mean()
   return cost
def subsalecost(project,year,area,floorRange):
   # df_years = df_private.loc[(df_private['year']<= year ) & (df_private['year'] >= year-1) & (df_private['project'] == project) & (df_private['area'] == area) & (df_private['floorRange'] == floorRange)]
   df_new = df_private.loc[(df_private['typeOfSale'] == 1)& (df_private['project'] == project) & (df_private['area'] == area) & (df_private['floorRange'] == floorRange)]
   cost = df_new["priceperunit"].mean()
   return cost

maxdate = df_private['year'].max()
mindate = maxdate - 5
df_privatecondo = df_private.loc[(df_private['propertyType'].isin(['Condominium','Apartment','Executive Condominium'])) & (df_private['year'] > mindate)]
df_privatecondo['cost'] = 0
df_privatecondo['cost'] = df_privatecondo.apply(lambda x : resalecost(x['project'],x['year'],x['area'],x['floorRange']) if x['typeOfSale'] == 3 else (subsalecost(x['project'],x['year'],x['area'],x['floorRange']) if x['typeOfSale'] == 2 else None),axis = 1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [36]:

#combine open projects
df_pipeline['project'] = df_pipeline['project'].str.upper()

df_pipeline['project'] = df_pipeline.apply(lambda x : 'PARC KOMO' if x['project'] == 'PARC KOMO/KOMO SHOPPES' else x['project'], axis = 1)

df_pipelinena = df_pipeline.merge(df_developer_latest[['project','unitsAvail']], how='left', left_on='project',right_on='project')
df_pipelinenaresults = df_pipelinena[df_pipelinena['unitsAvail'].isna()]
inpipeline = df_pipelinenaresults['project'].values.tolist() 

df_pipelinefilter = df_pipeline[df_pipeline['project'].isin(inpipeline)]
df_pipelinefilter2 = df_pipelinefilter[['street','district','project','developerName','totalUnits','Region']]
df_pipelinefilter2.rename(columns={"developerName": "developer", "totalUnits": "unitsAvail", "Region": "marketSegment"},inplace= True)
df_pipelinefilter2['refPeriod'] = maxdate
df_openprojects = df_developer.append(df_pipelinefilter2)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  del sys.path[0]


In [37]:
#join if open project

df_opendistinct = df_openprojects[['project','marketSegment','district','developer']].drop_duplicates()
df_privatecondo1 =  df_privatecondo.merge(df_opendistinct,how='left', left_on=['project','marketSegment','district'], right_on=['project','marketSegment','district'])

In [38]:

#join data - TOP,Tenure
df_openprojects = df_openprojects.merge(df_propertydetails,how='left', left_on='project', right_on='project')
df_openprojects['TenureType'] = df_openprojects['TenureType'].fillna('NA')
df_openprojects = df_openprojects.merge(df_project[['project','TOP']],how='left', left_on='project', right_on='project')
df_openprojects['TOP'] = df_openprojects['TOP'].fillna('NA')

In [39]:

df_privatecondo1.to_excel('Data/PrivateTransactionClean.xlsx')
df_rental.to_excel('Data/RentalClean.xlsx')
df_openprojects.to_excel('Data/OpenProjectsClean.xlsx')
df_private.to_excel('Data/TransactionClean.xlsx')