In [3]:
# importing libraries
# data manipulation and cleaning
import numpy as np
import pandas as pd
from pandas.tseries.offsets import MonthEnd
import datetime
from functools import reduce



# visualisation
import matplotlib.pyplot as plt
import matplotlib.ticker as tick
import seaborn as sns

plt.style.use('ggplot')




In [10]:
def ppr_report(csv):
    """
    Generate the PPR - Export report for the Ulster province.

    Loads ``<csv>.csv``, cleans the raw transactions and applies the exclusion
    parameters, so the returned frame already excludes the unwanted rows.

    Parameters
    ----------
    csv : str
        Name (or path) of the input file WITHOUT the ``.csv`` extension.
        The file must provide the columns 'Date of Sale (dd/mm/yyyy)',
        'Price', 'Address', 'County', 'Description of Property',
        'Not Full Market Price' and 'VAT Exclusive'.

    Returns
    -------
    pandas.DataFrame
        The input columns (date column renamed to 'Date', 'Address'
        lower-cased, 'Price' as float) plus 'Month_end_day' and 'Province',
        restricted to rows that
        * are dated between 2014-01-01 and 2019-12-31 (inclusive),
        * belong to an Ulster county,
        * are second-hand properties sold at full market price,
        * have a price above 10,000 and are not VAT exclusive,
        * are neither duplicated addresses nor bulk transactions.

    IMPORTANT: the output of the report already includes the exclusion parameters.
    """
    ppr_export = pd.read_csv(str(csv) + '.csv')

    # the only column to be renamed is the Date column
    ppr_export = ppr_export.rename(columns={'Date of Sale (dd/mm/yyyy)': 'Date'})

    # the raw dates are day-first (dd/mm/yyyy); without dayfirst=True an
    # ambiguous value such as 05/01/2014 would be read as the 1st of May
    ppr_export['Date'] = pd.to_datetime(ppr_export['Date'], dayfirst=True)

    # last day of the month of the sale (MonthEnd(0) keeps dates already on a month end)
    ppr_export['Month_end_day'] = ppr_export['Date'] + MonthEnd(0)

    # prices come as text with thousands separators, e.g. "150,000.00"
    ppr_export['Price'] = ppr_export['Price'].str.replace(',', '', regex=False).astype(float)

    # --- duplicates -------------------------------------------------------
    # the same (case-insensitive) address is kept only once
    ppr_export['Address'] = ppr_export['Address'].str.lower()
    ppr_export = ppr_export.drop_duplicates(subset='Address', keep='first')

    # --- bulk transactions ------------------------------------------------
    # rows sharing the 2nd and 3rd address components, the price and the date
    # are treated as one bulk sale and ALL of them are removed (keep=False).
    # The split is kept in a helper frame so no temporary column leaks into
    # the report, whatever the number of components an address has.
    address_parts = ppr_export['Address'].str.split(', ', expand=True)
    n_parts = max(3, address_parts.shape[1])
    address_parts = address_parts.reindex(columns=list(range(n_parts)))
    bulk_key = pd.DataFrame({
        'address_part_1': address_parts[1],
        'address_part_2': address_parts[2],
        'Price': ppr_export['Price'],
        'Date': ppr_export['Date'],
    })
    ppr_export = ppr_export[~bulk_key.duplicated(keep=False)].copy()

    # --- province label ---------------------------------------------------
    provinces = {
        'Ulster': ['Antrim', 'Armagh', 'Down', 'Fermanagh', 'Londonderry',
                   'Tyrone', 'Cavan', 'Donegal', 'Monaghan'],
        'Leinster': ['Carlow', 'Dublin', 'Kildare', 'Kilkenny', 'Laois',
                     'Longford', 'Louth', 'Meath', 'Offaly', 'Westmeath',
                     'Wexford', 'Wicklow'],
        'Munster': ['Cork', 'Clare', 'Kerry', 'Limerick', 'Tipperary', 'Waterford'],
        'Connaught': ['Galway', 'Leitrim', 'Mayo', 'Roscommon', 'Sligo'],
    }
    county_to_province = {
        county: province
        for province, province_counties in provinces.items()
        for county in province_counties
    }
    # counties that are not listed keep their own name as the label
    ppr_export['Province'] = (
        ppr_export['County'].map(county_to_province).fillna(ppr_export['County'])
    )

    # --- exclusion parameters (all boolean masks on the same index) --------
    # timeframe between Jan 2014 and December 2019, both ends included
    in_period = ppr_export['Date'].between('2014-01-01', '2019-12-31')
    # province we are interested in
    in_ulster = ppr_export['County'].isin(provinces['Ulster'])
    # second-hand houses only; a missing description does not match
    second_hand = ppr_export['Description of Property'].str.contains(
        'Second-Hand ', regex=False, na=False)
    # keep only sales made at full market price
    full_market_price = ppr_export['Not Full Market Price'] == 'No'
    # exclude the cases where the transaction amount is not above 10k
    above_min_price = ppr_export['Price'] > 10000
    # exclude the VAT exclusive cases
    vat_inclusive = ppr_export['VAT Exclusive'] == 'No'

    keep = (in_period & in_ulster & second_hand
            & full_market_price & above_min_price & vat_inclusive)

    return ppr_export[keep].copy()


# Run the PPR pipeline on the Q1 2020 raw extract
# (ppr_report appends the '.csv' extension to the name itself).
ppr_export = ppr_report(
    'Q1_2020_Data_analyst_transactions'
)

# Persist the filtered transactions to disk.
ppr_export.to_csv(
    'ppr_export.csv'
)


In [11]:
# Ulster counties for which a market report is produced (one section per county)
counties = [
    'Antrim', 'Armagh', 'Down', 'Fermanagh', 'Londonderry', 'Tyrone',
    'Cavan', 'Donegal', 'Monaghan',
]

def market_report(raw_df,county):
    """
    Build the aggregated market report of a single county.

    Parameters
    ----------
    raw_df : pandas.DataFrame
        Transactions with at least the columns 'County', 'Month_end_day'
        (datetime), 'id' and 'Price'.
    county : str
        Value of 'County' to report on.

    Returns
    -------
    pandas.DataFrame
        One row per month (month-end date) in which the county has at least
        one transaction, with the columns, in this order:
        'Month_end_day',
        'year_tr', 'semester_tr', 'monthly_tr'            (number of transactions),
        'yearly_sales', 'semester_sales', 'monthly_sales' (sum of 'Price'),
        'p50_12mo_sales', 'p50_6mo_sales', 'p50_3mo_sales',
        'p50_12mo_sales_chng', 'p50_3mo_sales_chng',
        'p25_12mo_sales', 'p75_12mo_sales',
        'County', 'Province'.
        'Province' is always set to 'Ulster'.

    Notes
    -----
    * The yearly/semester figures are rolling sums over the last 12/6 rows,
      i.e. over months that have transactions; they are NaN until enough
      months are available.
    * The medians and percentiles are computed on 3/6/12-month resampling
      bins and are only filled on the month that closes a bin (NaN
      elsewhere). The first bin closes on the first month of data, so it only
      contains that month.
    """
    # filter once instead of once per metric
    county_df = raw_df[raw_df['County'] == county]
    by_month = county_df.groupby('Month_end_day')

    # total transactions and total sales volume per month (sorted by month)
    monthly_tr = by_month['id'].count()
    monthly_sales = by_month['Price'].sum()

    # prices indexed by month end, input of the resampled statistics
    prices = county_df.set_index('Month_end_day')['Price'].sort_index()

    def _resample(months):
        # offset objects are used instead of the '3M'-style aliases, which
        # are deprecated in recent pandas releases; the bins are identical
        return prices.resample(pd.offsets.MonthEnd(months))

    # median sales price per bin and its simple change versus the previous bin
    p50_3mo_sales = _resample(3).median()
    p50_6mo_sales = _resample(6).median()
    p50_12mo_sales = _resample(12).median()

    # every metric gets its final name here, already in the expected order
    metrics = {
        'year_tr': monthly_tr.rolling(12).sum().round(2),
        'semester_tr': monthly_tr.rolling(6).sum().round(2),
        'monthly_tr': monthly_tr,
        'yearly_sales': monthly_sales.rolling(12).sum().round(2),
        'semester_sales': monthly_sales.rolling(6).sum().round(2),
        'monthly_sales': monthly_sales.round(2),
        'p50_12mo_sales': p50_12mo_sales,
        'p50_6mo_sales': p50_6mo_sales,
        'p50_3mo_sales': p50_3mo_sales,
        'p50_12mo_sales_chng': p50_12mo_sales.pct_change(),
        'p50_3mo_sales_chng': p50_3mo_sales.pct_change(),
        'p25_12mo_sales': _resample(12).quantile(.25),
        'p75_12mo_sales': _resample(12).quantile(.75),
    }

    # align everything on the months that have transactions (equivalent to
    # left-joining each metric on 'Month_end_day')
    months = monthly_tr.index
    df_merged = pd.DataFrame(
        {name: series.reindex(months) for name, series in metrics.items()}
    )
    df_merged = df_merged.rename_axis('Month_end_day').reset_index()

    # label with the county that was requested (not with a global variable)
    df_merged['County'] = county
    df_merged['Province'] = 'Ulster'

    return df_merged



# here we build one aggregated report per county; the reports are collected in
# a list and stacked in one go afterwards
county_reports = []

# here we iterate over all the counties in Ulster
for i in counties:
    df = market_report(ppr_export, i)
    county_reports.append(df)

# ignore_index gives the stacked report a unique row index instead of
# repeating 0..n for every county
market_report_df = pd.concat(county_reports, ignore_index=True)

# metric columns, in the order in which market_report returns them
metric_columns = ['year_tr', 'semester_tr', 'monthly_tr',
                  'yearly_sales', 'semester_sales', 'monthly_sales',
                  'p50_12mo_sales', 'p50_6mo_sales', 'p50_3mo_sales',
                  'p50_12mo_sales_chng', 'p50_3mo_sales_chng',
                  'p25_12mo_sales', 'p75_12mo_sales']

# rename (by position) and re-order the columns to match the expected output
market_report_df.columns = ['Month_end_day'] + metric_columns + ['County', 'Province']
market_report_df = market_report_df[['Province', 'County', 'Month_end_day'] + metric_columns]

# here we want to format the columns with sales amount with the $ or % sign and show in comprehensible units.
# However, we generate a separate dataframe that is specific for the report and not for the visualisation
market_report_df_formatted = market_report_df.copy()

columns_to_format_dollar = ['yearly_sales','semester_sales','monthly_sales','p50_12mo_sales','p50_6mo_sales','p50_3mo_sales', 'p25_12mo_sales','p75_12mo_sales']
columns_to_format_pct = ['p50_12mo_sales_chng', 'p50_3mo_sales_chng']

# Series.map is used column by column (DataFrame.applymap is deprecated);
# na_action='ignore' leaves missing values empty instead of writing the
# literal text '$nan' / 'nan%' in the report
for column in columns_to_format_dollar:
    market_report_df_formatted[column] = (
        market_report_df_formatted[column].astype(float)
        .map('${:,.2f}'.format, na_action='ignore')
    )
for column in columns_to_format_pct:
    market_report_df_formatted[column] = (
        market_report_df_formatted[column].astype(float)
        .map('{:,.2%}'.format, na_action='ignore')
    )


market_report_df_formatted.to_csv('market_report_df_formatted.csv')
