In [17]:
import numpy as np
import pandas as pd
import re


In [18]:
def extract_features(lines):     # returns a dictionary , takes a list of sentsences(each house)
    features = dict()
    for line in lines:
        if 'House ID' in line: #substirg matching
            features['House ID'] = line.split()[-1]
            continue
        if 'Date Built' in line and 'Date Priced' in line:
            date_built, date_priced = line.split('and')
            features['Date Built'] = date_built.split(': ')[-1].strip()  #removes white spaces at teh 
            features['Date Priced'] = date_priced.split(': ')[-1].strip()
            continue
        if 'garden' in line:
            if 'no space' in line:
                features['garden'] = 0
                continue
            if 'beautiful' in line:
                features['garden'] = 1
                continue
        if 'Distance from' in line:
            place,distance = line.replace(' holy lights','').split(' is ')
            features[place] = float(distance)
            continue
        if 'dining rooms' in line:
            features['dining rooms'] = int(line.replace('There are ','').split()[0])
            continue
        if 'bedrooms' in line:
            features['bedrooms'] = int(line.replace('There are ','').split()[0])
            continue
        if 'bathrooms' in line:
            features['bathrooms'] = int(line.replace('There are ','').split()[0])
            continue
        if 'King' in line and ('visit' in line or 'Visited' in line):
            if 'couldn\'t' in line:
                features['King visited'] = 0
                continue
            else:
                features['King visited'] = 1
                continue
        if 'curse' in line:
            if 'cursed' in line:
                features['cursed'] = 1
                continue
            else:
                features['cursed'] = 0
                continue
        if 'King blessed the house with ' in line:
            features['blessings'] = int(line.replace('King blessed the house with ','').replace(' blessings',''))
            continue
        if 'land of farm' in line:
            if 'huge' in line:
                features['farm'] = 'huge'
                continue
            elif 'small' in line:
                features['farm'] = 'small'
                continue
            elif 'no land' in line:
                features['farm'] = 'no land'
                continue
        if 'Location of the house' in line:
            features['Location'] = line.split(': ')[-1].strip()
            continue
        if 'Holy tree'in line:
            if 'stands' in line:
                features['Holy tree'] = 1
                continue
            elif 'cut' in line:
                features['Holy tree'] = -1
                continue
        if 'Distance from Knight\'s house is ' in line:
            features['distance from Knight house'] = float(line.replace('Distance from Knight\'s house is ','').replace(' holy lights',''))
            continue
        if 'renovation' in line:
            if 'did not' in line:
                features['renovation'] = 0
                continue
            elif 'underwent' in line:
                features['renovation'] = 1
                continue
    return features

In [19]:
def extract_data(filename):
    text = open(filename,'r').read()      #second parameter is for read, write(ref - google file handling in python)
    data = pd.DataFrame()
    count = 0
    houses = text.split('\n\n')
    for house_details in houses:           #take one house at a time, get features and put in the dataset
        features = extract_features(house_details.split('\n'))# split function returns a list of lines
        for key in features.keys():
            data.loc[count,key] = features[key]
        count += 1
    return data     

In [20]:
def getAllData(filenames):
    allData = pd.DataFrame()
    for filename in filenames:
        print(filename)
        data = extract_data(filename)
        allData = pd.concat([allData,data])
    return allData    

In [21]:
filenames = ['The-Kings.txt','The-Greens.txt','The-Lannisters.txt','Bright-Brothers.txt',
             'Masters-of-Stones.txt','The-Ollivers.txt','The-Overlords.txt','The-Starks.txt',
             'Wood-Priests.txt','Bob.txt','Not-Known.txt']
print ("This will take around 10 minutes or so")
allData = getAllData(filenames)

This will take around 10 minutes or so
The-Kings.txt
The-Greens.txt
The-Lannisters.txt
Bright-Brothers.txt
Masters-of-Stones.txt
The-Ollivers.txt
The-Overlords.txt
The-Starks.txt
Wood-Priests.txt
Bob.txt
Not-Known.txt


In [22]:
allData.head()
allData.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 20000 entries, 0 to 1392
Data columns (total 20 columns):
Date Built                      20000 non-null object
Date Priced                     20000 non-null object
Distance from Capital           18985 non-null float64
Distance from Guarding Tower    19366 non-null float64
Distance from Knight's house    18985 non-null float64
Distance from Royal Market      17175 non-null float64
Distance from the Dock          17980 non-null float64
Distance from the River         20000 non-null float64
Holy tree                       16030 non-null float64
House ID                        20000 non-null object
King visited                    20000 non-null float64
Location                        17963 non-null object
bathrooms                       18834 non-null float64
bedrooms                        19225 non-null float64
blessings                       20000 non-null float64
cursed                          18778 non-null float64
dining rooms    

In [23]:
allData.to_csv('allData.csv',index=False)

In [24]:
allData = pd.read_csv('allData.csv')
housePrices = pd.read_csv('house-prices.csv')
missing = pd.read_csv('missing.csv')

In [25]:
allData.head()

Unnamed: 0,Date Built,Date Priced,Distance from Capital,Distance from Guarding Tower,Distance from Knight's house,Distance from Royal Market,Distance from the Dock,Distance from the River,Holy tree,House ID,King visited,Location,bathrooms,bedrooms,blessings,cursed,dining rooms,farm,garden,renovation
0,2/10/1603 8:51 PM,6/5/1611 11:44 PM,19.885265,22.395136,29.921162,92.698917,79.72455,8.059939,,6e32cece,0.0,The Mountains,2.0,2.0,140.0,0.0,2.0,small,0.0,0.0
1,2/26/1600 3:13 PM,3/10/1610 2:23 PM,25.106037,24.82466,36.084848,141.942752,145.034612,21.44816,1.0,6e32cf20,0.0,The Mountains,4.0,3.0,135.0,0.0,4.0,small,0.0,0.0
2,9/19/1606 11:55 PM,7/26/1608 5:17 AM,87.174747,59.149911,98.032248,16.476015,54.645921,2.980832,0.0,6e32cf76,0.0,Cursed Land,3.0,2.0,73.0,1.0,2.0,small,0.0,0.0
3,9/10/1602 1:18 AM,10/10/1612 2:47 PM,111.965941,72.036244,122.889439,99.671185,151.105728,9.879558,1.0,6e32cf7c,0.0,Servant's Premises,2.0,2.0,48.0,0.0,3.0,small,0.0,0.0
4,3/15/1605 11:16 PM,1/19/1612 7:39 AM,27.953048,7.977065,38.774105,50.869864,11.666815,13.758185,,6e32cf80,0.0,The Mountains,3.0,4.0,132.0,0.0,3.0,small,0.0,0.0


In [26]:
def extract_year(date):
    date = date.split()[0]
    return int(date.split('/')[-1])

#print allData.columns
allData['Date Built'] = allData['Date Built'].apply(extract_year).apply(int).apply(lambda x: int())
allData['Date Priced'] = allData['Date Priced'].apply(extract_year).apply(int)

allData

Unnamed: 0,Date Built,Date Priced,Distance from Capital,Distance from Guarding Tower,Distance from Knight's house,Distance from Royal Market,Distance from the Dock,Distance from the River,Holy tree,House ID,King visited,Location,bathrooms,bedrooms,blessings,cursed,dining rooms,farm,garden,renovation
0,0,1611,19.885265,22.395136,29.921162,92.698917,79.724550,8.059939,,6e32cece,0.0,The Mountains,2.0,2.0,140.0,0.0,2.0,small,0.0,0.0
1,0,1610,25.106037,24.824660,36.084848,141.942752,145.034612,21.448160,1.0,6e32cf20,0.0,The Mountains,4.0,3.0,135.0,0.0,4.0,small,0.0,0.0
2,0,1608,87.174747,59.149911,98.032248,16.476015,54.645921,2.980832,0.0,6e32cf76,0.0,Cursed Land,3.0,2.0,73.0,1.0,2.0,small,0.0,0.0
3,0,1612,111.965941,72.036244,122.889439,99.671185,151.105728,9.879558,1.0,6e32cf7c,0.0,Servant's Premises,2.0,2.0,48.0,0.0,3.0,small,0.0,0.0
4,0,1612,27.953048,7.977065,38.774105,50.869864,11.666815,13.758185,,6e32cf80,0.0,The Mountains,3.0,4.0,132.0,0.0,3.0,small,0.0,0.0
5,0,1612,107.687755,122.390194,118.003414,56.151360,62.104754,77.656929,1.0,6e32cfa3,0.0,The Mountains,3.0,4.0,52.0,,3.0,small,0.0,
6,0,1610,4.460661,123.568064,15.207348,12.904976,26.174476,43.261003,1.0,6e32cfb9,0.0,The Mountains,2.0,3.0,156.0,0.0,3.0,huge,0.0,0.0
7,0,1607,33.914079,129.504031,44.651891,26.547489,29.348649,21.441570,1.0,6e32d01c,0.0,The Mountains,4.0,2.0,126.0,0.0,3.0,small,0.0,0.0
8,0,1610,113.152269,31.530238,123.253774,72.557943,21.926644,17.352285,1.0,6e32d026,0.0,,4.0,3.0,47.0,0.0,3.0,huge,0.0,0.0
9,0,1610,25.960131,135.664742,36.170315,87.189539,107.670121,12.397643,1.0,6e32d043,0.0,Cursed Land,4.0,2.0,134.0,0.0,3.0,small,0.0,


Unnamed: 0,Date Built,Date Priced,Distance from Capital,Distance from Guarding Tower,Distance from Knight's house,Distance from Royal Market,Distance from the Dock,Distance from the River,Holy tree,House ID,King visited,Location,bathrooms,bedrooms,blessings,cursed,dining rooms,farm,garden,renovation
0,1611,1611,19.885265,22.395136,29.921162,92.698917,79.72455,8.059939,,6e32cece,0.0,The Mountains,2.0,2.0,140.0,0.0,2.0,small,no space,0.0
1,1610,1610,25.106037,24.82466,36.084848,141.942752,145.034612,21.44816,stands beside,6e32cf20,0.0,The Mountains,4.0,3.0,135.0,0.0,4.0,small,no space,0.0
2,1608,1608,87.174747,59.149911,98.032248,16.476015,54.645921,2.980832,cut by Ancient witch,6e32cf76,0.0,Cursed Land,3.0,2.0,73.0,1.0,2.0,small,no space,0.0
3,1612,1612,111.965941,72.036244,122.889439,99.671185,151.105728,9.879558,stands beside,6e32cf7c,0.0,Servant's Premises,2.0,2.0,48.0,0.0,3.0,small,no space,0.0
4,1612,1612,27.953048,7.977065,38.774105,50.869864,11.666815,13.758185,,6e32cf80,0.0,The Mountains,3.0,4.0,132.0,0.0,3.0,small,no space,0.0
