In [10]:
import numpy as np
import pandas as pd
import re

In [11]:
def extract_features(lines):
    features = dict()
    for line in lines:
        if 'House ID' in line:
            features['House ID'] = line.split()[-1]
            continue
        if 'Date Built' in line and 'Date Priced' in line:
            date_built, date_priced = line.split('and')
            features['Date Built'] = line.split(': ')[-1].strip()
            features['Date Priced'] = line.split(': ')[-1].strip()
            continue
        if 'garden' in line:
            if 'no space' in line:
                features['garden'] = 0
                continue
            if 'beautiful' in line:
                features['garden'] = 1
                continue
        if 'Distance from' in line:
            place,distance = line.replace(' holy lights','').split(' is ')
            features[place] = float(distance)
            continue
        if 'dining rooms' in line:
            features['dining rooms'] = int(line.replace('There are ','').split()[0])
            continue
        if 'bedrooms' in line:
            features['bedrooms'] = int(line.replace('There are ','').split()[0])
            continue
        if 'bathrooms' in line:
            features['bathrooms'] = int(line.replace('There are ','').split()[0])
            continue
        if 'King' in line and ('visit' in line or 'Visited' in line):
            if 'couldn\'t' in line:
                features['King visited'] = 0
                continue
            else:
                features['King visited'] = 1
                continue
        if 'curse' in line:
            if 'cursed' in line:
                features['cursed'] = 1
                continue
            else:
                features['cursed'] = 0
                continue
        if 'King blessed the house with ' in line:
            features['blessings'] = int(line.replace('King blessed the house with ','').replace(' blessings',''))
            continue
        if 'land of farm' in line:
            if 'huge' in line:
                features['farm'] = 'huge'
                continue
            elif 'small' in line:
                features['farm'] = 'small'
                continue
            elif 'no land' in line:
                features['farm'] = 'no land'
                continue
        if 'Location of the house' in line:
            features['Location'] = line.split(': ')[-1].strip()
            continue
        if 'Holy tree'in line:
            if 'stands' in line:
                features['Holy tree'] = 1
                continue
            elif 'cut' in line:
                features['Holy tree'] = -1
                continue
        if 'Distance from Knight\'s house is ' in line:
            features['distance from Knight house'] = float(line.replace('Distance from Knight\'s house is ','').replace(' holy lights',''))
            continue
        if 'renovation' in line:
            if 'did not' in line:
                features['renovation'] = 0
                continue
            elif 'underwent' in line:
                features['renovation'] = 1
                continue
    return features

In [12]:
def extract_data(filename):
    text = open(filename,'r').read()
    data = pd.DataFrame()
    count = 0
    houses = text.split('\n\n')
    for house_details in houses:
        features = extract_features(house_details.split('\n'))
        for key in features.keys():
            data.loc[count,key] = features[key]
        count += 1
    return data     

In [13]:
def getAllData(filenames):
    allData = pd.DataFrame()
    for filename in filenames:
        print filename
        data = extract_data(filename)
        allData = pd.concat([allData,data])
    return allData    

In [None]:
filenames = ['The-Kings.txt','The-Greens.txt','The-Lannisters.txt','Bright-Brothers.txt',
             'Masters-of-Stones.txt','The-Ollivers.txt','The-Overlords.txt','The-Starks.txt',
             'Wood-Priests.txt','Bob.txt','Not-Known.txt']
print "This will take around 10 minutes or so"
allData = getAllData(filenames)

This will take around 10 minutes or so
The-Kings.txt
The-Greens.txt
The-Lannisters.txt
Bright-Brothers.txt
Masters-of-Stones.txt
The-Ollivers.txt
The-Overlords.txt
The-Starks.txt


In [9]:
allData.head()
allData.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 20000 entries, 0 to 1392
Data columns (total 19 columns):
Date Built                      20000 non-null object
Date Priced                     20000 non-null object
Distance from Capital           18985 non-null float64
Distance from Guarding Tower    19366 non-null float64
Distance from Knight's house    18985 non-null float64
Distance from Royal Market      17175 non-null float64
Distance from the Dock          17980 non-null float64
Distance from the River         20000 non-null float64
Holy tree                       16030 non-null object
House ID                        20000 non-null object
King visited                    20000 non-null float64
Location                        17963 non-null object
bathrooms                       18834 non-null float64
bedrooms                        19225 non-null float64
blessings                       20000 non-null float64
cursed                          15960 non-null float64
dining rooms     

In [7]:
allData.to_csv('allData.csv',index=False)