# Using pretrained model to predict number of damaged houses by Haima



In [22]:
%pylab inline
import cPickle
from sklearn import metrics
from sklearn.ensemble import  RandomForestRegressor
import pandas as pd
import seaborn


Populating the interactive namespace from numpy and matplotlib


# Steps

#### 1. Load census and weather data, manipulate data as required by the model
        * fill empty poverty data with mean value
        * calculate number of poor people per each municipality
#### 2. Load model 
#### 3. Use model to predict number of houses damaged
#### 4. Calculate priorites based on predictions

## Load data

In [12]:
T = pd.read_csv("./Matrix-Typhoon-Haima.csv")
T.columns


Index([u'ID', u'OBJECTID', u'ISO', u'P_Code', u'M_Code', u'Municipality',
       u'Area (km2)', u'Avg. Elevation (m)', u'Perimeter (m)',
       u'Coastline length (m)', u'C/P Ratio',
       u'Distance to coastline (m. mean)', u'Distance to coastline (m. stdev)',
       u'Ruggedness index (mean)', u'Ruggedness index (stdev)',
       u'Slope (mean)', u'Slope (stdev)', u'Landuse (Most common)',
       u'Population 2010 census', u'Population 2013 est.',
       u'Population 2015 census', u'Pop. Density 2015 per km2', u'Poverty (%)',
       u'Avg. Windspeed (km/h)', u'Distance from typhoon path (km)',
       u'Area Flooded (%)', u'Rainfallme', u'Surge Height int. (m)',
       u'Surge risk class (REACH)', u'People affected',
       u'People affected (% 2010)', u'Houses damaged', u'rain_oct18-20',
       u'rain oct11-18'],
      dtype='object')

In [13]:
T.head(3)

Unnamed: 0,ID,OBJECTID,ISO,P_Code,M_Code,Municipality,Area (km2),Avg. Elevation (m),Perimeter (m),Coastline length (m),...,Distance from typhoon path (km),Area Flooded (%),Rainfallme,Surge Height int. (m),Surge risk class (REACH),People affected,People affected (% 2010),Houses damaged,rain_oct18-20,rain oct11-18
0,,1,PHL,PH010000000,PH012801000,ADAMS,111.185,,,,...,31.472899,,340.382953,,,,,,258.883747,81.499207
1,,2,PHL,PH010000000,PH012802000,BACARRA,55.346,,,,...,0.347623,,280.685093,,,,,,189.430589,91.254504
2,,3,PHL,PH010000000,PH012803000,BADOC,80.758,,,,...,41.425407,,379.571313,,,,,,268.170693,111.40062


#### Add estimates of number of poor people

In [14]:

T.index = T['M_Code']
T[['Poverty (%)']] = T[['Poverty (%)']].fillna(T[['Poverty (%)']].mean())
T['Poor'] = T['Poverty (%)'] * T['Population 2015 census']


#### Fill nans with mean Rainfallme



In [18]:
T[['Rainfallme']] = T[['Rainfallme']].fillna(T[['Rainfallme']].mean())

#### Select only data that were used in the model

In [15]:
X = T[[u'Area (km2)', u'Population 2015 census',
       u'Pop. Density 2015 per km2', u'Poverty (%)', u'Rainfallme',  u'Avg. Windspeed (km/h)', u'Distance from typhoon path (km)', 'Poor']]


print X.shape

(466, 8)


## Load and use model

In [35]:
with open('Model_Hayian_Melor_Hagupit_Ramasun', 'rb') as f:
    rf = cPickle.load(f)
    
    ### Use model for prediction
    print str(rf)
    predicted = rf.predict(X)
    
    X['Predicted Houses damaged'] = predicted

    


RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='log2', max_leaf_nodes=None, min_samples_leaf=10,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=100, n_jobs=1, oob_score=False, random_state=None,
           verbose=0, warm_start=False)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [36]:
X.head()

Unnamed: 0_level_0,Area (km2),Population 2015 census,Pop. Density 2015 per km2,Poverty (%),Rainfallme,Avg. Windspeed (km/h),Distance from typhoon path (km),Poor,Predicted Houses damaged
M_Code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
PH012801000,111.185,1792,16.117282,10.100446,340.382953,120.0,31.472899,18100.000003,1600.720732
PH012802000,55.346,32215,582.065551,2.070464,280.685093,110.0,0.347623,66700.000015,2274.781759
PH012803000,80.758,31616,391.490626,4.991144,379.571313,90.017524,41.425407,157800.00001,1203.482598
PH012804000,115.126,14672,127.442975,3.748637,369.103536,121.306362,28.007901,54999.999995,1670.510556
PH012805000,158.252,55201,348.817077,3.224579,315.54523,99.47106,24.868173,178000.000007,2688.947292


#### Describe predictions

In [43]:
X['Predicted Houses damaged'].describe()

count     466.000000
mean      602.413910
std       868.439794
min        81.717919
25%       146.317959
50%       246.742985
75%       552.079075
max      6433.620323
Name: Predicted Houses damaged, dtype: float64

## Calculate Priority Index based on predicted numbers of houses damaged

In [56]:
# percentiles [0, 0.2, 0.4, 0.6, 0.8]

percentiles = [0, 0.35, 0.65, 0.85, 0.95]
thresholds = []



In [57]:
for i in range(5): 
    thresholds.append(X['Predicted Houses damaged'].quantile(percentiles[i])) 
print 'thresholds', [int(t) for t in thresholds]
print 'percentiles', [str(int(p*100)) + '%' for p in percentiles]

thresholds [81, 183, 345, 1205, 2498]
percentiles ['0%', '35%', '65%', '85%', '95%']


In [58]:
def prioritize(v): 
    for i in range(len(thresholds))[::-1]: 
        if  v> thresholds[i]:
            return i+1
    return i+1

In [59]:
X['Predicted priority'] = X['Predicted Houses damaged'].map(prioritize)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


#### Number of municipalities per priority

In [60]:
X['Predicted priority'].value_counts()

1    163
2    140
3     93
4     46
5     24
Name: Predicted priority, dtype: int64

## Save predictions

In [61]:
X.to_csv("./Matrix-Typhoon-Haima-with-predictions.csv")

In [62]:
X

Unnamed: 0_level_0,Area (km2),Population 2015 census,Pop. Density 2015 per km2,Poverty (%),Rainfallme,Avg. Windspeed (km/h),Distance from typhoon path (km),Poor,Predicted Houses damaged,Predicted priority
M_Code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
PH012801000,111.185,1792,16.117282,10.100446,340.382953,120.000000,31.472899,18100.000003,1600.720732,4
PH012802000,55.346,32215,582.065551,2.070464,280.685093,110.000000,0.347623,66700.000015,2274.781759,4
PH012803000,80.758,31616,391.490626,4.991144,379.571313,90.017524,41.425407,157800.000010,1203.482598,3
PH012804000,115.126,14672,127.442975,3.748637,369.103536,121.306362,28.007901,54999.999995,1670.510556,4
PH012805000,158.252,55201,348.817077,3.224579,315.545230,99.471060,24.868173,178000.000007,2688.947292,5
PH012806000,137.204,9777,71.258855,3.917357,385.750643,123.475974,22.779199,38299.999995,1711.632771,4
PH012807000,173.135,1567,9.050741,3.956605,356.545834,113.323651,5.382585,6200.000001,2111.257535,4
PH012808000,33.405,12184,364.735818,3.373276,357.625271,98.309316,31.197809,41099.999999,1249.474271,4
PH012809000,108.434,38562,355.626464,4.377366,319.558988,102.556499,15.163659,168799.999993,2964.250000,5
PH012810000,66.780,2947,44.129979,2.409230,356.303761,120.000000,30.422022,7100.000000,1527.236620,4
