In [14]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# scikit-learn deprecated its bundled copy of joblib in 0.21 and removed it in
# 0.23, so prefer the standalone package and fall back only where it is missing.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

In [27]:
# Load the pipe-delimited training file and preview its first five rows.
DATA_FILE = "Modelar_UH2020.txt"
df = pd.read_csv(DATA_FILE, delimiter="|")
df.head(5)

Unnamed: 0,ID,X,Y,Q_R_4_0_0,Q_R_4_0_1,Q_R_4_0_2,Q_R_4_0_3,Q_R_4_0_4,Q_R_4_0_5,Q_R_4_0_6,...,Q_NIR_8_1_0,AREA,GEOM_R1,GEOM_R2,GEOM_R3,GEOM_R4,CONTRUCTIONYEAR,MAXBUILDINGFLOOR,CADASTRALQUALITYID,CLASE
0,35984B9C3E7CD9A1,2207357872,165920300,0.0,443.0013,616.001697,746.998401,872.996472,1009.000946,1159.002319,...,10951.926645,144.4269,0.557237,0.067249,0.057372,0.853127,2002,0.0,2,RESIDENTIAL
1,F9D04BF6D037F8FB,2189757160,165463267,5.9e-05,443.899011,627.99906,770.001611,904.999988,1032.998474,1165.001636,...,7048.367637,38.34255,0.709884,0.125156,0.147929,1.181953,1949,1.0,8,RESIDENTIAL
2,B89D5711AFF8C423,2240147335,165690752,0.0,353.502274,523.003601,644.001831,760.997131,876.999634,1006.997498,...,7013.073271,108.794384,0.517702,0.058268,0.081666,1.401552,1986,1.0,5,RESIDENTIAL
3,1C3478AC1522E7E4,2227146459,165934099,0.0,268.000613,376.999609,478.003784,575.001233,683.997742,809.005994,...,6216.880538,155.224455,0.450871,0.053591,0.054201,1.011382,1999,1.0,2,RESIDENTIAL
4,4D12AA5009064345,2212350459,165681791,0.0,318.99791,492.003845,632.999634,757.002197,882.999908,1019.008911,...,7092.767616,1789.873366,0.458819,0.012858,0.019936,1.550478,1966,8.0,6,RESIDENTIAL


In [28]:
# NOTE: this cell consumes the CADASTRALQUALITYID column it filters on, so it
# can only be run once per load of `df`; re-run the loading cell first.

# Discard incomplete rows, then actually verify that no missing value is left
# (the original computed this check mid-cell, where its result was discarded).
df = df.dropna()
assert not df.isnull().values.any(), "missing values remain after dropna()"

# Remove every row whose CADASTRALQUALITYID is "A", "B" or "C" in a single pass.
df = df[~df["CADASTRALQUALITYID"].isin(["A", "B", "C"])]

# Drop the two columns by name; they are not kept in the modelling frame.
df = df.drop(columns=["ID", "CADASTRALQUALITYID"])

In [29]:
# Integer-encode the target column. `factor` is the (codes, uniques) pair
# returned by pandas: the codes replace CLASE, the uniques are kept for decoding.
factor = pd.factorize(df["CLASE"])
class_codes, definitions = factor
df["CLASE"] = class_codes

In [30]:
# Show the class labels in encoding order: the label at position i is the one
# stored as integer i in the CLASE column.
definitions

Index(['RESIDENTIAL', 'INDUSTRIAL', 'PUBLIC', 'OFFICE', 'OTHER', 'RETAIL',
       'AGRICULTURE'],
      dtype='object')

In [31]:
# Split the frame into features and target by column name instead of by
# position, so the split no longer depends on a hard-coded column count (53)
# or on CLASE happening to be the last column.
# NOTE(review): assumes CLASE is the only non-feature column left in `df`, in
# which case this selects exactly the columns the positional slice did — confirm.
X = df.drop(columns="CLASE")
Y = df["CLASE"]

In [32]:
# Hold out 30% of the rows for evaluation; the fixed seed makes the split repeatable.
split_options = dict(test_size=0.3, random_state=21)
X_train, X_test, y_train, y_test = train_test_split(X, Y, **split_options)

In [33]:
# Fit the scaler on the training rows only, then apply that same fitted
# transform to both splits so the test rows never influence the scaling.
scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

In [34]:
# Train a 10-tree random forest that splits on entropy; the seed pins the result.
forest_options = dict(n_estimators=10, criterion='entropy', random_state=42)
classifier = RandomForestClassifier(**forest_options).fit(X_train, y_train)
classifier  # fit() returns the estimator; display it as the cell output

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='entropy', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=42, verbose=0,
                       warm_start=False)

In [35]:
# Predict the integer class codes for the held-out rows.
y_pred = classifier.predict(X_test)

# Reverse the factorization: map every integer code back to its class label.
# The lookup is built from `definitions` itself, so it covers however many
# classes were encoded instead of assuming there are exactly 7.
reversefactor = dict(enumerate(definitions))
decode_labels = np.vectorize(reversefactor.get)

# NOTE: y_test is overwritten with label strings here, so this cell must not be
# re-run without re-running the train/test split first (the integer lookup
# would no longer match the already-decoded values).
y_test = decode_labels(y_test)
y_pred = decode_labels(y_pred)

# Confusion table: one row per actual class, one column per predicted class.
print(pd.crosstab(y_test, y_pred, rownames=['Actual class'], colnames=['Predicted class']))

Predicted Species  AGRICULTURE  INDUSTRIAL  OFFICE  OTHER  PUBLIC  \
Actual Species                                                      
AGRICULTURE                 36          18       0      0       4   
INDUSTRIAL                   5         802      48      6      22   
OFFICE                       1          92      70      7      20   
OTHER                        0          13       5    134      37   
PUBLIC                       4          40      10     30     169   
RESIDENTIAL                  7          71      21     26      87   
RETAIL                       0          27      15      6      21   

Predicted Species  RESIDENTIAL  RETAIL  
Actual Species                          
AGRICULTURE                 36       1  
INDUSTRIAL                 421      24  
OFFICE                     357       7  
OTHER                      200       5  
PUBLIC                     648       9  
RESIDENTIAL              26810      22  
RETAIL                     388      93  


In [37]:
# Pair each feature name with its importance. `X` is the frame the training
# split was taken from, so its columns are the model's inputs in order; this
# replaces the hard-coded `df.columns[0:53]` slice.
print(list(zip(X.columns, classifier.feature_importances_)))

# The forest was fitted on standardized inputs, so the fitted scaler is saved
# alongside it; without it the persisted model cannot be applied to raw data.
joblib.dump(scaler, 'standardscaler.pkl')
joblib.dump(classifier, 'randomforestmodel.pkl')

[('X', 0.0676523011242738), ('Y', 0.06853241019778487), ('Q_R_4_0_0', 0.002910936924103203), ('Q_R_4_0_1', 0.007348849816646756), ('Q_R_4_0_2', 0.004781528109852162), ('Q_R_4_0_3', 0.0036584753346321254), ('Q_R_4_0_4', 0.0036651938889058827), ('Q_R_4_0_5', 0.005447816768987527), ('Q_R_4_0_6', 0.004790999814773608), ('Q_R_4_0_7', 0.009965282864497391), ('Q_R_4_0_8', 0.0030999796025745416), ('Q_R_4_0_9', 0.004212523811167199), ('Q_R_4_1_0', 0.006320096415437321), ('Q_G_3_0_0', 0.00456248997623309), ('Q_G_3_0_1', 0.008156368778161), ('Q_G_3_0_2', 0.004593490190209813), ('Q_G_3_0_3', 0.004835060423768051), ('Q_G_3_0_4', 0.003521806965108736), ('Q_G_3_0_5', 0.00471725595898491), ('Q_G_3_0_6', 0.0039047655533514785), ('Q_G_3_0_7', 0.006556269621087479), ('Q_G_3_0_8', 0.008041484016560603), ('Q_G_3_0_9', 0.019533543215892138), ('Q_G_3_1_0', 0.005116825911988686), ('Q_B_2_0_0', 0.00485848049416268), ('Q_B_2_0_1', 0.0043176433222884576), ('Q_B_2_0_2', 0.004825905469184824), ('Q_B_2_0_3', 0.0065

['randomforestmodel.pkl']