In [1]:
import numpy as np
import matplotlib.pyplot as plt 
import pandas as pd
import seaborn as sns

%matplotlib inline

ModuleNotFoundError: No module named 'matplotlib'

In [None]:
# Column labels for the raw file, in file order: the ten named measurement
# columns, one indicator column per wilderness area, one indicator column per
# soil type and, last, the target column.
wilderness_areas = [1,2,3,4]
soil_types = [2702,2703,2704,2705,2706,2717,3501,3502,4201,4703,4704,4744,4758,5101,5151,6101,6102,6731,7101,7102,7103,7201,7202,7700,7701,7702,7709,7710,7745,7746,7755,7756,7757,7790,8703,8707,8708,8771,8772,8776]

measurement_columns = [
    'Elevation',
    'Aspect',
    'Slope',
    'Horizontal_Distance_To_Hydrology',
    'Vertical_Distance_To_Hydrology',
    'Horizontal_Distance_To_Roadways',
    'Hillshade_9am',
    'Hillshade_Noon',
    'Hillshade_3pm',
    'Horizontal_Distance_To_Fire_Points',
]

# Indicator columns keep their integer codes as labels, so `names` mixes str and int.
names = measurement_columns + wilderness_areas + soil_types + ['Cover_Type']

In [None]:
#Read the Data set
data_set = pd.read_csv('covtype.data', names=names)


def dummies_to_category(frame, dummy_columns):
    """Collapse a group of one-hot indicator columns into one label per row.

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame that holds the indicator columns.
    dummy_columns : list
        Labels of the indicator columns that encode the same category.

    Returns
    -------
    pandas.Series
        Indexed like ``frame``; each entry is the label of the column whose
        value equals 1 in that row.

    Raises
    ------
    ValueError
        If any row does not have exactly one indicator equal to 1.
    """
    is_active = frame[dummy_columns].eq(1)
    # Fail loudly instead of silently mis-assigning labels: a row with zero or
    # several active indicators has no single category.
    if not is_active.sum(axis=1).eq(1).all():
        raise ValueError('Every row must have exactly one indicator equal to 1 among %r'
                         % (list(dummy_columns),))
    # Row-wise idxmax returns the label of the (single) True column and keeps
    # the result aligned on the row index, so no positional re-alignment is needed.
    return is_active.idxmax(axis=1)


#Get dummies back to categorical
data_set['Wilderness_Area'] = dummies_to_category(data_set, wilderness_areas)
data_set['Soil_Type'] = dummies_to_category(data_set, soil_types)

#The indicator columns are redundant once the categorical columns exist
data_set = data_set.drop(columns=wilderness_areas + soil_types)

In [None]:
#Elevation exploration: one violin of Elevation per Cover_Type
sns.violinplot(
    data=data_set,
    x='Cover_Type',
    y='Elevation',
    inner="quart",
)

In [None]:
#The violin plot suggests three groups of cover types: {1,2,5}, {3,4,6} and {7}.
#Elevation will be categorized along those three groups, so first get the
#descriptive statistics of Elevation inside each group to derive the intervals.

groups = {'g1': [1,2,5], 'g2': [3,4,6], 'g3': [7]}
groups_stats = {
    g_name: data_set.loc[data_set['Cover_Type'].isin(g_vals), 'Elevation'].describe()
    for g_name, g_vals in groups.items()
}

for g_name, g_stats in groups_stats.items():
    print(g_name)
    display(g_stats)

In [None]:
#Elevation ranges from 1859 m to 3858 m. The maximum belongs to group 3, but the violin plot
#(and the std, mean and 75th percentile) show that few of its samples lie above the 75th percentile.
#The upper bound is therefore set to the group 3 mean plus 2 standard deviations
#rather than to the observed maximum (3858).

min_elev_value = groups_stats['g2']['min']
max_elev_value = groups_stats['g3']['mean'] + groups_stats['g3']['std']*2
interval_size = (max_elev_value - min_elev_value)/3

#Four integer-truncated edges give three contiguous (lower, upper) intervals keyed 0, 1, 2
elev_edges = [int(min_elev_value+(interval_size)*step) for step in range(4)]
elev_intervals = dict(enumerate(zip(elev_edges[:-1], elev_edges[1:])))

In [None]:
# test = data_set
# test['elev_cat_test'] = pd.qcut(data_set['Elevation'], 4)
# test.groupby(['elev_cat_test', 'Cover_Type']).mean().dropna()

In [None]:
#With this mapping 1100 samples end up in category 3, which is treated as the outlier category.

def map_elevation(val):
    """Return the key of the elevation interval that contains ``val``.

    Intervals come from the module-level ``elev_intervals`` dict and are
    half-open: the lower bound is inclusive, the upper bound exclusive.
    A value that falls in none of them is mapped to 3.
    """
    for category, (lower, upper) in elev_intervals.items():
        if lower <= val < upper:
            return category
    return 3
        
#Tag every row with the category returned by map_elevation for its Elevation
data_set['Elevation_Cat'] = data_set['Elevation'].map(map_elevation)

In [None]:
#Each Wilderness_Area has some typical classes, but more importantly the elevation category
#helps to differentiate those typical classes.
#There are still some hard cases to classify. For example Cover_Type 5 has around 60% of its
#samples in Area 3, Elevation 1, yet Cover 2 will get predicted there a lot, which means another
#variable is needed to explain the presence of cover 5.

area_elevation_by_cover = pd.crosstab(
    index=[data_set['Wilderness_Area'], data_set['Elevation_Cat']],
    columns=data_set['Cover_Type'],
    margins=True,
)
area_elevation_by_cover.style.background_gradient(cmap='summer_r')

In [None]:
#Annotated heatmap of the pairwise correlations between the columns of data_set
correlations = data_set.corr()
sns.heatmap(correlations, annot=True, linewidths=0.2, cmap='RdYlGn')

#Enlarge the figure that heatmap drew on before showing it
fig = plt.gcf()
fig.set_size_inches(10, 8)
plt.show()

In [None]:
#Straight-line distance to hydrology from its horizontal and vertical components
horizontal_to_hydrology = data_set['Horizontal_Distance_To_Hydrology']
vertical_to_hydrology = data_set['Vertical_Distance_To_Hydrology']
data_set['Total_Distance_To_Hydrology'] = np.sqrt(horizontal_to_hydrology ** 2 + vertical_to_hydrology ** 2)

#Bin the total distance into quartiles
data_set['Total_Distance_To_Hydrology_Cat'] = pd.qcut(data_set['Total_Distance_To_Hydrology'], q=4)

In [None]:
#Sum of the three hillshade readings; on its own it does not tell us much about the cover type
data_set['Total_Hillshade_Day'] = (
    data_set['Hillshade_3pm']
    .add(data_set['Hillshade_9am'])
    .add(data_set['Hillshade_Noon'])
)
sns.violinplot(
    data=data_set,
    x='Cover_Type',
    y='Total_Hillshade_Day',
    inner="quart",
)

In [None]:
#The shapes are very similar, but each Cover Type has some points of its own, which could mean
#a species is more likely to survive in a specific configuration.
sns.lmplot(
    data=data_set,
    x="Slope",
    y="Total_Hillshade_Day",
    hue="Cover_Type",
    col='Cover_Type',
)

In [None]:
#Total distance to hydrology against horizontal distance to roadways, one panel per Cover_Type
sns.lmplot(
    data=data_set,
    x="Horizontal_Distance_To_Roadways",
    y="Total_Distance_To_Hydrology",
    hue="Cover_Type",
    col='Cover_Type',
)
