In [1]:
from pymongo.mongo_client import MongoClient
import os
from dotenv import load_dotenv
import sys

In [2]:
# Move the working directory up one level so that the relative paths used
# later in this notebook (e.g. "data/covtype.csv") resolve.
# Not idempotent: each re-run of this cell moves up one more directory.
os.chdir("..")

In [3]:
# Load variables from a local .env file into the process environment, then
# look up the MongoDB connection string.
load_dotenv()
uri = os.getenv("MONGO_DB_LINK")
# os.getenv returns None for a missing variable, and MongoClient(None) falls
# back to localhost:27017 instead of failing, so stop here with a clear error
# rather than silently talking to the wrong server.
if not uri:
    raise RuntimeError(
        "MONGO_DB_LINK is not set; define it in the environment or in a .env file."
    )

In [4]:
# Build the client from the connection string resolved earlier.
client = MongoClient(uri)
# Issue a "ping" against the admin database to check that the deployment is
# reachable. Any failure is printed rather than re-raised, so the notebook
# keeps running either way.
try:
    client["admin"].command("ping")
    print("Pinged your deployment. You successfully connected to MongoDB!")
except Exception as ping_error:
    print(ping_error)

Pinged your deployment. You successfully connected to MongoDB!


In [5]:
# Handle to the "forest_db" database on the connected deployment.
db = client["forest_db"]

In [6]:
# Handle to the collection that stores the forest-cover records.
forest_collection = db.forest_collection

In [7]:
import pandas as pd

In [48]:
# Read the cover-type CSV; the path is relative to the current working directory.
df = pd.read_csv(filepath_or_buffer="data/covtype.csv")

In [None]:
# Dimensions of the raw frame as (rows, columns).
df.shape

In [9]:
# Preview the first five rows of the raw frame.
df.head()

Unnamed: 0,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,Horizontal_Distance_To_Fire_Points,...,Soil_Type32,Soil_Type33,Soil_Type34,Soil_Type35,Soil_Type36,Soil_Type37,Soil_Type38,Soil_Type39,Soil_Type40,Cover_Type
0,2596,51,3,258,0,510,221,232,148,6279,...,0,0,0,0,0,0,0,0,0,5
1,2590,56,2,212,-6,390,220,235,151,6225,...,0,0,0,0,0,0,0,0,0,5
2,2804,139,9,268,65,3180,234,238,135,6121,...,0,0,0,0,0,0,0,0,0,2
3,2785,155,18,242,118,3090,238,238,122,6211,...,0,0,0,0,0,0,0,0,0,2
4,2595,45,2,153,-1,391,220,234,150,6172,...,0,0,0,0,0,0,0,0,0,5


In [None]:
# Upload the raw frame to MongoDB, one document per row, keeping every
# original column as a field.
# NOTE(review): the reduced frame is uploaded to this same collection further
# down; running both cells stores two differently shaped copies of the data.
raw_records = df.to_dict(orient="records")
forest_collection.insert_many(raw_records)

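Reduce the number of columns in the dataframe so that it has only 12 features: the four one-hot `Wilderness_Area*` columns and the forty one-hot `Soil_Type*` columns are each collapsed into a single integer-coded column.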

In [53]:
# Select the one-hot wilderness columns by name instead of by position, so the
# cell does not depend on the CSV's column order. The derived
# "Wilderness_Area" column has no numeric suffix and is never matched.
wild_columns = df.filter(regex=r"^Wilderness_Area\d+$").columns
wild_flags = df[wild_columns].eq(1)
# idxmax returns the first column for a row in which no flag is set, which
# would silently mislabel it; require exactly one flag per row instead.
assert wild_flags.sum(axis=1).eq(1).all(), (
    "every row must have exactly one Wilderness_Area flag set"
)
# For each row, store the name of the flag column that is set.
df["Wilderness_Area"] = wild_flags.idxmax(axis=1)
wild_columns

Index(['Wilderness_Area1', 'Wilderness_Area2', 'Wilderness_Area3',
       'Wilderness_Area4'],
      dtype='object')

In [54]:
# Select the one-hot soil columns by name instead of by position, so the cell
# does not depend on the CSV's column order or on whether other columns have
# already been dropped. The derived "Soil_Type" column has no numeric suffix
# and is never matched.
soil_column = df.filter(regex=r"^Soil_Type\d+$").columns
soil_flags = df[soil_column].eq(1)
# idxmax returns the first column for a row in which no flag is set, which
# would silently mislabel it; require exactly one flag per row instead.
assert soil_flags.sum(axis=1).eq(1).all(), (
    "every row must have exactly one Soil_Type flag set"
)
# For each row, store the name of the flag column that is set.
df["Soil_Type"] = soil_flags.idxmax(axis=1)
soil_column

Index(['Soil_Type1', 'Soil_Type2', 'Soil_Type3', 'Soil_Type4', 'Soil_Type5',
       'Soil_Type6', 'Soil_Type7', 'Soil_Type8', 'Soil_Type9', 'Soil_Type10',
       'Soil_Type11', 'Soil_Type12', 'Soil_Type13', 'Soil_Type14',
       'Soil_Type15', 'Soil_Type16', 'Soil_Type17', 'Soil_Type18',
       'Soil_Type19', 'Soil_Type20', 'Soil_Type21', 'Soil_Type22',
       'Soil_Type23', 'Soil_Type24', 'Soil_Type25', 'Soil_Type26',
       'Soil_Type27', 'Soil_Type28', 'Soil_Type29', 'Soil_Type30',
       'Soil_Type31', 'Soil_Type32', 'Soil_Type33', 'Soil_Type34',
       'Soil_Type35', 'Soil_Type36', 'Soil_Type37', 'Soil_Type38',
       'Soil_Type39', 'Soil_Type40'],
      dtype='object')

In [55]:
# Convert names such as "Wilderness_Area3" to the integer 3.
# The whole run of trailing digits is parsed (not just the last character),
# matching how the Soil_Type names are converted, so a multi-digit suffix
# would be read correctly too.
df['Wilderness_Area'] = df['Wilderness_Area'].apply(
    lambda name: int(name[len(name.rstrip("0123456789")):])
)

In [56]:
# Distinct wilderness-area codes present in the column after the integer conversion.
df["Wilderness_Area"].unique()

array([1, 3, 4, 2])

In [57]:
# The one-hot wilderness columns are now redundant with the single
# "Wilderness_Area" code column, so remove them.
df = df.drop(columns=wild_columns)

In [59]:
def get_end_number(soil_type: str):
    """Return the integer formed by the run of ASCII digits ending `soil_type`.

    For example, "Soil_Type12" gives 12. A string that does not end in a
    digit leaves an empty suffix, so `int` raises ValueError.
    """
    cut = len(soil_type)
    # Walk back from the end of the string while the characters are digits.
    while cut > 0 and soil_type[cut - 1] in "1234567890":
        cut -= 1
    return int(soil_type[cut:])

In [60]:
# Convert names such as "Soil_Type12" to their integer suffix (12).
soil_codes = df['Soil_Type'].apply(get_end_number)
df['Soil_Type'] = soil_codes

In [65]:
# The one-hot soil columns are now redundant with the single "Soil_Type"
# code column, so remove them.
df = df.drop(columns=soil_column)

In [66]:
# Dimensions after the one-hot columns were collapsed (the recorded run shows 13 columns).
df.shape

(581012, 13)

In [67]:
# Upload the reduced frame to MongoDB, one document per row.
# Re-running this cell inserts the same rows again.
reduced_records = df.to_dict(orient="records")
forest_collection.insert_many(reduced_records)

<pymongo.results.InsertManyResult at 0x2ff068430>

In [74]:
# Load every document of the collection from MongoDB into a DataFrame.
# The projection {"_id": 0} makes the server omit its generated ObjectId
# field, so no '_id' column is created. Dropping that column after the fact
# raises KeyError when the collection is empty, because the resulting frame
# has no columns at all.
data_from_db = forest_collection.find({}, {"_id": 0})
df_loaded = pd.DataFrame(list(data_from_db))

print(df_loaded.columns)


Index(['Elevation', 'Aspect', 'Slope', 'Horizontal_Distance_To_Hydrology',
       'Vertical_Distance_To_Hydrology', 'Horizontal_Distance_To_Roadways',
       'Hillshade_9am', 'Hillshade_Noon', 'Hillshade_3pm',
       'Horizontal_Distance_To_Fire_Points', 'Cover_Type', 'Wilderness_Area',
       'Soil_Type'],
      dtype='object')


In [75]:
# Expand the integer-coded columns back into 0/1 indicator columns. The empty
# prefix_sep yields names such as "Soil_Type7" and "Wilderness_Area2".
one_hot_soil = pd.get_dummies(
    df_loaded['Soil_Type'], prefix='Soil_Type', prefix_sep=''
).astype(int)
one_hot_wild = pd.get_dummies(
    df_loaded['Wilderness_Area'], prefix='Wilderness_Area', prefix_sep=''
).astype(int)

# Swap the two code columns for their indicator columns: soil first, then
# wilderness, both appended after the remaining columns.
df_loaded = df_loaded.drop(columns=['Wilderness_Area', 'Soil_Type'])
for encoded in (one_hot_soil, one_hot_wild):
    df_loaded[encoded.columns] = encoded

In [76]:
# List the columns after Soil_Type and Wilderness_Area were re-expanded into indicator columns.
df_loaded.columns

Index(['Elevation', 'Aspect', 'Slope', 'Horizontal_Distance_To_Hydrology',
       'Vertical_Distance_To_Hydrology', 'Horizontal_Distance_To_Roadways',
       'Hillshade_9am', 'Hillshade_Noon', 'Hillshade_3pm',
       'Horizontal_Distance_To_Fire_Points', 'Cover_Type', 'Soil_Type1',
       'Soil_Type2', 'Soil_Type3', 'Soil_Type4', 'Soil_Type5', 'Soil_Type6',
       'Soil_Type7', 'Soil_Type8', 'Soil_Type9', 'Soil_Type10', 'Soil_Type11',
       'Soil_Type12', 'Soil_Type13', 'Soil_Type14', 'Soil_Type15',
       'Soil_Type16', 'Soil_Type17', 'Soil_Type18', 'Soil_Type19',
       'Soil_Type20', 'Soil_Type21', 'Soil_Type22', 'Soil_Type23',
       'Soil_Type24', 'Soil_Type25', 'Soil_Type26', 'Soil_Type27',
       'Soil_Type28', 'Soil_Type29', 'Soil_Type30', 'Soil_Type31',
       'Soil_Type32', 'Soil_Type33', 'Soil_Type34', 'Soil_Type35',
       'Soil_Type36', 'Soil_Type37', 'Soil_Type38', 'Soil_Type39',
       'Soil_Type40', 'Wilderness_Area1', 'Wilderness_Area2',
       'Wilderness_Area3', 'Wi

In [2]:
import os

In [3]:
# Move the working directory up one level before the `src...` import that follows.
# Not idempotent: each re-run of this cell moves up one more directory.
os.chdir("..")

In [4]:
# NOTE(review): the recorded output of this cell is
# "ModuleNotFoundError: No module named 'forestCover'". Presumably code under
# src/ imports the package by its top-level name `forestCover`, which would
# need src/ on sys.path or the package installed (e.g. `pip install -e .`).
# TODO confirm against the project layout.
from src.forestCover.data_access.forest_data import ForestData

data = ForestData()
# Presumably returns the MongoDB collection as a DataFrame (inferred from the
# method name only; verify in ForestData).
df = data.export_collection_as_df()
df.columns

ModuleNotFoundError: No module named 'forestCover'