In [None]:
import os
import sys
from urllib.parse import quote_plus

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pymongo
import seaborn as sns
from sklearn.preprocessing import LabelEncoder

# Put the parent directory at the front of the import path (at most once) so
# modules located one level above the working directory can be imported.
_parent_dir = os.path.abspath('../')
if _parent_dir not in sys.path:
    sys.path.insert(0, _parent_dir)

In [None]:
def connect_to_mongodb(database_username, database_password, database_cluster_name, database_name):
    """Connect to a MongoDB deployment through a ``mongodb+srv://`` URI.

    Args:
        database_username: Raw (not percent-escaped) user name.
        database_password: Raw (not percent-escaped) password.
        database_cluster_name: Host part placed after the ``@`` in the URI.
        database_name: Name of the database to return.

    Returns:
        The ``client[database_name]`` database handle.

    Raises:
        Any exception raised by ``pymongo.MongoClient`` or by
        ``client.server_info()`` (for example when the server cannot be
        reached or authentication fails) propagates to the caller.
    """
    # Credentials must be percent-escaped before being embedded in the URI;
    # otherwise characters such as '@', ':', '/' or '%' corrupt it.
    escaped_username = quote_plus(str(database_username))
    escaped_password = quote_plus(str(database_password))

    connection_string = (
        f"mongodb+srv://{escaped_username}:{escaped_password}"
        f"@{database_cluster_name}/?retryWrites=true&w=majority"
    )
    client = pymongo.MongoClient(connection_string)

    # server_info() contacts the server and raises on failure, so reaching
    # the next line means the connection works.
    client.server_info()
    print("Connected to MongoDB successfully!")

    return client[database_name]

def fetch_data_from_mongodb(db, collection_name):
    """Load every document of one collection into a DataFrame.

    Args:
        db: Object that yields a collection when indexed by name.
        collection_name: Name of the collection to read.

    Returns:
        A ``pandas.DataFrame`` built from all documents returned by
        ``find({})``, one row per document.
    """
    # An empty filter matches every document in the collection.
    documents = [document for document in db[collection_name].find({})]
    return pd.DataFrame(documents)


In [None]:
def clean_data(vndata):
    """Return a de-duplicated copy of ``vndata`` with normalised column labels.

    Steps, in order: keep the first row for each repeated index label, keep
    the first column for each repeated column label, drop the ``_id`` column,
    rename the known source headers to their short names, and upper-case
    every column label.

    Args:
        vndata: DataFrame with string column labels that include ``_id``.

    Returns:
        A new DataFrame; the frame passed in is not modified.

    Raises:
        KeyError: If there is no ``_id`` column to drop.
    """
    short_names = {
        'AREA OF LAND (Thous. ha)': 'AREA OF LAND',
        'POPULATION (Thous. pers.)': 'Population',
        'POPULATION DENSITY (Person/km2)': 'population density',
        'At current prices (Bill. dongs)': 'GROSS REGIONAL DOMESTIC PRODUCT',
        'State budget revenue (Bill. dongs)': 'STATE BUDGET REVENUE',
        'State budget expenditure (Bill. dongs)': 'STATE BUDGET EXPENDITURE',
        'Investment at current prices (Bill. dongs)': 'INVESTMENT AT CURRENT PRICES',
        'Number of farms': 'NUMBER OF FARM',
        'Planted area of cereals (Thous. ha)': 'PLANTED AREA OF CEREALS',
        'Production of fishery (Ton)': 'PRODUCTION OF FISHERY',
        'Index of industrial production (%)': 'INDEX OF INDUSTRIAL PRODUCTION',
        'Retail sales of goods at current prices (Bill. dongs)': 'RETAIL SALES OF GOODS',
        'Number of schools (School)': 'NUMBER OF SCHOOLS',
        'Number of medical establishments (Esta.)': 'NUMBER OF MEDICAL ESTABLISHMENTS',
        'carbon_gross_emissions': 'CARBON GROSS EMISSIONS',
        'tc_loss_ha': 'TROPICAL FOREST LOSS',
        'FEELS_LIKE': 'FEELS LIKE',
        'TEMP_MIN': 'TEMP MIN',
        'TEMP_MAX': 'TEMP MAX',
    }

    # Boolean masks selecting the first occurrence of every index / column label.
    first_row_per_label = ~vndata.index.duplicated(keep='first')
    first_col_per_label = ~vndata.columns.duplicated(keep='first')

    cleaned = (
        vndata.loc[first_row_per_label]
        .loc[:, first_col_per_label]
        .drop(columns=['_id'])
        .rename(columns=short_names)
    )

    # Upper-casing happens last so the mixed-case short names end up uniform.
    cleaned.columns = cleaned.columns.str.upper()
    return cleaned

In [None]:
def fill_missing_values(vndata):
    """Fill gaps in selected numeric columns with the mean of the row's province.

    For each column in ``columns_to_fill`` a missing value is replaced by the
    mean of that column over rows sharing the same ``PROVINCE``. Afterwards
    any value still missing in ``RETAIL SALES OF GOODS`` is replaced by that
    column's overall mean. Values that cannot be imputed (the province has no
    observed value, or ``PROVINCE`` itself is missing) stay NaN in the other
    columns.

    Args:
        vndata: DataFrame containing ``PROVINCE`` and every column listed in
            ``columns_to_fill``. It is modified in place.

    Returns:
        The same DataFrame object that was passed in, after filling.
    """
    columns_to_fill = ['AREA OF LAND', 'POPULATION DENSITY', 'GROSS REGIONAL DOMESTIC PRODUCT',
                       'STATE BUDGET REVENUE', 'STATE BUDGET EXPENDITURE', 'NUMBER OF FARM',
                       'RETAIL SALES OF GOODS', 'NUMBER OF SCHOOLS']

    mean_values_by_province = vndata.groupby('PROVINCE')[columns_to_fill].mean()

    for col in columns_to_fill:
        # Look up each row's province mean in one vectorised step; provinces
        # absent from the lookup table map to NaN instead of raising.
        province_mean_per_row = vndata['PROVINCE'].map(mean_values_by_province[col])
        vndata[col] = vndata[col].fillna(province_mean_per_row)

    # Assign the result back explicitly: calling fillna(inplace=True) on the
    # selected column acts on a temporary object and does not reliably update
    # the frame (it is a no-op under pandas Copy-on-Write).
    retail_col = 'RETAIL SALES OF GOODS'
    vndata[retail_col] = vndata[retail_col].fillna(vndata[retail_col].mean())

    return vndata


In [None]:
def encode_categorical_column(vndata):
    """Append a label-encoded copy of the ``DESCRIPTION`` column.

    The values of ``DESCRIPTION`` are passed through
    ``LabelEncoder.fit_transform`` and the resulting codes are appended as an
    extra column.

    NOTE: the appended column is also labelled ``DESCRIPTION``, so the result
    carries two columns with that name (the original values first, the codes
    last). This naming is kept unchanged for existing callers.

    Args:
        vndata: DataFrame containing a ``DESCRIPTION`` column.

    Returns:
        A new DataFrame with the same rows and index as ``vndata`` plus the
        encoded column.
    """
    encoder = LabelEncoder()
    codes = encoder.fit_transform(vndata['DESCRIPTION'])

    # Reuse the caller's index. With a default RangeIndex, concat(axis=1)
    # aligns by label and would misplace or NaN-pad rows whenever vndata is
    # not indexed 0..n-1 (for example after rows were filtered out).
    encoded = pd.DataFrame(codes, index=vndata.index, columns=['DESCRIPTION'])

    return pd.concat([vndata, encoded], axis=1)

def handle_outliers_iqr(vn_feature):
    """Report z-score outliers per column, then cap each column at its IQR bounds.

    First, for every column, the number of values whose absolute z-score
    exceeds 3 is printed. Then a copy of the frame is built in which every
    column is limited to ``[Q1 - 1.5 * IQR, Q3 + 1.5 * IQR]``, where Q1 and Q3
    are that column's 25th and 75th percentiles.

    Args:
        vn_feature: DataFrame whose columns are all numeric.

    Returns:
        A new DataFrame with capped values; ``vn_feature`` is not modified.
    """
    def _clip_to_iqr_bounds(column):
        """Return ``column`` with values limited to 1.5 * IQR beyond the quartiles."""
        q1 = column.quantile(0.25)
        q3 = column.quantile(0.75)
        iqr = q3 - q1
        # clip() returns a new Series and upcasts when needed, which avoids
        # writing float bounds into an integer column through .loc assignment.
        return column.clip(lower=q1 - 1.5 * iqr, upper=q3 + 1.5 * iqr)

    # Diagnostic only: the z-score count is printed, not used for capping.
    for col in vn_feature.columns:
        z_scores = np.abs((vn_feature[col] - vn_feature[col].mean()) / vn_feature[col].std())
        total_outliers = int((z_scores > 3).sum())
        print(f"Total outliers in '{col}': {total_outliers}")

    vn_features_copy = vn_feature.copy()

    for col in vn_features_copy.columns:
        vn_features_copy[col] = _clip_to_iqr_bounds(vn_features_copy[col])

    return vn_features_copy
