### Importing packages

In [1]:
import os
import pymongo
import yaml
import rarfile
from tqdm import tqdm
import pandas as pd
import numpy as np
import requests
import ruamel.yaml
from thyroid.constant.env_variable import *
import certifi
ca = certifi.where()


client = pymongo.MongoClient(
    MONGO_DB_URL,
    tlsCAFile = ca
)

collection = client["ineuron"]["thyroid"]

import warnings
warnings.filterwarnings("ignore")

### Load raw files

In [2]:
dataset_link = "C:\\Users\\pm353\\Downloads\\thyroid+disease"

with open(os.path.join(dataset_link, "hypothyroid.data")) as f:
    data = f.readlines()

with open(os.path.join(dataset_link, "hypothyroid.names")) as f:
    colnames = f.readlines()

dataset = pd.DataFrame(
    [[i.strip() for i in j.split(",")] for j in data],
    columns=[
        i.strip().split(":")[0] for i in colnames if i != "\n"
    ]
).rename(
    columns={
        "hypothyroid, negative.": "Target"
    }
)

### Upload data to MongoDB

In [3]:
for i in tqdm(range(dataset.shape[0])):
    try:
        collection.insert_one(dict(dataset.iloc[i, :]))
    except:
        pass

100%|██████████████████████████████████████████████████████████████████████████████| 3163/3163 [01:48<00:00, 29.07it/s]


### Load data from MongoDB

In [10]:
df_thyroid = pd.DataFrame(
    list(collection.find())
).drop(
    columns=["_id"],
    axis=1
).replace(
    {
        "na": np.nan,
        "?": np.nan
    }
)

str_cols = [
    'tumor', 'query_on_thyroxine', 'thyroid_surgery', 'sick',
    'pregnant', 'FTI_measured', 'on_antithyroid_medication',
    'T3_measured', 'on_thyroxine', 'Target', 'query_hyperthyroid',
    'T4U_measured', 'TT4_measured', 'query_hypothyroid', 'goitre',
    'sex', 'TBG_measured', 'lithium', 'TSH_measured'
]

numerical_cols = ["T4U", "TSH", "age", "T3", "TT4", "FTI", "TBG"]

for i in str_cols:
    df_thyroid[i] = df_thyroid[i].astype("str")

for i in numerical_cols:
    df_thyroid[i] = df_thyroid[i].astype("float64")


def create_schema_from_dataframe(df):
    # Create a dictionary for the schema
    schema_dict = {'columns': []}

    # Iterate over columns in the DataFrame
    for column in df.columns:
        column_info = {column: df[column].dtype.name}
        schema_dict['columns'].append(column_info)

    # Create a YAML instance
    yaml = ruamel.yaml.YAML()

    # Save the dictionary to a YAML file
    with open('config/schema.yaml', 'w') as file:
        yaml.dump(schema_dict, file)


create_schema_from_dataframe(df=df_thyroid)