# Data into database

**Firstly, import the data**

In [1]:
import pandas as pd
import os
this_dir = os.getcwd()
this_dir
# train = pd.read_csv("./app/rootsRadar/train.csv", encoding='utf-8')
train = pd.read_csv("./train.csv", encoding='utf-8')

**Next, we drop fields and cull rows with null values to optimize the model by using only data that is helpfull**

In [2]:
# Drop fields without a probable relation to the target.
train.drop("Patient Id", axis=1, inplace=True)
train.drop("Family Name", axis=1, inplace=True)
train.drop("Patient First Name", axis=1, inplace=True)
train.drop("Father's name", axis=1, inplace=True)
train.drop("Institute Name", axis=1, inplace=True)
train.drop("Location of Institute", axis=1, inplace=True)
train.drop("Place of birth", axis=1, inplace=True)

# get rid of dead and after death data
train.drop(train[train["Status"] != "Alive"].index, inplace=True)
train.drop("Autopsy shows birth defect (if applicable)", axis=1, inplace=True)

# increases decision tree to 71%
train.drop("Parental consent", axis=1, inplace=True)
train.drop("Status", axis=1, inplace=True)
train.drop("Follow-up", axis=1, inplace=True)

# remove null age's
# train.drop(train[train["Patient Age"].isna()].index, inplace=True)
# excluding Leber's amd multifactoral is 46

# 64 on svm
train.drop(train[train["Disorder Subclass"] == "Leber's hereditary optic neuropathy"].index, inplace=True)
train.drop(train[train["Disorder Subclass"] == "Leigh syndrome"].index, inplace=True)
train.drop(train[train["Disorder Subclass"] == "Mitochondrial myopathy"].index, inplace=True)

# These two together: 62 percent

# not good because just diabetes and that is testable
# 92 on svm 
train.drop(train[train["Disorder Subclass"] == "Alzheimer's"].index, inplace=True)
train.drop(train[train["Disorder Subclass"] == "Cancer"].index, inplace=True)
train.drop(train[train["Disorder Subclass"] == "Diabetes"].index, inplace=True)

# 76 percent on svm 77 after moding afterwards
# train.drop(train[train["Disorder Subclass"] == "Cystic fibrosis"].index, inplace=True)
# train.drop(train[train["Disorder Subclass"] == "Hemochromatosis"].index, inplace=True)
# train.drop(train[train["Disorder Subclass"] == "Tay-Sachs"].index, inplace=True)

train = train[train['Disorder Subclass'].isna() == False]
train = train[train['Genetic Disorder'].isna() == False]

train.drop("Genetic Disorder", axis=1, inplace=True)

# Test removing more variables
train.drop("Test 1", axis=1, inplace=True)
train.drop("Test 2", axis=1, inplace=True)
train.drop("Test 3", axis=1, inplace=True)
train.drop("Test 4", axis=1, inplace=True)
train.drop("Test 5", axis=1, inplace=True)

train.isna().sum()

Patient Age                                         186
Genes in mother's side                                0
Inherited from father                                51
Maternal gene                                       407
Paternal gene                                         0
Blood cell count (mcL)                                0
Mother's age                                        863
Father's age                                        895
Respiratory Rate (breaths/min)                      291
Heart Rate (rates/min                               309
Gender                                              308
Birth asphyxia                                      321
Folic acid details (peri-conceptional)              302
H/O serious maternal illness                        331
H/O radiation exposure (x-ray)                      317
H/O substance abuse                                 310
Assisted conception IVF/ART                         306
History of anomalies in previous pregnancies    

**Now, fill the remaining null values with estimates**

In [3]:
# TODO:
# Better syntetic method, and reduce null efficiently 
# Remove null parent ages?
# First check how much parent age affects the result 

In [4]:
# Here, the solution was to fill with the mode
train["Patient Age"].fillna(str(train["Patient Age"].mode().values[0]),inplace=True)
train["Inherited from father"].fillna(str(train["Inherited from father"].mode().values[0]),inplace=True)
train["Maternal gene"].fillna(str(train["Maternal gene"].mode().values[0]),inplace=True)
train["Mother's age"].fillna(str(train["Mother's age"].mode().values[0]),inplace=True)
train["Father's age"].fillna(str(train["Father's age"].mode().values[0]),inplace=True)
train["Respiratory Rate (breaths/min)"].fillna(str(train["Respiratory Rate (breaths/min)"].mode().values[0]),inplace=True)
train["Heart Rate (rates/min"].fillna(str(train["Heart Rate (rates/min"].mode().values[0]),inplace=True)
# train["Test 1"].fillna(str(train["Test 1"].mode().values[0]),inplace=True)
# train["Test 2"].fillna(str(train["Test 2"].mode().values[0]),inplace=True)
# train["Test 3"].fillna(str(train["Test 3"].mode().values[0]),inplace=True)
# train["Test 4"].fillna(str(train["Test 4"].mode().values[0]),inplace=True)
# train["Test 5"].fillna(str(train["Test 5"].mode().values[0]),inplace=True)
# train["Parental consent"].fillna(str(train["Parental consent"].mode().values[0]),inplace=True)
# train["Follow-up"].fillna(str(train["Follow-up"].mode().values[0]),inplace=True)
train["Gender"].fillna(str(train["Gender"].mode().values[0]),inplace=True)
train["Birth asphyxia"].fillna(str(train["Birth asphyxia"].mode().values[0]),inplace=True)
# train["Autopsy shows birth defect (if applicable)"].fillna(str(train["Autopsy shows birth defect (if applicable)"].mode().values[0]),inplace=True)
train["Folic acid details (peri-conceptional)"].fillna(str(train["Folic acid details (peri-conceptional)"].mode().values[0]),inplace=True)
train["H/O serious maternal illness"].fillna(str(train["H/O serious maternal illness"].mode().values[0]),inplace=True)
train["H/O radiation exposure (x-ray)"].fillna(str(train["H/O radiation exposure (x-ray)"].mode().values[0]),inplace=True)
train["H/O substance abuse"].fillna(str(train["H/O substance abuse"].mode().values[0]),inplace=True)
train["Assisted conception IVF/ART"].fillna(str(train["Assisted conception IVF/ART"].mode().values[0]),inplace=True)
train["History of anomalies in previous pregnancies"].fillna(str(train["History of anomalies in previous pregnancies"].mode().values[0]),inplace=True)
train["No. of previous abortion"].fillna(str(train["No. of previous abortion"].mode().values[0]),inplace=True)
train["Birth defects"].fillna(str(train["Birth defects"].mode().values[0]),inplace=True)
train["White Blood cell count (thousand per microliter)"].fillna(str(train["White Blood cell count (thousand per microliter)"].mode().values[0]),inplace=True)
train["Blood test result"].fillna(str(train["Blood test result"].mode().values[0]),inplace=True)
train["Symptom 1"].fillna(str(train["Symptom 1"].mode().values[0]),inplace=True)
train["Symptom 2"].fillna(str(train["Symptom 2"].mode().values[0]),inplace=True)
train["Symptom 3"].fillna(str(train["Symptom 3"].mode().values[0]),inplace=True)
train["Symptom 4"].fillna(str(train["Symptom 4"].mode().values[0]),inplace=True)
train["Symptom 5"].fillna(str(train["Symptom 5"].mode().values[0]),inplace=True)

In [5]:
train.head()

Unnamed: 0,Patient Age,Genes in mother's side,Inherited from father,Maternal gene,Paternal gene,Blood cell count (mcL),Mother's age,Father's age,Respiratory Rate (breaths/min),Heart Rate (rates/min,...,No. of previous abortion,Birth defects,White Blood cell count (thousand per microliter),Blood test result,Symptom 1,Symptom 2,Symptom 3,Symptom 4,Symptom 5,Disorder Subclass
6,3.0,Yes,No,Yes,Yes,4.90108,30.0,63.0,Normal (30-60),Tachycardia,...,3.0,Multiple,6.825974,normal,0.0,0.0,0.0,0.0,0.0,Tay-Sachs
7,3.0,No,No,Yes,Yes,4.964816,40.0,49.0,Tachypnea,Normal,...,1.0,Singular,9.836352,inconclusive,0.0,0.0,1.0,0.0,0.0,Tay-Sachs
16,0.0,Yes,Yes,No,No,4.79852,30.0,57.0,Normal (30-60),Tachycardia,...,4.0,Multiple,12.0,normal,1.0,1.0,1.0,0.0,1.0,Cystic fibrosis
20,2.0,No,No,Yes,No,4.808872,30.0,30.0,Tachypnea,Tachycardia,...,4.0,Singular,9.566103,slightly abnormal,1.0,0.0,0.0,1.0,0.0,Hemochromatosis
26,12.0,No,No,Yes,Yes,4.710696,30.0,32.0,Normal (30-60),Tachycardia,...,0.0,Multiple,11.373537,normal,1.0,1.0,0.0,0.0,0.0,Tay-Sachs


**Before fitting the model, we need to encode the remaining data**

In [6]:
train["Genes in mother's side"]=[1 if i.strip()== "Yes" else 0 for i in train["Genes in mother's side"]]
train["Inherited from father"]=[1 if i.strip()== "Yes" else 0 for i in train["Inherited from father"]]
train["Maternal gene"]=[1 if i.strip()== "Yes" else 0 for i in train["Maternal gene"]]
train["Paternal gene"]=[1 if i.strip()== "Yes" else 0 for i in train["Paternal gene"]]
train["Birth asphyxia"]=[1 if i.strip()== "Yes" else 0 for i in train["Birth asphyxia"]]
train["Folic acid details (peri-conceptional)"]=[1 if i.strip()== "Yes" else 0 for i in train["Folic acid details (peri-conceptional)"]]
train["H/O radiation exposure (x-ray)"]=[1 if i.strip()== "Yes" else 0 for i in train["H/O radiation exposure (x-ray)"]]
train["H/O substance abuse"]=[1 if i.strip()== "Yes" else 0 for i in train["H/O substance abuse"]]
train["Assisted conception IVF/ART"]=[1 if i.strip()== "Yes" else 0 for i in train["Assisted conception IVF/ART"]]
train["History of anomalies in previous pregnancies"]=[1 if i.strip()== "Yes" else 0 for i in train["History of anomalies in previous pregnancies"]]
train["H/O serious maternal illness"]=[1 if i.strip()=="Yes" else 0 for i in train["H/O serious maternal illness"]]

#Normal (30-60):1' 'Tachypnea:0
train["Respiratory Rate (breaths/min)"]=[1 if i.strip()== "Normal (30-60)" else 0 for i in train["Respiratory Rate (breaths/min)"]]
#Normal:1' 'Tachycardia:0
train["Heart Rate (rates/min"]=[1 if i.strip()== "Normal" else 0 for i in train["Heart Rate (rates/min"]]
#['Singular' 'Multiple']
train["Birth defects"]=[1 if i.strip()== "Singular" else 0 for i in train["Birth defects"]]

#1: male 0: female 2: ambiguous    
train["Gender"]=[1 if i.strip()== "Male" else 0 if i.strip() == "Female" else 2 for i in train["Gender"]]

train["Blood test result"]=[1 if i.strip()== "slightly abnormal" else 0 if i.strip() == "normal" else 2 if i.strip()=="inconclusive" else 3 for i in train["Blood test result"]]

#Leber's hereditary optic neuropathy:1 
#Cystic fibrosis:0
#Diabetes:2
#Leigh syndrome:3
#Cancer:4
#Tay-Sachs:5
#Hemochromatosis:6
#Mitochondrial myopathy:7
#Alzheimer's:8
train["Disorder Subclass"]=[1 if i.strip()== "Leber's hereditary optic neuropathy" 
                              else 0 if i.strip() == "Cystic fibrosis" 
                               else 2 if i.strip()=="Diabetes" 
                               else 3 if i.strip()=="Leigh syndrome"
                               else 4 if i.strip()=="Cancer"
                               else 5 if i.strip()=="Tay-Sachs"
                               else 6 if i.strip()=="Hemochromatosis"
                               else 7 if i.strip()=="Mitochondrial myopathy"
                               else 8 for i in train["Disorder Subclass"]]

train = train.apply(pd.to_numeric,downcast="float")

## Now we have cleaned data: Need to import it into database

In [7]:
# Weirdness that now somehow app module is magically found??

import os
import sys

this_dir = os.getcwd()
this_dir

os.chdir('/mnt/c/Users/archi/Desktop/larks/backend/')

this_dir = os.getcwd()
this_dir

# sys.path.append('./')
# sys.path.append('./app')

FileNotFoundError: [Errno 2] No such file or directory: 'C:/Users/archi/Desktop/larks/backend'

In [None]:
from app import models, db, create_app
app = create_app()

In [None]:
for index, row in train.iterrows():
    new_record = models.Patient(
         PatientAge = row['Patient Age'],
         GenesInMothersSide = row["Genes in mother's side"],
         InheritedFromFather = row['Inherited from father'],
         MaternalGene = row['Maternal gene'],
         PaternalGene = row['Paternal gene'],
         BloodCellCount_mcL = row['Blood cell count (mcL)'],
         MothersAge = row["Mother's age"],
         FathersAge = row["Father's age"],
         RespiratoryRate_breathsPerMin = row['Respiratory Rate (breaths/min)'],
         HeartRate_ratesPermin = row['Heart Rate (rates/min'],
         Gender = row['Gender'],
         BirthAsphyxia = row['Birth asphyxia'],
         FolicAcidDetails_periConceptiona = row['Folic acid details (peri-conceptional)'],
         HistoryOfSeriousMaternalIllness = row['H/O serious maternal illness'],
         HistoryOfRadiationExposure_xRay = row['H/O radiation exposure (x-ray)'],
         HistoryOfSubstanceAbuse = row['H/O substance abuse'],
         AssistedConception_IVF_ART = row['Assisted conception IVF/ART'],
         HistoryOfAnomaliesInPreviousPregnancies = row['History of anomalies in previous pregnancies'],
         NumberOfPreviousAbortions = row['No. of previous abortion'],
         BirthDefects = row['Birth defects'],
         WhiteBloodCellCount_thousand_per_microliter = row['White Blood cell count (thousand per microliter)'],
         BloodTestResult = row['Blood test result'],
         Symptom1 = row['Symptom 1'],
         Symptom2 = row['Symptom 2'],
         Symptom3 = row['Symptom 3'],
         Symptom4 = row['Symptom 4'],
         Symptom5 = row['Symptom 5'],
         DisorderSubclass = row['Disorder Subclass'],
         DisorderSubclassPredicted = None
    )
    print(new_record.PatientAge)
    db.session.add(new_record)

db.session.commit()

In [None]:
train.describe()

In [None]:
# 3490 rows found in database - DONE