# Classification

## Load and set up data

In [39]:
import os
import sqlalchemy
import pandas as pd
from pathlib import Path
import numpy as np 
import matplotlib as mpl
import seaborn as sns
import matplotlib.ticker as mtick
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
#print(os.getcwd())

In [3]:
# Load the 2011 GDC diagnoses joined to their cases, adding two derived columns:
#   age_range        - age bucket label ('Under 18', '18 - 24', ..., '65 or older')
#   obesity_mutation - TRUE when any of the twelve listed gene flags equals 1, else FALSE
dbEngine = sqlalchemy.create_engine('sqlite:///../../data/gdc.sqlite')

# Notes on the query:
# * Triple-quoted so the dotted column names can be double-quoted without escapes.
# * The age buckets use ordered half-open bounds (CASE stops at the first matching
#   WHEN), so a fractional age such as 24.5 cannot fall between two integer BETWEEN
#   ranges and come back with a NULL age_range. A NULL age still yields NULL.
# * Every gene flag is compared against the integer 1 (the columns load as int64),
#   instead of a mix of '1' and 1.
# * `year_of_diagnosis = '2011'` already excludes NULLs, so no separate NULL check.
sqlstatement_obese = """
select d.*, c.*,
  case
    when age_at_diagnosis_years < 18 then 'Under 18'
    when age_at_diagnosis_years < 25 then '18 - 24'
    when age_at_diagnosis_years < 35 then '25 - 34'
    when age_at_diagnosis_years < 45 then '35 - 44'
    when age_at_diagnosis_years < 55 then '45 - 54'
    when age_at_diagnosis_years < 65 then '55 - 64'
    when age_at_diagnosis_years >= 65 then '65 or older'
  end as age_range,
  case
    when c."gene.SIM1" = 1
      or c."gene.POMC" = 1
      or c."gene.LEPR" = 1
      or c."gene.MRAP2" = 1
      or c."gene.ADCY3" = 1
      or c."gene.NTRK2" = 1
      or c."gene.MC4R" = 1
      or c."gene.KSR2" = 1
      or c."gene.LEP" = 1
      or c."gene.PCSK1" = 1
      or c."gene.BDNF" = 1
      or c."gene.SH2B1" = 1
    then TRUE
    else FALSE
  end as obesity_mutation
from diagnoses d
join cases c on d.case_id = c.id
where year_of_diagnosis = '2011';
"""

# Hold the connection only for the duration of the query.
with dbEngine.connect() as conn:
    gdc = pd.read_sql_query(sqlstatement_obese, conn)

gdc['year_of_diagnosis'] = gdc['year_of_diagnosis'].astype(int)
print(gdc.dtypes)
print(gdc)

case_id                    object
year_of_diagnosis           int32
age_at_diagnosis_days     float64
age_at_diagnosis_years    float64
ajcc_clinical_m            object
ajcc_clinical_t            object
ajcc_clinical_n            object
ajcc_clinical_stage        object
ajcc_pathologic_t          object
ajcc_pathologic_n          object
ajcc_pathologic_m          object
ajcc_pathologic_stage      object
id                         object
index_date                 object
primary_site               object
disease_type               object
gene.SIM1                   int64
gene.POMC                   int64
gene.LEPR                   int64
gene.MRAP2                  int64
gene.ADCY3                  int64
gene.NTRK2                  int64
gene.MC4R                   int64
gene.KSR2                   int64
gene.LEP                    int64
gene.PCSK1                  int64
gene.BDNF                   int64
gene.SH2B1                  int64
age_range                  object
obesity_mutati

In [5]:
# Read the 2011 national obesity figures, one row per age stratum, from the BRFSS database.
dbEngine = sqlalchemy.create_engine('sqlite:///../../data/brfss.sqlite')
sqlstatement_obese = (
    "select YearStart, Data_Value, \"Age(years)\"  from brfss"
    " where LocationDesc = 'National' and length(\"Age(years)\") > 0"
    " and Question like '%aged 18 years and older who have obesity'"
    " and YearStart = 2011 order by StratificationID1;"
)
with dbEngine.connect() as conn:
    brfss_obesity = pd.read_sql_query(sqlstatement_obese, conn)

# Make sure the prevalence column is numeric (float).
brfss_obesity = brfss_obesity.astype({'Data_Value': 'float'})

print(brfss_obesity)

   YearStart  Data_Value   Age(years)
0       2011        15.2      18 - 24
1       2011        25.9      25 - 34
2       2011        29.9      35 - 44
3       2011        32.6      45 - 54
4       2011        32.6      55 - 64
5       2011        25.3  65 or older


In [6]:
# Work on a copy: a plain assignment would only alias `gdc`, so adding derived
# columns to `brfss_gdc` would silently modify the raw query result as well.
brfss_gdc = gdc.copy()

def obesity_prob(age_group, prevalence=None):
    """Look up the obesity prevalence value for one age-range label.

    Parameters
    ----------
    age_group : str
        Age-range label to look up, e.g. '18 - 24'.
    prevalence : pandas.DataFrame, optional
        Table with 'Age(years)' and 'Data_Value' columns. Defaults to the
        notebook-level ``brfss_obesity`` frame.

    Returns
    -------
    float
        'Data_Value' of the first row whose 'Age(years)' equals ``age_group``,
        or NaN when no row matches (e.g. 'Under 18' or a missing age range).
        Callers that feed the result to a model must handle the NaN themselves.
    """
    if prevalence is None:
        prevalence = brfss_obesity
    matches = prevalence.loc[prevalence['Age(years)'] == age_group, 'Data_Value']
    # An empty selection would make iloc[0] raise an opaque IndexError.
    if matches.empty:
        return float('nan')
    return matches.iloc[0]

# Attach the prevalence value that corresponds to each case's age bucket.
age_ranges = brfss_gdc['age_range']
brfss_gdc['obesity_prob'] = age_ranges.map(obesity_prob)
print(brfss_gdc)

                                                case_id  year_of_diagnosis  \
0     RUNhc2U6MDA3ODFhOTYtNDA2OC00MjdjLWE5YzUtNTg0ZD...               2011   
1     RUNhc2U6NDk5NjdhNTctN2MxMi00NmJiLThkNzEtY2ViYm...               2011   
2     RUNhc2U6MDBkZGE4NjAtNTNjMy00YWIyLTgyOTQtNzNlMj...               2011   
3     RUNhc2U6NDdmMThlZWUtM2U1Ni00Mzg4LWFhYzMtOWFjMT...               2011   
4     RUNhc2U6MjNlZjIxYTAtMzY5NS00Y2EyLTk5ZTItMjc0OD...               2011   
...                                                 ...                ...   
1646  RUNhc2U6MWNhM2M1ZTAtMzJiMC00NDY3LThlYzAtY2EyMT...               2011   
1647  RUNhc2U6N2I4MmUyZDItNDdmOS00MWExLTllY2EtNDA5MD...               2011   
1648  RUNhc2U6NGIzNzA2MzktMGMwOS00ZjYwLWJiOTYtMTFjMG...               2011   
1649  RUNhc2U6NzJlNjQwZjItODY5Zi00YWVmLThlYmUtYjEyN2...               2011   
1650  RUNhc2U6ZTk4N2M0N2QtNmVjYi00MWJiLWE2NmYtYTA0OD...               2011   

      age_at_diagnosis_days  age_at_diagnosis_years ajcc_clinic

In [30]:
# Keep only the columns used as model features.
xdf = brfss_gdc.reindex(
    columns=['age_at_diagnosis_years', 'primary_site', 'disease_type', 'obesity_prob']
)

#https://stackoverflow.com/questions/37292872/how-can-i-one-hot-encode-in-python
def onehotEncoding(orig, feature):
    """Return a new frame: `orig` plus one-hot indicator columns for `feature`.

    Parameters
    ----------
    orig : pandas.DataFrame
        Frame containing the column to encode. It is not modified.
    feature : str
        Name of the categorical column in `orig`.

    Returns
    -------
    pandas.DataFrame
        All columns of `orig` (including `feature` itself) followed by one
        '<feature>_<value>' indicator column per distinct value.
    """
    # Encode the frame that was passed in, not whatever global happens to be
    # called `xdf`, so the function works for any caller.
    one_hot = pd.get_dummies(orig[[feature]])
    return pd.concat([orig, one_hot], axis=1)

# One-hot encode each categorical feature in turn, then discard the raw text columns.
for categorical in ('primary_site', 'disease_type'):
    xdf = onehotEncoding(xdf, categorical)
xdf = xdf.drop(columns=['primary_site', 'disease_type'])
print(xdf)




      age_at_diagnosis_years  obesity_prob  primary_site_Adrenal gland  \
0                       85.0          25.3                       False   
1                       45.0          32.6                       False   
2                       58.0          32.6                       False   
3                       79.0          25.3                       False   
4                       75.0          25.3                       False   
...                      ...           ...                         ...   
1646                    75.0          25.3                       False   
1647                    76.0          25.3                       False   
1648                    19.0          15.2                       False   
1649                    71.0          25.3                       False   
1650                    59.0          32.6                       False   

      primary_site_Base of tongue  primary_site_Bladder  \
0                           False                 Fa

In [35]:
# Partition the data into train / validation / test sets (70% / 20% / 10%).
# References:
#   https://realpython.com/train-test-split-python-data/
#   https://blog.roboflow.com/train-test-split/
#   https://stackoverflow.com/questions/38250710/how-to-split-data-into-3-sets-train-validation-and-test#38251213
y = brfss_gdc['obesity_mutation']

# Stage 1: keep 70% of the rows for training and hold out the remaining 30%.
x_train, x_temp, y_train, y_temp = train_test_split(
    xdf,
    y,
    train_size=.7,
    test_size=.3,
    random_state=42,
)
# Stage 2: divide the held-out rows two-to-one into validation and test.
x_validation, x_test, y_validation, y_test = train_test_split(
    x_temp,
    y_temp,
    train_size=2/3,
    test_size=1/3,
    random_state=42,
)



In [41]:
# Logistic regression on the training split.
# The iteration cap is raised far above the library default, presumably so the
# solver can converge on these unscaled features (TODO confirm).
logreg = LogisticRegression(max_iter=50_000)
logreg.fit(x_train, y_train)

In [42]:
# Score the fitted model on the validation split and print per-class metrics.
prediction = logreg.predict(x_validation)
print(classification_report(y_true=y_validation, y_pred=prediction))

              precision    recall  f1-score   support

           0       0.89      0.52      0.66        81
           1       0.86      0.98      0.92       249

    accuracy                           0.87       330
   macro avg       0.88      0.75      0.79       330
weighted avg       0.87      0.87      0.85       330

