# Data Handling

**Author:** Marco A. Garcia

**Description:** This Jupyter notebook is used for data analysis.

### Libraries and Initial Settings

In [1]:
import os

# Locate the project root: climb from the current working directory until a
# directory containing pyproject.toml is found, and make it the working directory.
start_dir = os.getcwd()

while "pyproject.toml" not in os.listdir(os.getcwd()):
    parent_dir = os.path.dirname(os.getcwd())
    if parent_dir == os.getcwd():
        # The filesystem root is its own parent ("/" on POSIX, a drive root on
        # Windows), so there is nowhere left to climb.  Report the failure and
        # return to the starting directory instead of leaving the kernel at the
        # filesystem root.
        print("COULD NOT FIND pyproject.toml.  Invalid project base file.")
        os.chdir(start_dir)
        break
    os.chdir(parent_dir)


print("Current Working Directory:  ", os.getcwd())

Current Working Directory:   /home/mgarcia/github/obese_classifier


In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler

### Code

In [None]:
# Location of the obesity dataset (CSV).
# NOTE(review): the path is relative, so it is resolved against the working
# directory selected by the first cell -- confirm it still points at the file.
filepath = "../datasets/obesity_prediction.csv"
# Load the raw data into a DataFrame.
df = pd.read_csv(filepath)

In [10]:
df.shape

(2111, 17)

In [5]:
from src.uafscs.utils.data_utils import Dataset
d = Dataset(filepath=filepath,dropList=["FAVC","FCVC","SMOKE","SCC","CALC"],label="Obesity")



In [11]:
df.columns

Index(['Gender', 'Age', 'Height', 'Weight', 'family_history', 'FAVC', 'FCVC',
       'NCP', 'CAEC', 'SMOKE', 'CH2O', 'SCC', 'FAF', 'TUE', 'CALC', 'MTRANS',
       'Obesity'],
      dtype='object')

In [6]:
d.transform_categorical()

In [7]:
d.scale_data()

In [8]:
d.data

Unnamed: 0,Age,Height,Weight,NCP,CH2O,FAF,TUE,x0_Female,x0_Male,x0_no,x0_yes,x0_Always,x0_Frequently,x0_Sometimes,x0_no.1,x0_Automobile,x0_Bike,x0_Motorbike,x0_Public_Transportation,x0_Walking
0,0.148936,0.320755,0.186567,0.666667,0.500000,0.000000,0.500000,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
1,0.148936,0.132075,0.126866,0.666667,1.000000,1.000000,0.000000,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
2,0.191489,0.660377,0.283582,0.666667,0.500000,0.666667,0.500000,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
3,0.276596,0.660377,0.358209,0.666667,0.500000,0.666667,0.000000,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
4,0.170213,0.622642,0.379104,0.000000,0.500000,0.000000,0.000000,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2106,0.148443,0.491943,0.689616,0.666667,0.364070,0.558756,0.453124,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
2107,0.169850,0.563366,0.707037,0.666667,0.502565,0.447130,0.299635,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
2108,0.181362,0.570200,0.706637,0.666667,0.527097,0.471403,0.323144,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
2109,0.220467,0.546132,0.704079,0.666667,0.926170,0.379702,0.293017,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0


In [None]:
# How many records? 2111
# 17 columns: 16 features plus the "Obesity" label

In [None]:
df.describe()

In [None]:
df.head()

In [None]:
drop_list = ["Gender","family_history","FAVC","FCVC","CAEC","SMOKE","SCC","CALC","MTRANS","Weight"]

In [None]:
enc = OneHotEncoder(handle_unknown="ignore",sparse_output=False).set_output(transform='pandas')

In [None]:
# One-hot encode each categorical column.  A one-column DataFrame is passed
# (instead of a bare NumPy array) so the encoder names its outputs after the
# source column, e.g. "Gender_Female", rather than the generic "x0_<category>".
# With the generic names, the "no" category of CAEC and of family_history both
# became "x0_no", producing duplicate column labels once the frames are
# concatenated.
# NOTE: `enc` is refit on every call, so after this cell it only holds the
# categories of the last column fitted (family_history).
df_gender  = enc.fit_transform(df[["Gender"]])
df_mtrans  = enc.fit_transform(df[["MTRANS"]])
df_caec    = enc.fit_transform(df[["CAEC"]])
df_fh      = enc.fit_transform(df[["family_history"]])



In [None]:
df_gender.head()

In [None]:
df_mtrans.shape

In [None]:
# Place the one-hot encoded frames next to the raw data (column-wise), then
# remove every column listed in drop_list.
df_data = (
    pd.concat([df, df_mtrans, df_gender, df_caec, df_fh], axis=1, ignore_index=False)
    .drop(labels=drop_list, axis=1)
)

In [None]:
# Remove the target column from the feature frame.  errors="ignore" keeps this
# cell re-runnable: df_data is reassigned from itself, so a second execution
# would otherwise raise KeyError because "Obesity" has already been dropped.
df_data = df_data.drop(columns=["Obesity"], errors="ignore")

In [None]:
df_data

In [None]:
# Min-max scale every column of df_data independently and keep the fitted
# scaler of each column in `scalers`, keyed by column name.
cols = df_data.shape[1]
scalers = {}

# Scaled values are collected by POSITION.  Assigning into a DataFrame by label
# writes to every column carrying that label, so with duplicate column names
# (e.g. two "x0_no" columns) a later column would silently overwrite an earlier
# one.
scaled_values = np.empty(df_data.shape, dtype=float)

for i in range(cols):
    scaler = MinMaxScaler()

    feature = np.asarray(df_data.iloc[:, i]).reshape(-1, 1)
    scaled_values[:, i] = scaler.fit_transform(feature).reshape(-1)
    # NOTE: if column names repeat, the dict keeps the scaler of the last
    # column with that name.
    scalers[df_data.columns[i]] = scaler

df_scaled_data = pd.DataFrame(scaled_values, columns=df_data.columns)


In [None]:
df_scaled_data.head()

In [None]:
from sklearn.preprocessing import LabelEncoder

In [None]:
obesity_series = df["Obesity"]

In [None]:
le = LabelEncoder()

In [None]:
y = le.fit_transform(obesity_series)

In [None]:
le.classes_

In [None]:
X = np.asarray(df_scaled_data)

In [None]:
X

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 33% of the samples for testing; random_state pins the shuffle so the
# split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

In [None]:
def count_class(labels=None):
    """Count how many times each class label occurs.

    Args:
        labels: Iterable of hashable class labels.  When omitted, the
            notebook-level ``y_train`` is counted, which is what the original
            zero-argument version always did.

    Returns:
        dict mapping each distinct label to its number of occurrences, in
        first-seen order.  An empty input yields an empty dict.
    """
    if labels is None:
        labels = y_train

    classes = {}
    for label in labels:
        classes[label] = classes.get(label, 0) + 1
    return classes


In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
clf = DecisionTreeClassifier(criterion='entropy',random_state=0)

In [None]:
clf.fit(X_train,y_train)

In [None]:
results = clf.score(X_test,y_test)

In [None]:
results

In [None]:
y_test[233]

In [None]:
clf.predict(X_test[233].reshape(1,-1))

In [None]:
df["CAEC"].unique()

In [None]:
df.describe()

In [None]:
# Count missing values per column.  Displaying the full boolean mask only shows
# a truncated grid of True/False cells, which cannot reveal whether any value
# is actually missing.
df.isnull().sum()

In [None]:


newdf = df.select_dtypes(include='number')


In [None]:
type(newdf)

In [None]:
newdf

In [None]:
# NOTE(review): this prints the literal text "False" when "FAF" IS one of the
# numeric columns, so the message reads as the opposite of the condition --
# confirm the intent.
if "FAF" in newdf.columns:
    print("False")

In [None]:
df.columns

In [None]:
type(df.info())

In [None]:
type(df.dtypes.iloc[4])

In [None]:
# Print a marker for every float64 column.  The loop variable is named `dtype`
# rather than `type` so the builtin type() is not shadowed; with the old name,
# any cell calling type(...) failed once this cell had been executed.
for dtype in df.dtypes:
    if dtype == np.float64:
        print("hi")

In [None]:
# Columns removed from the feature frame: the raw categorical columns (their
# one-hot encoded versions are used instead), the unused columns, and "Weight".
drop_list = ["Gender","family_history","FAVC","FCVC","CAEC","SMOKE","SCC","CALC","MTRANS","Weight"]
# One-hot encode each categorical column.  A one-column DataFrame is passed
# (instead of a bare NumPy array) so the encoder names its outputs after the
# source column, e.g. "Gender_Female", rather than the generic "x0_<category>";
# the generic names made the "no" category of CAEC and of family_history share
# the label "x0_no".
# NOTE: `enc` is refit on every call, so after this cell it only holds the
# categories of the last column fitted (family_history).
df_gender  = enc.fit_transform(df[["Gender"]])
df_mtrans  = enc.fit_transform(df[["MTRANS"]])
df_caec    = enc.fit_transform(df[["CAEC"]])
df_fh      = enc.fit_transform(df[["family_history"]])

In [None]:
d = Dataset(filepath=filepath,dropList=["FAVC","FCVC","SMOKE","SCC","CALC"])

In [None]:
d.transform_categorical()


In [None]:
d.data