In [7]:
library(caret)
library(ggplot2)
library(lattice) 

# Cleaning to only unique to 03 values 

In [15]:
# File paths for the three datasets
file_path_01 <- '../cleaned_data/new-cleaned_data-01.csv'
file_path_02 <- '../cleaned_data/cleaning_data-02.csv'
file_path_03 <- '../cleaned_data/cleaned_data-03.csv'

# Read the three datasets into data frames
df_01 <- read.csv(file_path_01)
df_02 <- read.csv(file_path_02)
df_03 <- read.csv(file_path_03)

# Get the column names of each dataset
col_names_01 <- colnames(df_01)
col_names_02 <- colnames(df_02)
col_names_03 <- colnames(df_03)

# Find the columns unique to dataset 03
unique_cols_03 <- setdiff(col_names_03, union(col_names_01, col_names_02))

# Display the unique column names
print(unique_cols_03)


 [1] "D1A_1V2F"  "D1A_1V5F"  "D1A_1V6F"  "D1A_1V10F" "D1A_1V2L"  "D1A_1V5L" 
 [7] "D1A_1V6L"  "D1A_1V10L" "D1C_1V2F"  "D1C_1V10F" "D1C_1V2L"  "Q2_1"     
[13] "Q2_2"      "Q2_3"      "Q2_4V1"    "Q2_4V13"   "Q2_5"      "Q2_6"     
[19] "Q2_7"      "Q2_8"      "Q2_9"      "Q2_10"     "Q2_11"     "Q2_12"    
[25] "Q2_13"     "Q2_14"     "Q2_15"     "Q2_16"     "Q2_17"     "Q2_18"    
[31] "Q2_19A"    "Q2_19B"    "Q2_19C"    "Q2_20A"    "A15V2"     "A15V3"    


In [16]:
# store column names 
column_info <- list(
    D1A_1V2F = "Internal: Primary purpose of trip: fear wild animals - First, Head",
    D1A_1V5F = "Internal: Primary purpose of trip: cyclone - First, Head",
    D1A_1V6F = "Internal: Primary purpose of trip: flooding - First, Head",
    D1A_1V10F = "Internal: Primary purpose of trip: refugee - First, Head",
    D1A_1V2L = "Internal: Primary purpose of trip: fear wild animals - Last, Head",
    D1A_1V5L = "Internal: Primary purpose of trip: cyclone - Last, Head",
    D1A_1V6L = "Internal: Primary purpose of trip: flooding - Last, Head",
    D1A_1V10L = "Internal: Primary purpose of trip: refugee - Last, Head",
    D1C_1V2F = "Internal: Primary purpose of trip: fear wild animals - First, Other",
    D1C_1V10F = "Internal: Primary purpose of trip: refugee - First, Other",
    D1C_1V2L = "Internal: Primary purpose of trip: fear wild animals - Last, Other",
    Q2_1 = "Health: What do you mean by fairly healthy? - Head",
    Q2_2 = "Health: Weight - Head",
    Q2_3 = "Health: Sought medical care in the last year? - Head",
    Q2_4V1 = "Health: If yes, where?: MBBS (private) - Head",
    Q2_4V13 = "Health: If yes, where?: Pharmacist/drug store - Head",
    Q2_5 = "Health: Health problem that you cannot access care for? - Head",
    Q2_6 = "Health: Why unable to get: not too serious - Head",
    Q2_7 = "Health: Quality of health when you got married? - Head",
    Q2_8 = "Health: Quality of health last year? - Head",
    Q2_9 = "Health: Current quality of health? - Head",
    Q2_10 = "Health: What do you mean by fairly healthy? - Head",
    Q2_11 = "Health: Hypertension or high blood pressure - Head",
    Q2_12 = "Health: Diabetes or high sugar levels - Head",
    Q2_13 = "Health: Heart attack or cardiac problems - Head",
    Q2_14 = "Health: Stroke - Head",
    Q2_15 = "Health: Chronic lung conditions/Respiratory problems - Head",
    Q2_16 = "Health: Gastro-intestinal problems - Head",
    Q2_17 = "Health: Psychiatric problems - Head",
    Q2_18 = "Health: Cancer - Head",
    Q2_19A = "Health: Do you have migration experience within Bangladesh? - Head",
    Q2_19B = "Health: Quality of health prior to last migration within Bangladesh? - Head",
    Q2_19C = "Health: Quality of health after last migration within Bangladesh? - Head",
    Q2_20A = "Health: Do you have migration experience outside Bangladesh? - Head",
    A15V2 = "Have migration experience? International",
    A15V3 = "Household: Have migration experience?: No migration"
)

# Access the description for a specific column
print(column_info["D1A_1V2F"])


$D1A_1V2F
[1] "Internal: Primary purpose of trip: fear wild animals - First, Head"



### Low Percentage of completion columns that have above required sample size 
D1A_1V6F # 91.5% missing still 2001 values 
D1A_1V10F # 91.5% missing 
D1A_1V2L # 91.5% missing 
D1A_1V5L # 91.5% missing 
D1A_1V6L # 91.5% missing 

In [None]:
# kept columns 

D1A_1V2F # missing 81.3%, but have 4404 values 
D1C_1V2F 
D1C_1V10F
D1C_1V2L
Q2_1
Q2_2 


# keep ? 
A15V2 # missing 95% of data 

# questions of interest 
Q2_11 
# health, gastrointestinal distress seems higher 
Q2_19A # missing 70% of data 


# Modeling 

## Trying to predict Occupation with Health Data 

In [24]:
# read in 003 
df <- read.csv(file_path_03)

# replace NA values with 99999
df <- replace(df, is.na(df), 99999)

In [26]:
# Filter the DataFrame to include only necessary columns
necessary_columns <- c(
    "A14", "Q2_1", "Q2_2", "Q2_3", "Q2_4V1", "Q2_4V13", "Q2_5", "Q2_6",
    "Q2_7", "Q2_8", "Q2_9", "Q2_10", "Q2_11", "Q2_12", "Q2_13", "Q2_14",
    "Q2_15", "Q2_16", "Q2_17", "Q2_18", "Q2_19A", "Q2_19B", "Q2_19C", "Q2_20A"
)
df <- df[, necessary_columns]

#Convert the DataFrame to factors if needed
df <- as.data.frame(lapply(df, as.factor))

# Convert variables to dummy variables if needed
df <- fastDummies::dummy_cols(df)

# Remove rows with no values for the independent variable (A14)
df <- df[!is.na(df$A14), ]

# Replace NA values with 0 if needed
df <- replace(df, is.na(df), 0)

# Print the head of the filtered DataFrame
print(head(df))


    A14  Q2_1  Q2_2  Q2_3 Q2_4V1 Q2_4V13  Q2_5  Q2_6  Q2_7  Q2_8  Q2_9 Q2_10
1    10 99999 99999 99999  99999   99999 99999 99999 99999 99999 99999 99999
2    17 150.7  59.2     1      1   99999     2 99999     1     2     2     2
3    14 99999 99999 99999  99999   99999 99999 99999 99999 99999 99999 99999
4 99999 99999 99999 99999  99999   99999 99999 99999 99999 99999 99999 99999
5 99999 99999 99999 99999  99999   99999 99999 99999 99999 99999 99999 99999
6     2 163.2  68.1     1  99999      13     1     3     1     2     2     1
  Q2_11 Q2_12 Q2_13 Q2_14 Q2_15 Q2_16 Q2_17 Q2_18 Q2_19A Q2_19B Q2_19C Q2_20A
1 99999 99999 99999 99999 99999 99999 99999 99999  99999  99999  99999  99999
2     3     3     3     3     3     2     3     3      2  99999  99999      2
3 99999 99999 99999 99999 99999 99999 99999 99999  99999  99999  99999  99999
4 99999 99999 99999 99999 99999 99999 99999 99999  99999  99999  99999  99999
5 99999 99999 99999 99999 99999 99999 99999 99999  99999  99999  99999 

In [27]:
# split the dataset into training and test sets
set.seed(123)  # For reproducibility
train_idx <- sample(1:nrow(df), 0.8 * nrow(df))  # 80% of data for training
train <- df[train_idx, ]
test <- df[-train_idx, ]

# print the head of the filtered DataFrame
print(head(df))

# print the dimensions of the training and test sets
print(dim(train))
print(dim(test))

    A14  Q2_1  Q2_2  Q2_3 Q2_4V1 Q2_4V13  Q2_5  Q2_6  Q2_7  Q2_8  Q2_9 Q2_10
1    10 99999 99999 99999  99999   99999 99999 99999 99999 99999 99999 99999
2    17 150.7  59.2     1      1   99999     2 99999     1     2     2     2
3    14 99999 99999 99999  99999   99999 99999 99999 99999 99999 99999 99999
4 99999 99999 99999 99999  99999   99999 99999 99999 99999 99999 99999 99999
5 99999 99999 99999 99999  99999   99999 99999 99999 99999 99999 99999 99999
6     2 163.2  68.1     1  99999      13     1     3     1     2     2     1
  Q2_11 Q2_12 Q2_13 Q2_14 Q2_15 Q2_16 Q2_17 Q2_18 Q2_19A Q2_19B Q2_19C Q2_20A
1 99999 99999 99999 99999 99999 99999 99999 99999  99999  99999  99999  99999
2     3     3     3     3     3     2     3     3      2  99999  99999      2
3 99999 99999 99999 99999 99999 99999 99999 99999  99999  99999  99999  99999
4 99999 99999 99999 99999 99999 99999 99999 99999  99999  99999  99999  99999
5 99999 99999 99999 99999 99999 99999 99999 99999  99999  99999  99999 

In [28]:
# Train a logistic regression model
model <- glm(A14 ~ ., data = train, family = binomial)

# Print the summary of the model
print(summary(model))

"glm.fit: algorithm did not converge"



Call:
glm(formula = A14 ~ ., family = binomial, data = train)

Coefficients: (1030 not defined because of singularities)
                Estimate Std. Error z value Pr(>|z|)
(Intercept)   -1.758e+02  1.483e+16   0.000    1.000
Q2_1131.5     -9.803e+02  2.941e+16   0.000    1.000
Q2_1132.2     -9.803e+02  2.941e+16   0.000    1.000
Q2_1133.2     -9.803e+02  2.941e+16   0.000    1.000
Q2_1133.6     -9.803e+02  2.941e+16   0.000    1.000
Q2_1134.1     -9.803e+02  2.941e+16   0.000    1.000
Q2_1134.7     -9.803e+02  2.941e+16   0.000    1.000
Q2_1134.8     -9.803e+02  2.941e+16   0.000    1.000
Q2_1135.1     -9.803e+02  2.941e+16   0.000    1.000
Q2_1135.3     -9.803e+02  2.941e+16   0.000    1.000
Q2_1135.5     -9.803e+02  2.941e+16   0.000    1.000
Q2_1135.6      2.024e+02  1.483e+16   0.000    1.000
Q2_1136.1     -9.803e+02  2.941e+16   0.000    1.000
Q2_1136.2     -9.803e+02  2.941e+16   0.000    1.000
Q2_1136.5     -9.803e+02  2.941e+16   0.000    1.000
Q2_1136.6     -9.803e+02  2.94

In [None]:
# INSERT CODE for evaluating the model's performance using metrics such as accuracy, precision, recall, and F1 score

## Next Model 

## Model 1 

Independent variable: 


Dependent variable: 

Who are these people? 
* A13: Household: Level of education (Highest level passed)
    1 No Schooling 2130 9.1 %
    2 Class I-IV (Incomplete PE) 5012 21.3 %
    3 Class V (Complete PE) 2212 9.4 %
    4 Class VI-IX (Incomplete SE) 5837 24.8 %
    5 SSC (Complete SE) 1341 5.7 %
    6 College (11 and 12 grades) 710 3.0 %
    7 HSC (complete HSE) 1057 4.5 %
    8 University level 2075 8.8 %
    9 Adult informal education 8 0.0 %
* 

In [10]:
df3 <-df[, c("D1A_1V1L", "A12", "A13", "A14", "D1A_7ML", "A15V1", "A15V4")]
# temporarily taking out D1A_6AL

#converting cateogorical variables to factor
df3 <- as.data.frame(lapply(df3, factor))
print(head(df3))

ERROR: Error in `[.data.frame`(df, , c("D1A_1V1L", "A12", "A13", "A14", "D1A_7ML", : undefined columns selected
