## Lab | Final regression model in "Health Care for All" Case


Import Datasets

In [None]:
import pandas as pd
import numpy as np
pd.set_option('display.max_columns', None)
import warnings
warnings.filterwarnings('ignore')
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split


# Load the three CSV inputs from the working directory, in this order.
numerical, categorical, targets = (
    pd.read_csv(csv_name)
    for csv_name in ('numerical.csv', 'categorical.csv', 'target.csv')
)

Concatenating all 3 datasets

In [None]:
# Place the three frames side by side (column-wise) in a single table;
# rows are matched on their index labels.
data = pd.concat(objs=[numerical, categorical, targets], axis='columns')

Filtering TARGET B for 1

In [None]:
# Keep only the rows whose TARGET_B flag equals 1.
data_all = pd.DataFrame(data.loc[data['TARGET_B'].eq(1)])
data_all

Dropping TARGET B

In [None]:
# TARGET_B is constant (always 1) after the filter, so it carries no information.
data_donated = data_all.drop(columns='TARGET_B')
data_donated

In [None]:
# Separate the regression target (TARGET_D) from the predictor columns.
X = data_donated.drop(columns='TARGET_D')
y = data_donated['TARGET_D']

##### Train-Test Split

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1234,
)

##### Splitting in categorical and numerical

In [None]:
# Split each partition by dtype. drop=True discards the old row labels instead
# of turning them into an extra "index" column that would be scaled and fed to
# the model as if it were a feature.
X_num_train = X_train.select_dtypes(include=np.number).reset_index(drop=True)
X_num_test = X_test.select_dtypes(include=np.number).reset_index(drop=True)

# `np.object` was removed in NumPy 1.24; the 'object' string alias selects the
# same columns on every NumPy version.
X_cat_train = X_train.select_dtypes(include='object')
X_cat_test = X_test.select_dtypes(include='object')

##### MinMax scaling

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Learn the per-column min/max from the training numerics only, then rescale them.
transformer = MinMaxScaler()
transformer.fit(X_num_train)
x_normalized = transformer.transform(X_num_train)
x_norm = pd.DataFrame(data=x_normalized, columns=X_num_train.columns)
x_norm.head()

In [None]:
# Rescale the test numerics with the scaler fitted on the training data,
# labelling the result with the training column names.
X_test_normalized = transformer.transform(X_num_test)
X_test_norm = pd.DataFrame(
    data=X_test_normalized,
    columns=X_num_train.columns,
)

##### One Hot Encoding

In [None]:
from sklearn.preprocessing import OneHotEncoder

# Fit the encoder on the training categoricals; drop='first' omits the first
# level of every feature from the encoded output.
encoder = OneHotEncoder(drop='first')
encoder.fit(X_cat_train)
cols = encoder.get_feature_names_out(input_features=X_cat_train.columns)
X_cat_train_encoded = pd.DataFrame(
    data=encoder.transform(X_cat_train).toarray(),
    columns=cols,
)
X_cat_train_encoded.head()

In [None]:
# Encode the test categoricals once with the encoder fitted on the training
# data, and build the frame from that result instead of transforming the same
# input a second time.
encoder_cat_test = encoder.transform(X_cat_test)
cols = encoder.get_feature_names_out(input_features=X_cat_test.columns)
X_cat_test_encoded = pd.DataFrame(encoder_cat_test.toarray(), columns=cols)
X_cat_test_encoded.head()

##### Concatenating the transformed data

In [None]:
# Build both design matrices with the SAME column order: encoded categoricals
# first, scaled numerics second. The fitted model must receive the test columns
# in the order it was trained on; if the two frames are ordered differently,
# each coefficient ends up multiplied by the wrong feature (or the predict call
# is rejected outright on newer scikit-learn versions).
X_train_transformed = pd.concat([X_cat_train_encoded, x_norm], axis=1)
X_test_transformed = pd.concat([X_cat_test_encoded, X_test_norm], axis=1)

In [None]:
# Stack the train and test matrices row-wise for the variance screen.
# ignore_index=True gives the result a fresh 0..n-1 index; without it both
# halves keep their own 0-based labels, and the duplicated labels break any
# later index-aligned operation on this frame.
# NOTE(review): this rebinds `x_norm` (previously the scaled training
# numerics), so re-running an earlier cell that reads `x_norm` after this one
# will pick up the stacked frame instead.
x_norm = pd.concat(
    [X_train_transformed, X_test_transformed],
    axis=0,
    ignore_index=True,
)
x_norm.head()

In [None]:
# NOTE(review): `numerical_scaled` is not referenced by any later cell visible
# here, and `x_norm` was rebound in the previous cell to the combined frame, so
# this places the X_test_norm columns beside columns of the same names.
# Confirm whether this cell is still needed.
numerical_scaled = pd.concat([x_norm,X_test_norm],axis =1)

##### Variance Threshold Feature Selection

In [None]:
from sklearn.feature_selection import VarianceThreshold

# Minimum variance a column must exceed in order to be kept.
var_threshold = 0.040
sel = VarianceThreshold(threshold=var_threshold)

# Notes on the threshold:
# 1) the default of 0 only eliminates columns that are identical in every row;
# 2) in practice the columns are scaled first and then thresholded, or each
#    column is given its own threshold.

# Fit the selector, then keep only the columns whose variance clears the threshold.
sel.fit(x_norm)
temp = pd.DataFrame(sel.transform(x_norm))
print(x_norm.shape)
print(temp.shape)

In [None]:
# Boolean mask over x_norm's columns: True where the column passed the
# variance threshold, False where the selector rejected it.
var_list = list(sel.get_support())

In [None]:
# Names of the columns the variance selector rejected (mask entry is False).
[name for name, kept in zip(x_norm.columns, var_list) if not kept]

##### Dropping all of the columns that were suggested

In [None]:
# Derive the drop list from the fitted selector rather than pasting the output
# of an earlier run. A hard-coded list silently goes stale whenever the data,
# the split seed or the variance threshold changes; the later `.drop` would
# then either raise on a missing name or keep columns that should have gone.
drop_col = [
    name
    for name, kept in zip(x_norm.columns, sel.get_support())
    if not kept
]

##### Dropping columns from the concatenated data

In [None]:
# Remove the low-variance columns from both design matrices.
X_train = X_train_transformed.drop(columns=drop_col)
X_test = X_test_transformed.drop(columns=drop_col)
X_test.shape

In [None]:
# Correlation matrix of the selected training features plus the target.
# y_train still carries the original row labels while X_train is 0-based, so
# the target is re-indexed positionally before joining; otherwise concat aligns
# on labels and pairs features with the wrong (or missing) target values.
corr = pd.concat([X_train, y_train.reset_index(drop=True)], axis=1).corr()
corr.shape

In [None]:
# Screen features by their correlation with the target (TARGET_D), measured on
# the training rows only. y_train is re-indexed positionally so that it lines
# up with X_train's 0-based index.
y_train_aligned = y_train.reset_index(drop=True)
target_corr = (
    pd.concat([X_train, y_train_aligned], axis=1)
    .corr()['TARGET_D']
    .drop('TARGET_D')  # the target always correlates perfectly with itself
)
# take out the columns that are negatively correlated with TARGET_D
# NOTE(review): a negative correlation can be as informative as a positive
# one; confirm this filter is really intended.
negative_corr = target_corr[target_corr < 0].index.tolist()
# take out the columns with suspiciously high correlation with TARGET_D
high_corr = target_corr[target_corr > .9].index.tolist()
to_drop = negative_corr + high_corr
print(len(X_train.columns))
# The names come from X_train itself, so a failed drop means the train and test
# columns have diverged; let that surface as an error instead of swallowing it.
X_train = X_train.drop(columns=to_drop)
X_test = X_test.drop(columns=to_drop)
corr = pd.concat([X_train, y_train_aligned], axis=1).corr()
print(len(X_train.columns))
print(len(X_test.columns))

In [None]:
from sklearn import linear_model
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error

In [None]:
# Display the training feature matrix that the model cell below is fitted on.
X_train

In [None]:
# Ordinary least-squares regression fitted on the selected training features.
lm = linear_model.LinearRegression()
lm.fit(X=X_train, y=y_train)

In [None]:
from sklearn import metrics

# Predict on both partitions with the fitted model.
predictions = lm.predict(X_train)
predictions_test = lm.predict(X_test)
# Call the metric through its module. The result is stored under the name
# `r2_score` (a later cell prints it), which shadows the imported function, so
# calling the bare name here would raise TypeError the second time this cell runs.
r2_score = metrics.r2_score(y_test, predictions_test)
r2_score

In [None]:
# Total number of missing values across the whole training matrix.
X_train.isna().to_numpy().sum()

In [None]:
# First five actual target values in the test partition.
y_test.head(5)

In [None]:
# First five test-set predictions, for comparison with the actual values.
predictions_test[:5]

In [None]:
# Report the test-set metrics; the MSE is computed once and reused for the RMSE.
mse_test = mean_squared_error(y_test, predictions_test)
mae_test = mean_absolute_error(y_test, predictions_test)
print('the r2 score is :', r2_score)
print('the mse error is:', mse_test)
print('the rmse error is:', np.sqrt(mse_test))
print('the mae is:', mae_test)

In [None]:
from sklearn.model_selection import cross_validate

# Cross-validate on the same reduced feature matrix the model was fitted on,
# so the fold scores describe the model that is actually being reported rather
# than one trained on the full, unfiltered feature set.
results = cross_validate(lm, X_train, y_train, cv=5)

In [None]:
# Display everything cross_validate returned; the 'test_score' entry is printed separately below.
results

In [None]:
# Per-fold scores, followed by their mean.
fold_scores = results['test_score']
print(fold_scores)
print(fold_scores.mean())

After several dozen tries with different feature selections, and after reviewing the columns meticulously, I was still getting a negative R2 score (to varying degrees). At this point, I don't understand how this is possible. I would welcome the opportunity to sit with one of the TAs to discuss whether I'm missing a step or approaching this problem incorrectly.