In [99]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn import linear_model

In [100]:
# load data
data = pd.read_csv("merged_train.csv")
data.head()

Unnamed: 0,State,County,FIPS,Total Population,"Percent White, not Hispanic or Latino","Percent Black, not Hispanic or Latino",Percent Hispanic or Latino,Percent Foreign Born,Percent Female,Percent Age 29 and Under,Percent Age 65 and Older,Median Household Income,Percent Unemployed,Percent Less than High School Degree,Percent Less than Bachelor's Degree,Percent Rural,Democratic,Republican,Party
0,AZ,apache,4001,72346,18.571863,0.486551,5.947806,1.719515,50.598513,45.854643,13.322091,32460,15.807433,21.758252,88.941063,74.061076,16298,7810,1
1,AZ,cochise,4003,128177,56.299492,3.714395,34.403208,11.458374,49.069646,37.902276,19.756275,45383,8.567108,13.409171,76.837055,36.301067,17383,26929,0
2,AZ,coconino,4005,138064,54.619597,1.342855,13.711033,4.825298,50.581614,48.946141,10.873943,51106,8.238305,11.085381,65.791439,31.466066,34240,19249,1
3,AZ,gila,4007,53179,63.222325,0.55285,18.548675,4.249798,50.29617,32.23829,26.397638,40593,12.129932,15.729958,82.262624,41.062,7643,12180,0
4,AZ,graham,4009,37529,51.461536,1.811932,32.097844,4.385942,46.313518,46.393456,12.315809,47422,14.424104,14.580797,86.675944,46.437399,3368,6870,0


In [141]:
# task 1 
# Partition dataset into training, validation sets using holdout method 75/25 split

# Xvariables has all the names except for State, County, FIPS, Party, Democratic, Republican
# Yvariables has Party, Democratic, Republican
Yvariables = ['Party','Democratic','Republican']
Xvariables = ['Total Population', 'Percent White, not Hispanic or Latino', 'Percent Black, not Hispanic or Latino','Percent Hispanic or Latino','Percent Foreign Born','Percent Female','Percent Age 29 and Under','Percent Age 65 and Older','Median Household Income','Percent Unemployed','Percent Less than High School Degree','Percent Less than Bachelor\'s Degree','Percent Rural']
X_train, X_val, Y_train, Y_val = train_test_split(data[Xvariables], data[Yvariables], test_size = 0.25, random_state = 1)
X_train.head()

Unnamed: 0,Total Population,"Percent White, not Hispanic or Latino","Percent Black, not Hispanic or Latino",Percent Hispanic or Latino,Percent Foreign Born,Percent Female,Percent Age 29 and Under,Percent Age 65 and Older,Median Household Income,Percent Unemployed,Percent Less than High School Degree,Percent Less than Bachelor's Degree,Percent Rural
943,15919,91.940449,5.207614,1.432251,1.300333,51.077329,31.66028,23.902255,45538,4.560986,15.537543,83.711604,52.393846
853,76,72.368421,0.0,15.789474,11.842105,47.368421,11.842105,25.0,56875,0.0,25.373134,97.014925,100.0
578,60878,95.579684,0.877164,1.404448,1.342028,50.962581,40.464536,16.324452,48619,6.093624,10.433639,78.108081,55.700005
1035,54562,95.484037,1.268282,1.414904,1.611011,50.500348,32.005792,20.301675,43835,6.425857,15.913833,75.223085,71.676143
822,54217,85.32748,0.673221,11.634727,3.225925,50.709187,32.939853,23.861519,58315,3.243313,11.760327,74.23305,32.845532


In [142]:
# task 2
# standardize the training and validation sets by using X_train as the scalar and applying to the training and validation sets
scaler = StandardScaler()
scaler.fit(X_train) # find the mean and standard diviation for the columns in X_train
x_train_scaled = scaler.transform(X_train) # scales X_train using the results from fit method
x_val_scaled = scaler.transform(X_val)  # scales X_val using the results from fit method
x_train_scaled

array([[-0.345068  ,  0.65924303, -0.04363108, ...,  0.33240678,
         0.56141981, -0.09521071],
       [-0.39488816, -0.34555147, -0.59845393, ...,  1.87960605,
         2.01297272,  1.36549077],
       [-0.20368919,  0.84607517, -0.50500024, ..., -0.47046883,
        -0.04999222,  0.00623229],
       ...,
       [-0.30811876,  0.79654542, -0.54416126, ..., -1.11037852,
        -1.07924513,  0.41391348],
       [ 0.2187051 , -0.47707575,  1.33755335, ..., -0.29816431,
         0.07718104, -0.74801204],
       [-0.38811151,  0.67491051, -0.59845393, ..., -1.54891473,
        -0.30144791,  1.36549077]])

In [156]:
# task 3
# predict democratic votes using 2 predictors
model = linear_model.LinearRegression()
fitted_model = model.fit(X=X_train[['Percent Foreign Born', 'Median Household Income']],y=Y_train['Democratic'])
predicted = fitted_model.predict(X_train[['Percent Foreign Born','Median Household Income']])
print (predicted)

# Evaluate linear regression model using 2 predictors on democratic votes
X_train_dummy = pd.get_dummies(X_train, drop_first = True)
X_val_dummy = pd.get_dummies(X_val, drop_first = True)
model = linear_model.LinearRegression().fit(X = X_train_dummy[['Percent Foreign Born', 'Median Household Income']], y = Y_train['Democratic'])
score_val = model.score(X = X_val_dummy[['Percent Foreign Born', 'Median Household Income']], y = Y_val['Democratic']) # R squared (validation)
print(score_val)

# Evaluate LASSO regression model on democratic votes
model = linear_model.Lasso(alpha = 1).fit(X = X_train_dummy, y = Y_train['Democratic'])
score_val = model.score(X = X_val_dummy, y = Y_val['Democratic']) # R squared (validation)
print(score_val)

# predict republican votes using 2 predictors
model = linear_model.LinearRegression()
fitted_model = model.fit(X=X_train[['Percent Foreign Born', 'Median Household Income']],y=Y_train['Republican'])
predicted = fitted_model.predict(X_train[['Percent Foreign Born', 'Median Household Income']])
print (predicted)

# Evaluate linear regression model using 2 predictors on republican votes
X_train_dummy = pd.get_dummies(X_train, drop_first = True)
X_val_dummy = pd.get_dummies(X_val, drop_first = True)
model = linear_model.LinearRegression().fit(X = X_train_dummy[['Percent Foreign Born', 'Median Household Income']], y = Y_train['Republican'])
score_val = model.score(X = X_val_dummy[['Percent Foreign Born', 'Median Household Income']], y = Y_val['Republican']) # R squared (validation)
print(score_val)

# Evaluate LASSO regression model on republican votes
model = linear_model.Lasso(alpha = 1).fit(X = X_train_dummy, y = Y_train['Republican'])
score_val = model.score(X = X_val_dummy, y = Y_val['Republican']) # R squared (validation)
print(score_val)

# it seems like lasso regression is better

[-8.64500143e+02  7.34376317e+04  2.55502563e+03 -7.71882399e+02
  2.37294394e+04  2.55936755e+04 -3.96595520e+03  1.27399410e+04
  4.61918278e+03  8.78250688e+03  4.59768939e+04 -1.31442070e+04
  3.58378972e+03  1.03745826e+05  1.53726698e+05 -8.47852415e+03
  1.28514163e+04  3.21290605e+04  4.86999821e+04  1.73089390e+04
  9.98973568e+02  2.31113216e+04 -9.04997898e+03  1.25519808e+03
  1.15681834e+04  5.22840254e+04  2.17122790e+03  9.62024998e+02
  1.35764442e+04  5.95582311e+03  8.47764802e+04 -4.44991529e+03
 -7.16743463e+02  1.38431985e+05  1.32033382e+05  3.25366947e+04
  2.58102023e+04  5.07805093e+03  1.38532115e+05  3.71478571e+04
 -6.21961324e+02 -8.40920238e+03  1.54228764e+05  1.98910791e+04
  1.24689928e+05  3.19162925e+04  1.09811268e+05  1.09691847e+04
  3.30036029e+04  2.62358862e+04 -3.09282466e+03  4.94857099e+04
  1.24629910e+04  5.87986862e+04 -3.18127421e+03  6.66043275e+03
  7.95044189e+02  3.29243621e+04  3.09045540e+04  3.04105439e+04
  5.22325038e+04  2.50667