In [695]:
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np
YR = 2015
INPUT_PATH = f'../2019_MCMProblemC_DATA/ACS_{YR%2000}_5YR_DP02/ACS_{YR%2000}_5YR_DP02_with_ann.csv'
EXPECTED_PATH = '../generated_data/SVM_Input2.xlsx'
TEST_RATIO = 0.8

## Prepare Dataset

In [696]:
def sumRowsDF(df):
    return df.drop([0]).apply(pd.to_numeric).sum(axis=1)

## Combine Specific Features into More General Groups

In [697]:
uscbYr = pd.read_csv(INPUT_PATH)
divorced = sumRowsDF(uscbYr.loc[:, ['HC01_VC41', 'HC01_VC40', 'HC01_VC39']])
# group education attainment
notCollegeGrad = sumRowsDF(uscbYr.loc[:, ['HC01_VC86', 'HC01_VC87', 'HC01_VC88', 'HC01_VC89']])
collegeGrad = sumRowsDF(uscbYr.loc[:, ['HC01_VC90', 'HC01_VC91', 'HC01_VC92']])
# group ethnicity
arab = uscbYr.loc[:, 'HC01_VC187']
arab = arab.rename(columns=arab.iloc[0]).drop([0])

africa = uscbYr.loc[:, 'HC01_VC207']
africa = africa.rename(columns=africa.iloc[0]).drop([0])

europe = sumRowsDF(uscbYr.loc[:, [
    'HC01_VC189', # Danish
    'HC01_VC190', # Dutch
    'HC01_VC192', # French
    'HC01_VC191', # German
    'HC01_VC192', # Greek
    'HC01_VC197', # Irish
    'HC01_VC198', # Italian
    'HC01_VC199', # Lithuanian
    'HC01_VC200', # Norwegian
    'HC01_VC201', # Polish
    'HC01_VC202', # Portuguese
    'HC01_VC204', # Scotch-Irish
    'HC01_VC205', # Scottish
    'HC01_VC208', # Swedish
    'HC01_VC209', # Swiss
    'HC01_VC211' # Welsh
]])
slavonic = sumRowsDF(uscbYr.loc[:, [
    'HC01_VC188', # Czech
    'HC01_VC196', # Hungarian
    'HC01_VC203', # Russian
    'HC01_VC206', # Slovak
    'HC01_VC210' # Ukrainian
]])
na = sumRowsDF(uscbYr.loc[:, [
    'HC01_VC191', # English
    'HC01_VC193', # French Canadian
    'HC01_VC212' # West Indian
]])
neverMarried = sumRowsDF(uscbYr.loc[:, [
    'HC01_VC37', # male, never married
    'HC01_VC44' # female, never married
]])
marriedSeparate = sumRowsDF(uscbYr.loc[:, [
    'HC01_VC38', # male, never married
    'HC01_VC45' # female, never married
]])
household = uscbYr.loc[:, [
    'HC01_VC03', # total households
    'HC01_VC21' # avg household size
]]
houshold = household.rename(columns=generalStats.iloc[0]).drop([0])
cleanedData = pd.concat([
    household,
    neverMarried,
    marriedSeparate,
    divorced,
    notCollegeGrad,
    collegeGrad,
    #arab,
    africa,
    europe,
    slavonic,
    na
], axis=1)

In [698]:
expected = pd.read_excel(EXPECTED_PATH).iloc[:, YR - 2001]
counties = uscbYr.iloc[:, 2]

## Split test and validation sets

In [699]:
partition = int(TEST_RATIO*uscbYr.shape[0])
uscbTest = cleanedData[:partition]
expectedTest = expected[:partition]
uscbVal = cleanedData[partition:]
expectedVal = expected[partition:]

## Scale data, run SVM

In [700]:
poly_kernel_svm_clf = Pipeline([
    ("scaler", StandardScaler()),
    ("svm_clf", SVC(kernel="poly", coef0=1, C=5))
])
poly_kernel_svm_clf.fit(uscbTest, expectedTest)

Pipeline(memory=None,
     steps=[('scaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('svm_clf', SVC(C=5, cache_size=200, class_weight=None, coef0=1,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='poly',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False))])

In [701]:
pred = poly_kernel_svm_clf.predict(uscbVal)
accuracy_score = np.mean(pred == expectedVal)

## Results

In [702]:
accuracy_score

0.967391304347826