# Eastern Washington Digital Equity

## Nicholas Tran

# Preparation

## Import The Modules

In [561]:
import numpy as np  # matrix and array manipulation
import pandas as pd  # dataframe manipulation
import plotly.express as px
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from pingouin import cronbach_alpha
from scipy.stats import pearsonr
from sklearn import linear_model as lm
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler  # scale the data
from statsmodels.stats.outliers_influence import variance_inflation_factor as VIF
from factor_analyzer.factor_analyzer import (
    calculate_kmo,
)  # get measure of sampling adequacy

# initialize the scaler
scaler = StandardScaler()


# use this as a method in corr() to get the pearson p values
def pearsonr_pval(x, y):
    return pearsonr(x, y)[1]


# turn scientific notation into decimals
pd.options.display.float_format = "{:.10f}".format

## Import The Dataset

In [562]:
dataset = pd.read_csv("../data/combined_data.csv")
dataset = dataset.drop(
    columns=[
        "GEOID",
        "tract",
        "has_computer",
        "smartphone",
        "desktop_or_laptop",
        "tablet_or_portable",
        "foreign_born",
        "broadband",
        # "median_income",
        "naturalized_citizen",
        "native_citizen",
        "desktop_or_laptop_only",
        "satellite",
        "dial_up",
        "other_internet_service",
        "not_citizen",
        "no_internet_access",
        # "number_providers",
        "mean_income",
        # "no_computer",
        # "internet_subscription",
        # "work_from_home",
        "mean_lat_ms",
        "mean_u_mbps",
        "access_with_no_subscription",
        "sixty_five_and_older"
    ]
)
dataset.head()

Unnamed: 0,work_from_home,smartphone_only,no_computer,internet_subscription,median_income,mean_d_mbps,number_providers,lowest_cost
0,82,153,76,910,52589.0,62.6586413043,12.0,30.0
1,47,68,64,487,59886.0,61.9772073171,12.0,30.0
2,3,76,183,327,39928.0,88.0868093023,8.0,30.0
3,26,132,55,515,58884.0,87.0107878788,8.0,30.0
4,110,97,24,557,50915.0,74.1417594937,8.0,30.0


In [563]:
dataset.columns

Index(['work_from_home', 'smartphone_only', 'no_computer',
       'internet_subscription', 'median_income', 'mean_d_mbps',
       'number_providers', 'lowest_cost'],
      dtype='object')

## Create A Class for The Data

In [1]:
class PCA:
    """Input a df and get many things back.
    https://stackoverflow.com/questions/13224362/principal-component-analysis-pca-in-python
    """

    def __init__(self, df):
        import numpy as np
        from scipy import linalg as LA

        self.data = df

        # scale data
        self.scaled = pd.DataFrame(scaler.fit_transform(df), columns=df.columns)

        # kmo, total kmo
        self.kmo, self.total_kmo = calculate_kmo(self.scaled)

        # center data
        self.center = self.scaled.apply(lambda x: x - x.mean())

        # covariance
        self.cov = pd.DataFrame(
            np.cov(self.center, rowvar=False),
            columns=self.scaled.columns,
            index=self.scaled.columns,
        )

        # eigenvalues and loadings(eigenvectors)
        self.eigenvalues, self.loadings = LA.eigh(self.cov)

        # sort eigenvalues and loadings from
        sorter = np.argsort(self.eigenvalues)[::-1]
        self.loadings = self.loadings[:, sorter]
        self.eigenvalues = self.eigenvalues[sorter]

        pc_list = ["pc" + str(i + 1) for i in range(len(self.eigenvalues))]

        # turn into dataframe
        self.loadings = pd.DataFrame(
            self.loadings, index=self.scaled.columns, columns=pc_list
        )
        self.eigenvalues = pd.DataFrame(self.eigenvalues, index=pc_list)

        # pca scores - scaled data * loadings
        self.scores = self.scaled @ self.loadings

        # percent explained
        explained_variance = self.eigenvalues / self.eigenvalues.sum() * 100
        self.percent_explained = pd.DataFrame(explained_variance).round(2)

        self.percent_explained[
            "cumulative_explained_variance"
        ] = self.percent_explained.cumsum().round(2)
        self.percent_explained.columns.values[0] = "explained_variance"

        # scree plot
        self.scree = (
            px.line(
                self.percent_explained,
                x=pc_list,
                y="cumulative_explained_variance",
                text="cumulative_explained_variance",
                color=px.Constant("cumulative explained variance"),
            )
            .update_traces(textposition="top left")
            .add_bar(
                x=pc_list,
                y=self.percent_explained.explained_variance,
                name="explained variance",
                text=self.percent_explained.explained_variance,
            )
        )

    def calculate_weights(self, number_of_components):
        """calculate coefficients using your eigenvalues. Multiplies each row by the respective
        eigenvalue. Row 1 of loadings will be multiplied by eigenvalue 1. Row 2 with 2. You may
        only use this when you have at least 2 pcs.

        Args:
            number_of_components (int): number of pcs you want to use.

        Returns:
            DataFrame: Returns a dataframe of weights.
        """
        weights = (
            self.loadings.iloc[:, 0:number_of_components]
            .mul(
                [
                    float(self.eigenvalues.iloc[i, :])
                    for i in range(len(self.eigenvalues))
                ],
                axis=0,
            )
            .sum(axis=1)
        )
        return weights


data = PCA(dataset)
data.weights = data.calculate_weights(3)
data.scaled["index"] = data.scaled @ data.weights


NameError: name 'dataset' is not defined

In [565]:
data.eigenvalues


Unnamed: 0,0
pc1,2.2642864711
pc2,2.0337426503
pc3,1.4129920527
pc4,0.7009338232
pc5,0.6080935676
pc6,0.4285827162
pc7,0.3531377115
pc8,0.2421870514


In [566]:
data.percent_explained


Unnamed: 0,explained_variance,cumulative_explained_variance
pc1,28.15,28.15
pc2,25.28,53.43
pc3,17.57,71.0
pc4,8.71,79.71
pc5,7.56,87.27
pc6,5.33,92.6
pc7,4.39,96.99
pc8,3.01,100.0


In [567]:
data.scree


https://stackoverflow.com/questions/52741236/how-to-calculate-p-values-for-pairwise-correlation-of-columns-in-pandas

In [568]:
data.scaled.corr(pearsonr_pval).iloc[:, -1:]


Unnamed: 0,index
work_from_home,0.0085073617
smartphone_only,0.0
no_computer,0.0
internet_subscription,0.0
median_income,0.0002085952
mean_d_mbps,0.0017105449
number_providers,0.1016892245
lowest_cost,0.0094710914
index,1.0


In [569]:
pvals = data.scaled.corr(pearsonr_pval).loc[
    data.scaled.corr(pearsonr_pval)["index"] >= 0.05
]
pvals.loc[:, ["index"]]


Unnamed: 0,index
number_providers,0.1016892245
index,1.0


In [570]:
insig_vars = list(pvals.index)
# remove index col
del insig_vars[-1]
insig_vars

['number_providers']

In [571]:
data2 = PCA(dataset.drop(insig_vars, axis=1))
data2.weights = -data2.calculate_weights(2)
data.scaled["index"] = data2.scaled @ data2.weights


In [572]:
data2.eigenvalues

Unnamed: 0,0
pc1,2.0538834114
pc2,1.9068347455
pc3,1.1845017844
pc4,0.6880969678
pc5,0.6068890942
pc6,0.3559646439
pc7,0.2422908913


In [573]:
data2.scree

In [574]:
# set y as index. we want to predict the index
y = data.scaled["index"]

# set x as the data we will use to predict y
X = data.scaled.drop("index", axis=1)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.5, random_state=55
)
X_test, X_valid, y_test, y_valid = train_test_split(
    X_test, y_test, test_size=0.4, random_state=55
)


In [575]:
print(X_train.shape, X_test.shape, X_valid.shape)


(91, 8) (55, 8) (37, 8)


In [576]:
lr = lm.LinearRegression()
sfs = SFS(
    lr,
    k_features="parsimonious",
    verbose=0,
    forward=True,
    scoring="r2",  # picks model on r2
    cv=5,
    n_jobs=-1,
)

sfs.fit(X_train, y_train)


In [577]:
results = pd.DataFrame.from_dict(sfs.get_metric_dict()).T
results


Unnamed: 0,feature_idx,cv_scores,avg_score,feature_names,ci_bound,std_dev,std_err
1,"(0,)","[0.9245147756590134, 0.8120314707973161, 0.870...",0.8644728675,"(work_from_home,)",0.0772410814,0.0600961855,0.0300480927
2,"(0, 3)","[0.9590702448163114, 0.923527095437384, 0.9271...",0.9252892685,"(work_from_home, internet_subscription)",0.0474817985,0.0369424523,0.0184712262
3,"(0, 3, 4)","[0.9853485514851957, 0.9636897726506866, 0.981...",0.967612782,"(work_from_home, internet_subscription, median...",0.0210759096,0.0163977737,0.0081988868
4,"(0, 3, 4, 5)","[0.9941784026360071, 0.9955614966461233, 0.998...",0.9947624468,"(work_from_home, internet_subscription, median...",0.0029026011,0.0022583223,0.0011291612
5,"(0, 3, 4, 5, 7)","[0.9965007420426125, 0.9988796962051927, 0.998...",0.9979087649,"(work_from_home, internet_subscription, median...",0.0011158293,0.0008681531,0.0004340766
6,"(0, 2, 3, 4, 5, 7)","[0.9987795822531295, 0.9994596841600869, 0.999...",0.999190091,"(work_from_home, no_computer, internet_subscri...",0.0003579675,0.0002785109,0.0001392554
7,"(0, 1, 2, 3, 4, 5, 7)","[1.0, 1.0, 1.0, 1.0, 1.0]",1.0,"(work_from_home, smartphone_only, no_computer,...",0.0,0.0,0.0
8,"(0, 1, 2, 3, 4, 5, 6, 7)","[1.0, 1.0, 1.0, 1.0, 1.0]",1.0,"(work_from_home, smartphone_only, no_computer,...",0.0,0.0,0.0


In [578]:
largest_before_1 = results[results["avg_score"] != 1].tail(1).index.to_list()

In [579]:
largest_before_1 = largest_before_1[0] - 1

In [580]:
model_vars = list(results.iloc[largest_before_1, 3])
model_vars

['work_from_home',
 'no_computer',
 'internet_subscription',
 'median_income',
 'mean_d_mbps',
 'lowest_cost']

In [581]:
final_X = data.scaled[model_vars]
final_y = data.scaled["index"]
model = lr
model.fit(final_X, final_y)
r2 = model.score(final_X, final_y)
observations = final_X.shape[0]
predictors = final_X.shape[1]
adj_r2 = 1 - (1 - r2) * (observations - 1) / (observations - predictors - 1)
print(f"adjr2: {adj_r2}")


adjr2: 0.9992836762477292


In [582]:
fin_df = pd.DataFrame(-model.coef_, model.feature_names_in_)
fin_df["vif"] = [VIF(final_X.values, i) for i in range(final_X.shape[1])]
fin_df

Unnamed: 0,0,vif
work_from_home,1.4004396025,1.6738694104
no_computer,-0.0703200361,1.3793897511
internet_subscription,0.4679108164,1.6202267798
median_income,0.3767310587,1.7724465461
mean_d_mbps,0.2164096462,1.5331091743
lowest_cost,0.1319262967,1.4404103588


In [583]:
coefficients = -model.coef_
coefficients

array([ 1.4004396 , -0.07032004,  0.46791082,  0.37673106,  0.21640965,
        0.1319263 ])

In [584]:
final_vars = list(model.feature_names_in_)
final_vars

['work_from_home',
 'no_computer',
 'internet_subscription',
 'median_income',
 'mean_d_mbps',
 'lowest_cost']

In [585]:
final_data = data.scaled[final_vars]

In [586]:
final_data["index"] = final_data.mul(coefficients).sum(axis=1)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [587]:
final_data

Unnamed: 0,work_from_home,no_computer,internet_subscription,median_income,mean_d_mbps,lowest_cost,index
0,-0.5115079191,-0.3341503606,-0.7430128489,-0.3742606917,-1.6263184336,-0.9471266092,-1.6583997596
1,-0.8676949403,-0.4867380487,-1.5525370030,-0.0492787646,-1.6343848222,-0.9471266092,-2.4045881650
2,-1.3154729099,1.0264231913,-1.8587399927,-0.9381357333,-1.3253157117,-0.9471266092,-3.5493299006
3,-1.0814071531,-0.6011788147,-1.4989514798,-0.0939042122,-1.3380529792,-0.9471266092,-2.6234412094
4,-0.2265583020,-0.9953636755,-1.4185731950,-0.4488145831,-1.4903884747,-0.9471266092,-1.5276206923
...,...,...,...,...,...,...,...
178,-1.0407000649,-0.6647570181,0.6119353806,-1.7800513629,0.1229907201,-2.1620799491,-2.0535801212
179,0.0990984031,0.2380534696,-0.1535720937,0.2185629936,-0.8491271755,-2.1620799491,-0.3364716886
180,-0.8269878522,0.4923662831,-0.3870518734,-0.2711589840,-1.8089348075,-2.1620799491,-2.1527356654
181,0.2313964396,0.5305132051,0.0301497001,-0.3293680259,-0.3223962253,-2.1620799491,-0.1782296180


In [588]:
cronbach_alpha(final_data)

(0.6929522540438978, array([0.62 , 0.756]))