# Eastern Washington Digital Equity

## Nicholas Tran

# Preparation

## Import The Modules

In [1]:
import numpy as np  # matrix and array manipulation
import pandas as pd  # dataframe manipulation
import plotly.express as px
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from pingouin import cronbach_alpha
from scipy.stats import pearsonr
from sklearn import linear_model as lm
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler  # scale the data
from statsmodels.stats.outliers_influence import variance_inflation_factor as VIF
from factor_analyzer.factor_analyzer import (
    calculate_kmo,
)  # get measure of sampling adequacy

# initialize the scaler
scaler = StandardScaler()


# use this as a method in corr() to get the pearson p values
def pearsonr_pval(x, y):
    return pearsonr(x, y)[1]


# turn scientific notation into decimals
pd.options.display.float_format = "{:.10f}".format

## Import The Dataset

In [2]:
dataset = pd.read_csv("../app/data/combined_data.csv")
dataset = dataset.drop(
    columns=[
        "GEOID",
        "NAME",
    ]
)
dataset.head()

Unnamed: 0,has_computer,with_internet,sixtyfive_and_older,median_income,number_providers,mean_lowest_cost,mean_d_mbps,mean_u_mbps,mean_lat_ms
0,1664,1569,1337,73942,12.0,58.0,110.3274089069,41.7821012146,51.7044534413
1,1872,1780,1352,72988,13.0,60.0769230769,145.7848027682,22.8873079585,24.6262975779
2,1279,1156,396,38077,11.0,60.0,173.4078640777,27.0620194175,29.9805825243
3,941,805,647,38777,8.0,62.5,127.6004568966,23.5382931034,22.1637931034
4,1345,1198,635,42639,8.0,62.5,192.4299478261,21.6856521739,21.6608695652


In [3]:
dataset.columns

Index(['has_computer', 'with_internet', 'sixtyfive_and_older', 'median_income',
       'number_providers', 'mean_lowest_cost', 'mean_d_mbps', 'mean_u_mbps',
       'mean_lat_ms'],
      dtype='object')

## Create A Class for The Data

In [4]:
class PCA:
    """Input a df and get many things back.
    https://stackoverflow.com/questions/13224362/principal-component-analysis-pca-in-python
    """

    def __init__(self, df):
        import numpy as np
        from scipy import linalg as LA

        self.data = df

        # scale data
        self.scaled = pd.DataFrame(scaler.fit_transform(df), columns=df.columns)

        # kmo, total kmo
        self.kmo, self.total_kmo = calculate_kmo(self.scaled)

        # center data
        self.center = self.scaled.apply(lambda x: x - x.mean())

        # covariance
        self.cov = pd.DataFrame(
            np.cov(self.center, rowvar=False),
            columns=self.scaled.columns,
            index=self.scaled.columns,
        )

        # eigenvalues and loadings(eigenvectors)
        self.eigenvalues, self.loadings = LA.eigh(self.cov)

        # sort eigenvalues and loadings from
        sorter = np.argsort(self.eigenvalues)[::-1]
        self.loadings = self.loadings[:, sorter]
        self.eigenvalues = self.eigenvalues[sorter]

        pc_list = ["pc" + str(i + 1) for i in range(len(self.eigenvalues))]

        # turn into dataframe
        self.loadings = pd.DataFrame(
            self.loadings, index=self.scaled.columns, columns=pc_list
        )
        self.eigenvalues = pd.DataFrame(self.eigenvalues, index=pc_list)

        # pca scores - scaled data * loadings
        self.scores = self.scaled @ self.loadings

        # percent explained
        explained_variance = self.eigenvalues / self.eigenvalues.sum() * 100
        self.percent_explained = pd.DataFrame(explained_variance).round(2)

        self.percent_explained[
            "cumulative_explained_variance"
        ] = self.percent_explained.cumsum().round(2)
        self.percent_explained.columns.values[0] = "explained_variance"

        # scree plot
        self.scree = (
            px.line(
                self.percent_explained,
                x=pc_list,
                y="cumulative_explained_variance",
                text="cumulative_explained_variance",
                color=px.Constant("cumulative explained variance"),
            )
            .update_traces(textposition="top left")
            .add_bar(
                x=pc_list,
                y=self.percent_explained.explained_variance,
                name="explained variance",
                text=self.percent_explained.explained_variance,
            )
        )

    def calculate_weights(self, number_of_components):
        """calculate coefficients using your eigenvalues. Multiplies each row by the respective
        eigenvalue. Row 1 of loadings will be multiplied by eigenvalue 1. Row 2 with 2. You may
        only use this when you have at least 2 pcs.

        Args:
            number_of_components (int): number of pcs you want to use.

        Returns:
            DataFrame: Returns a dataframe of weights.
        """
        weights = (
            self.loadings.iloc[:, 0:number_of_components]
            .mul(
                [
                    float(self.eigenvalues.iloc[i, :])
                    for i in range(len(self.eigenvalues))
                ],
                axis=0,
            )
            .sum(axis=1)
        )
        return weights


data = PCA(dataset)
data.weights = data.calculate_weights(3)
data.scaled["index"] = data.scaled @ data.weights


In [5]:
data3 = PCA(dataset)
data3.weights = data3.calculate_weights(1)
data3.scaled["index"] = data3.scaled @ data3.weights

In [29]:
-(data.weights)

has_computer           0.4928275307
with_internet          0.3898000846
sixtyfive_and_older   -0.4958515824
median_income         -0.0360867754
number_providers      -0.0458754832
mean_lowest_cost      -0.1168199233
mean_d_mbps            0.2830632037
mean_u_mbps            0.1683880828
mean_lat_ms           -0.0083437527
dtype: float64

In [28]:
-(data2.weights)

has_computer          -0.0375724530
with_internet          0.0657226189
sixtyfive_and_older   -0.1736790963
number_providers      -0.8881681625
mean_lowest_cost       0.4067865234
mean_d_mbps            0.2283296702
mean_u_mbps           -0.0132305210
mean_lat_ms           -0.0062663003
dtype: float64

In [27]:
-(data3.weights)

has_computer           1.4062236668
with_internet          1.0192632921
sixtyfive_and_older    0.2745504261
median_income          0.0568933230
number_providers      -0.2159936742
mean_lowest_cost       0.0843880270
mean_d_mbps            0.1409653539
mean_u_mbps            0.0391407695
mean_lat_ms           -0.0035762768
dtype: float64

In [6]:
pvals = data3.scaled.corr(pearsonr_pval).loc[
    data3.scaled.corr(pearsonr_pval)["index"] <= 0.05
]
pvals.loc[:, ["index"]]


Unnamed: 0,index
has_computer,0.0
with_internet,0.0
sixtyfive_and_older,0.0
median_income,0.0182518429
number_providers,0.0030140583
mean_lowest_cost,0.0144650028
mean_d_mbps,6.97e-08
mean_u_mbps,0.0069500518
mean_lat_ms,0.0001812135


In [7]:
data3.scaled.corr()

Unnamed: 0,has_computer,with_internet,sixtyfive_and_older,median_income,number_providers,mean_lowest_cost,mean_d_mbps,mean_u_mbps,mean_lat_ms,index
has_computer,1.0,0.9894357549,0.6307362243,0.1629198586,-0.1158696673,0.1088418476,0.3055403227,0.1964818898,-0.2013067357,-0.9881881244
with_internet,0.9894357549,1.0,0.6252920849,0.1604088786,-0.1288266448,0.1364057855,0.3450733917,0.1990615088,-0.2367166307,-0.9901033184
sixtyfive_and_older,0.6307362243,0.6252920849,1.0,0.1485403993,0.0529299215,0.0505012498,-0.0423267904,-0.1692074005,0.1020176554,-0.6582774684
median_income,0.1629198586,0.1604088786,0.1485403993,1.0,0.0219960412,-0.0537768136,-0.0477334553,-0.0181592068,0.0585207055,-0.1743469861
number_providers,-0.1158696673,-0.1288266448,0.0529299215,0.0219960412,1.0,-0.3294582912,-0.4802079801,-0.0162728201,0.4468024575,0.2181233426
mean_lowest_cost,0.1088418476,0.1364057855,0.0505012498,-0.0537768136,-0.3294582912,1.0,0.1966428248,0.0843635563,-0.1810913885,-0.1805260228
mean_d_mbps,0.3055403227,0.3450733917,-0.0423267904,-0.0477334553,-0.4802079801,0.1966428248,1.0,0.4043996639,-0.8315610286,-0.3856933725
mean_u_mbps,0.1964818898,0.1990615088,-0.1692074005,-0.0181592068,-0.0162728201,0.0843635563,0.4043996639,1.0,-0.4211462822,-0.1988979855
mean_lat_ms,-0.2013067357,-0.2367166307,0.1020176554,0.0585207055,0.4468024575,-0.1810913885,-0.8315610286,-0.4211462822,1.0,0.2733380225
index,-0.9881881244,-0.9901033184,-0.6582774684,-0.1743469861,0.2181233426,-0.1805260228,-0.3856933725,-0.1988979855,0.2733380225,1.0


In [8]:
data.loadings

Unnamed: 0,pc1,pc2,pc3,pc4,pc5,pc6,pc7,pc8,pc9
has_computer,-0.4599330157,0.3645541824,-0.0658100712,0.1030046567,-0.0020124875,0.2024724662,0.3176726339,-0.0920832487,0.7001618154
with_internet,-0.4726794787,0.3461412505,-0.0542300799,0.0996952198,0.0007207665,0.1493424729,0.3315990835,-0.0633792481,-0.7124617405
sixtyfive_and_older,-0.2404549975,0.5093231719,0.1654053857,0.1181182965,-0.1113232689,-0.2892429979,-0.7346965896,0.0749970045,0.0015542064
median_income,-0.0601137431,0.2171500761,-0.1189068825,-0.9452901001,0.1858748525,-0.0826221977,-0.0030092647,0.0113206672,0.0026193145
number_providers,0.2678621444,0.2950848253,-0.5060550011,0.2238628454,0.2792463404,-0.6281213636,0.247267641,0.0323400768,0.0163232028
mean_lowest_cost,-0.1897657037,-0.1367443265,0.5892062411,0.0698436374,0.7275385176,-0.2341802968,0.093370699,0.0018038822,0.0193336799
mean_d_mbps,-0.4303746918,-0.3465275353,-0.0873047267,-0.0484725848,-0.2182100276,-0.3113621644,0.1030934405,0.7268550329,0.0348722708
mean_u_mbps,-0.2452356751,-0.2388442646,-0.570952081,0.103778477,0.5079292254,0.3474238836,-0.4069605848,0.0390156881,-0.0149008535
mean_lat_ms,0.3879167451,0.3972659731,0.1198594669,0.0617521464,0.2003860575,0.4233050622,0.0568635403,0.6714621662,-0.0091692669


In [9]:
dataset2 = dataset.drop(columns=["median_income"], axis=1)
data2 = PCA(dataset2)
data2.weights = data2.calculate_weights(3)
data2.scaled["index"] = data2.scaled @ data2.weights

In [10]:
data2.scaled

Unnamed: 0,has_computer,with_internet,sixtyfive_and_older,number_providers,mean_lowest_cost,mean_d_mbps,mean_u_mbps,mean_lat_ms,index
0,0.5038285795,0.5181607150,2.2410640420,1.7141115800,-0.1850927319,-1.0620454274,0.2839152865,1.1831499562,2.2254805375
1,0.8839544650,0.9219659077,2.2886709790,2.2169546853,0.4203852967,-0.6423230132,-0.4316276622,-0.1545889278,2.3081161690
2,-0.1997698144,-0.2722257522,-0.7454778073,1.2112684747,0.3979601846,-0.3153385194,-0.2735319738,0.1099281688,0.8639079036
3,-0.8174743783,-0.9439585609,0.0511449389,-0.2972608413,1.1267763301,-0.8575779070,-0.4069749537,-0.2762437407,-0.4934723596
4,-0.0791529468,-0.1918474674,0.0130593893,-0.2972608413,1.1267763301,-0.0901670094,-0.4771341841,-0.3010896119,-0.6980837989
...,...,...,...,...,...,...,...,...,...
178,-1.9560245066,-1.8587399927,-1.1802878321,-0.2972608413,-1.6427250231,-1.3253157117,-0.7967759956,1.6012459572,0.5499991091
179,-1.5027974892,-1.4989514798,-1.1834616279,-0.2972608413,-1.6427250231,-1.3380529792,-0.6531932364,0.8234865651,0.5427647915
180,-1.4260413008,-1.4185731950,-1.1422022825,-0.2972608413,-1.6427250231,-1.4903884747,-0.7598419576,0.5007949880,0.5788815372
181,-1.2341508298,-1.1736108032,-1.0247718378,0.2055822641,-1.8694678239,0.8024865408,-0.0513030885,-0.1935302982,0.6107242206


In [39]:
dataset.corr()

Unnamed: 0,has_computer,with_internet,sixtyfive_and_older,median_income,number_providers,mean_lowest_cost,mean_d_mbps,mean_u_mbps,mean_lat_ms
has_computer,1.0,0.9894357549,0.6307362243,0.1629198586,-0.1158696673,0.1088418476,0.3055403227,0.1964818898,-0.2013067357
with_internet,0.9894357549,1.0,0.6252920849,0.1604088786,-0.1288266448,0.1364057855,0.3450733917,0.1990615088,-0.2367166307
sixtyfive_and_older,0.6307362243,0.6252920849,1.0,0.1485403993,0.0529299215,0.0505012498,-0.0423267904,-0.1692074005,0.1020176554
median_income,0.1629198586,0.1604088786,0.1485403993,1.0,0.0219960412,-0.0537768136,-0.0477334553,-0.0181592068,0.0585207055
number_providers,-0.1158696673,-0.1288266448,0.0529299215,0.0219960412,1.0,-0.3294582912,-0.4802079801,-0.0162728201,0.4468024575
mean_lowest_cost,0.1088418476,0.1364057855,0.0505012498,-0.0537768136,-0.3294582912,1.0,0.1966428248,0.0843635563,-0.1810913885
mean_d_mbps,0.3055403227,0.3450733917,-0.0423267904,-0.0477334553,-0.4802079801,0.1966428248,1.0,0.4043996639,-0.8315610286
mean_u_mbps,0.1964818898,0.1990615088,-0.1692074005,-0.0181592068,-0.0162728201,0.0843635563,0.4043996639,1.0,-0.4211462822
mean_lat_ms,-0.2013067357,-0.2367166307,0.1020176554,0.0585207055,0.4468024575,-0.1810913885,-0.8315610286,-0.4211462822,1.0


In [40]:
data.scaled.corr()

Unnamed: 0,has_computer,with_internet,sixtyfive_and_older,median_income,number_providers,mean_lowest_cost,mean_d_mbps,mean_u_mbps,mean_lat_ms,index
has_computer,1.0,0.9894357549,0.6307362243,0.1629198586,-0.1158696673,0.1088418476,0.3055403227,0.1964818898,-0.2013067357,-0.7156407761
with_internet,0.9894357549,1.0,0.6252920849,0.1604088786,-0.1288266448,0.1364057855,0.3450733917,0.1990615088,-0.2367166307,-0.7273210608
sixtyfive_and_older,0.6307362243,0.6252920849,1.0,0.1485403993,0.0529299215,0.0505012498,-0.0423267904,-0.1692074005,0.1020176554,-0.0039498525
median_income,0.1629198586,0.1604088786,0.1485403993,1.0,0.0219960412,-0.0537768136,-0.0477334553,-0.0181592068,0.0585207055,-0.0226180324
number_providers,-0.1158696673,-0.1288266448,0.0529299215,0.0219960412,1.0,-0.3294582912,-0.4802079801,-0.0162728201,0.4468024575,0.3018202163
mean_lowest_cost,0.1088418476,0.1364057855,0.0505012498,-0.0537768136,-0.3294582912,1.0,0.1966428248,0.0843635563,-0.1810913885,-0.0567047629
mean_d_mbps,0.3055403227,0.3450733917,-0.0423267904,-0.0477334553,-0.4802079801,0.1966428248,1.0,0.4043996639,-0.8315610286,-0.7063174282
mean_u_mbps,0.1964818898,0.1990615088,-0.1692074005,-0.0181592068,-0.0162728201,0.0843635563,0.4043996639,1.0,-0.4211462822,-0.5696037792
mean_lat_ms,-0.2013067357,-0.2367166307,0.1020176554,0.0585207055,0.4468024575,-0.1810913885,-0.8315610286,-0.4211462822,1.0,0.5928858018
index,-0.7156407761,-0.7273210608,-0.0039498525,-0.0226180324,0.3018202163,-0.0567047629,-0.7063174282,-0.5696037792,0.5928858018,1.0


In [11]:
-(data3.weights)

has_computer           1.4062236668
with_internet          1.0192632921
sixtyfive_and_older    0.2745504261
median_income          0.0568933230
number_providers      -0.2159936742
mean_lowest_cost       0.0843880270
mean_d_mbps            0.1409653539
mean_u_mbps            0.0391407695
mean_lat_ms           -0.0035762768
dtype: float64

In [12]:
X = data3.scaled.drop("index", axis=1)
y = data3.scaled.index

In [14]:
lr = lm.LinearRegression()
lr.fit(X, y)
r2 = lr.score(X, y)
observations = X.shape[0]
predictors = X.shape[1]
adj_r2 = 1 - (1 - r2) * (observations - 1) / (observations - predictors - 1)

In [25]:
X

Unnamed: 0,has_computer,with_internet,sixtyfive_and_older,median_income,number_providers,mean_lowest_cost,mean_d_mbps,mean_u_mbps,mean_lat_ms
0,0.5038285795,0.5181607150,2.2410640420,0.0743883874,1.7141115800,-0.1850927319,-1.0620454274,0.2839152865,1.1831499562
1,0.8839544650,0.9219659077,2.2886709790,0.0743689779,2.2169546853,0.4203852967,-0.6423230132,-0.4316276622,-0.1545889278
2,-0.1997698144,-0.2722257522,-0.7454778073,0.0736586986,1.2112684747,0.3979601846,-0.3153385194,-0.2735319738,0.1099281688
3,-0.8174743783,-0.9439585609,0.0511449389,0.0736729404,-0.2972608413,1.1267763301,-0.8575779070,-0.4069749537,-0.2762437407
4,-0.0791529468,-0.1918474674,0.0130593893,0.0737515144,-0.2972608413,1.1267763301,-0.0901670094,-0.4771341841,-0.3010896119
...,...,...,...,...,...,...,...,...,...
178,-1.9560245066,-1.8587399927,-1.1802878321,0.0736963580,-0.2972608413,-1.6427250231,-1.3253157117,-0.7967759956,1.6012459572
179,-1.5027974892,-1.4989514798,-1.1834616279,0.0740820259,-0.2972608413,-1.6427250231,-1.3380529792,-0.6531932364,0.8234865651
180,-1.4260413008,-1.4185731950,-1.1422022825,0.0739198932,-0.2972608413,-1.6427250231,-1.4903884747,-0.7598419576,0.5007949880
181,-1.2341508298,-1.1736108032,-1.0247718378,0.0741810268,0.2055822641,-1.8694678239,0.8024865408,-0.0513030885,-0.1935302982


In [23]:
data3.scaled

Unnamed: 0,has_computer,with_internet,sixtyfive_and_older,median_income,number_providers,mean_lowest_cost,mean_d_mbps,mean_u_mbps,mean_lat_ms,index
0,0.5038285795,0.5181607150,2.2410640420,0.0743883874,1.7141115800,-0.1850927319,-1.0620454274,0.2839152865,1.1831499562,-1.3274680720
1,0.8839544650,0.9219659077,2.2886709790,0.0743689779,2.2169546853,0.4203852967,-0.6423230132,-0.4316276622,-0.1545889278,-2.2650910072
2,-0.1997698144,-0.2722257522,-0.7454778073,0.0736586986,1.2112684747,0.3979601846,-0.3153385194,-0.2735319738,0.1099281688,1.0424657637
3,-0.8174743783,-0.9439585609,0.0511449389,0.0736729404,-0.2972608413,1.1267763301,-0.8575779070,-0.4069749537,-0.2762437407,2.0699980343
4,-0.0791529468,-0.1918474674,0.0130593893,0.0737515144,-0.2972608413,1.1267763301,-0.0901670094,-0.4771341841,-0.3010896119,0.1700845498
...,...,...,...,...,...,...,...,...,...,...
178,-1.9560245066,-1.8587399927,-1.1802878321,0.0736963580,-0.2972608413,-1.6427250231,-1.3253157117,-0.7967759956,1.6012459572,5.2631654798
179,-1.5027974892,-1.4989514798,-1.1834616279,0.0740820259,-0.2972608413,-1.6427250231,-1.3380529792,-0.6531932364,0.8234865651,4.2533512134
180,-1.4260413008,-1.4185731950,-1.1422022825,0.0739198932,-0.2972608413,-1.6427250231,-1.4903884747,-0.7598419576,0.5007949880,4.0766639685
181,-1.2341508298,-1.1736108032,-1.0247718378,0.0741810268,0.2055822641,-1.8694678239,0.8024865408,-0.0513030885,-0.1935302982,3.2991999510


In [21]:
print(adj_r2)

0.4498918028841148


In [20]:
lm.coef_

AttributeError: module 'sklearn.linear_model' has no attribute 'coef_'

In [None]:
sfs = SFS(lm, k_features=8, )

In [41]:
data2.scaled.corr()

Unnamed: 0,has_computer,with_internet,sixtyfive_and_older,number_providers,mean_lowest_cost,mean_d_mbps,mean_u_mbps,mean_lat_ms,index
has_computer,1.0,0.9894357549,0.6307362243,-0.1158696673,0.1088418476,0.3055403227,0.1964818898,-0.2013067357,-0.1078133529
with_internet,0.9894357549,1.0,0.6252920849,-0.1288266448,0.1364057855,0.3450733917,0.1990615088,-0.2367166307,-0.135243841
sixtyfive_and_older,0.6307362243,0.6252920849,1.0,0.0529299215,0.0505012498,-0.0423267904,-0.1692074005,0.1020176554,0.1540729623
number_providers,-0.1158696673,-0.1288266448,0.0529299215,1.0,-0.3294582912,-0.4802079801,-0.0162728201,0.4468024575,0.9267304854
mean_lowest_cost,0.1088418476,0.1364057855,0.0505012498,-0.3294582912,1.0,0.1966428248,0.0843635563,-0.1810913885,-0.5978554629
mean_d_mbps,0.3055403227,0.3450733917,-0.0423267904,-0.4802079801,0.1966428248,1.0,0.4043996639,-0.8315610286,-0.6082023817
mean_u_mbps,0.1964818898,0.1990615088,-0.1692074005,-0.0162728201,0.0843635563,0.4043996639,1.0,-0.4211462822,-0.133717278
mean_lat_ms,-0.2013067357,-0.2367166307,0.1020176554,0.4468024575,-0.1810913885,-0.8315610286,-0.4211462822,1.0,0.5545394802
index,-0.1078133529,-0.135243841,0.1540729623,0.9267304854,-0.5978554629,-0.6082023817,-0.133717278,0.5545394802,1.0


In [None]:
dat

In [38]:
list(dataset.columns)

['has_computer',
 'with_internet',
 'sixtyfive_and_older',
 'median_income',
 'number_providers',
 'mean_lowest_cost',
 'mean_d_mbps',
 'mean_u_mbps',
 'mean_lat_ms']

In [35]:
-(data.weights)

has_computer           0.4928275307
with_internet          0.3898000846
sixtyfive_and_older   -0.4958515824
median_income         -0.0360867754
number_providers      -0.0458754832
mean_lowest_cost      -0.1168199233
mean_d_mbps            0.2830632037
mean_u_mbps            0.1683880828
mean_lat_ms           -0.0083437527
dtype: float64

In [34]:
-(data2.weights)

has_computer          -0.0375724530
with_internet          0.0657226189
sixtyfive_and_older   -0.1736790963
number_providers      -0.8881681625
mean_lowest_cost       0.4067865234
mean_d_mbps            0.2283296702
mean_u_mbps           -0.0132305210
mean_lat_ms           -0.0062663003
dtype: float64