# Eastern Washington Digital Equity

## Nicholas Tran

# Preparation

## Import The Modules

In [1]:
import numpy as np  # matrix and array manipulation
import pandas as pd  # dataframe manipulation
import plotly.express as px # plotting
from pingouin import cronbach_alpha # cronbach alpha
from scipy.stats import pearsonr # pearson correlation
from sklearn import linear_model as lm # linear regression 
from sklearn.feature_selection import SequentialFeatureSelector as SFS # to select features 
from sklearn.preprocessing import StandardScaler  # scale the data
from statsmodels.stats.outliers_influence import variance_inflation_factor as VIF
from factor_analyzer.factor_analyzer import (
    calculate_kmo,
)  # get measure of sampling adequacy

# initialize the scaler
scaler = StandardScaler()

# use this as a method in corr() to get the pearson p values
def pearsonr_pval(x, y):
    return pearsonr(x, y)[1]
import statsmodels.api as sm

# turn scientific notation into decimals
pd.options.display.float_format = "{:.10f}".format


KeyboardInterrupt: 

## Import The Dataset

In [2]:
dataset1 = pd.read_csv("../app/data/combined_data_new.csv")

dataset = dataset1[
    [
        "native_citizen",
        "work_from_home",
        "no_computer",
        "internet_subscription",
        "mean_d_mbps",
        "mean_lat_ms",
    ]
]


dataset.head()

Unnamed: 0,native_citizen,work_from_home,no_computer,internet_subscription,mean_d_mbps,mean_lat_ms
0,2484,82,76,910,62.6586413043,42.6413043478
1,1623,47,64,487,61.9772073171,60.9024390244
2,1064,3,183,327,88.0868093023,60.1674418605
3,1669,26,55,515,87.0107878788,44.4242424242
4,1841,110,24,557,74.1417594937,37.8924050633


## Create A Class for The Data

In [3]:
class PCA:
    """Input a df and get many things back.
    https://stackoverflow.com/questions/13224362/principal-component-analysis-pca-in-python
    """

    def __init__(self, df):
        import numpy as np
        from scipy import linalg as LA

        self.data = df

        # scale data
        self.scaled = pd.DataFrame(scaler.fit_transform(df), columns=df.columns)

        # kmo, total kmo
        self.kmo, self.total_kmo = calculate_kmo(self.scaled)
        self.kmo = pd.DataFrame(self.data.columns, self.kmo).reset_index()
        self.kmo = self.kmo.rename(columns={"index": "KMO", 0: "Variables"})

        # center data
        self.center = self.scaled.apply(lambda x: x - x.mean())

        # covariance
        self.cov = pd.DataFrame(
            np.cov(self.center, rowvar=False),
            columns=self.scaled.columns,
            index=self.scaled.columns,
        )

        # eigenvalues and loadings(eigenvectors)
        self.eigenvalues, self.loadings = LA.eigh(self.cov)

        # sort eigenvalues and loadings from
        sorter = np.argsort(self.eigenvalues)[::-1]
        self.loadings = self.loadings[:, sorter]
        self.eigenvalues = self.eigenvalues[sorter]

        pc_list = ["pc" + str(i + 1) for i in range(len(self.eigenvalues))]

        # turn into dataframe
        self.loadings = pd.DataFrame(
            self.loadings, index=self.scaled.columns, columns=pc_list
        )
        self.eigenvalues = pd.DataFrame(
            self.eigenvalues, index=pc_list, columns=["eigenvalues"]
        )

        # pca scores - scaled data * loadings
        self.scores = self.scaled @ self.loadings

        # percent explained
        explained_variance = self.eigenvalues / self.eigenvalues.sum() * 100
        self.percent_explained = pd.DataFrame(explained_variance).round(2)

        self.percent_explained[
            "cumulative_explained_variance"
        ] = self.percent_explained.cumsum().round(2)
        self.percent_explained.columns.values[0] = "explained_variance"

        # scree plot
        self.scree = (
            px.line(
                self.percent_explained,
                x=pc_list,
                y="cumulative_explained_variance",
                text="cumulative_explained_variance",
                color=px.Constant("cumulative explained variance"),
            )
            .update_traces(textposition="top left")
            .add_bar(
                x=pc_list,
                y=self.percent_explained.explained_variance,
                name="explained variance",
                text=self.percent_explained.explained_variance,
            )
        )

    def calculate_weights(self, number_of_components):
        """calculate coefficients using your eigenvalues. Multiplies each row by the respective
        eigenvalue. Row 1 of loadings will be multiplied by eigenvalue 1. Row 2 with 2. You may
        only use this when you have at least 2 pcs.

        Args:
            number_of_components (int): number of pcs you want to use.

        Returns:
            DataFrame: Returns a dataframe of weights.
        """
        weights = (
            self.loadings.iloc[:, 0:number_of_components]
            .mul(
                [
                    float(self.eigenvalues.iloc[i, :])
                    for i in range(len(self.eigenvalues))
                ],
                axis=0,
            )
            .sum(axis=1)
        )
        return weights


data = PCA(dataset)

data.weights = data.calculate_weights(3)

data.scaled["index"] = data.scaled @ data.weights

# data.weights = (
#     pd.DataFrame(data.weights).reset_index().rename(columns={0: "coefficients"})
# )

In [4]:
data.weights

native_citizen          -0.4074733019
work_from_home          -0.6330107281
no_computer              0.9094260191
internet_subscription   -0.1301317560
mean_d_mbps             -0.1624212828
mean_lat_ms              0.1133194251
dtype: float64

In [5]:
data.scaled

Unnamed: 0,native_citizen,work_from_home,no_computer,internet_subscription,mean_d_mbps,mean_lat_ms,index
0,-0.7793647066,-0.5115079191,-0.3341503606,-0.7430128489,-1.6263184336,0.7354042894,0.7816491628
1,-1.4274799394,-0.8676949403,-0.4867380487,-1.5525370030,-1.6343848222,1.6375569339,1.3413281805
2,-1.8482655551,-1.3154729099,1.0264231913,-1.8587399927,-1.3253157117,1.6012459572,3.1578761382
3,-1.3928535739,-1.0814071531,-0.6011788147,-1.4989514798,-1.3380529792,0.8234865651,1.2110718116
4,-1.2633810767,-0.2265583020,-0.9953636755,-1.4185731950,-1.4903884747,0.5007949880,0.2364202986
...,...,...,...,...,...,...,...
178,0.2112504459,-1.0407000649,-0.6647570181,0.6119353806,0.1229907201,-0.3605116454,-0.1723134480
179,-0.1320022209,0.0990984031,0.2380534696,-0.1535720937,-0.8491271755,0.8787309027,0.4650272598
180,0.0622065248,-0.8269878522,0.4923662831,-0.3870518734,-1.8089348075,1.7956215829,1.4935714506
181,0.2534042822,0.2313964396,0.5305132051,0.0301497001,-0.3223962253,-0.6920936906,0.2027435198


In [6]:
data.loadings

Unnamed: 0,pc1,pc2,pc3,pc4,pc5,pc6
native_citizen,-0.5304821532,0.3226012348,0.0483777367,-0.3444218899,0.0362374517,0.7015961316
work_from_home,-0.3270214438,0.3870402176,-0.4509510317,0.7316641799,0.0559949198,-0.0378432176
no_computer,-0.1373524548,0.0894573888,0.8845230259,0.4313081476,0.0686957121,0.002208247
internet_subscription,-0.5591628836,0.2245219099,0.0747238653,-0.3434801603,-0.1938040725,-0.6897858266
mean_d_mbps,-0.4021231841,-0.5585252591,-0.0687126632,0.0146419327,0.7184474894,-0.0724145601
mean_lat_ms,0.3441130309,0.6129968481,0.0402081099,-0.2045027255,0.6611395478,-0.1589890801


In [7]:
data.eigenvalues

Unnamed: 0,eigenvalues
pc1,2.554640589
pc2,1.6192338067
pc3,1.087013658
pc4,0.5006663731
pc5,0.1577884397
pc6,0.1136241664


In [8]:
data.percent_explained

Unnamed: 0,explained_variance,cumulative_explained_variance
pc1,42.34,42.34
pc2,26.84,69.18
pc3,18.02,87.2
pc4,8.3,95.5
pc5,2.62,98.12
pc6,1.88,100.0


In [9]:
data.kmo

Unnamed: 0,KMO,Variables
0,0.5981748653,native_citizen
1,0.7612109755,work_from_home
2,0.4830733703,no_computer
3,0.6277870951,internet_subscription
4,0.5486351738,mean_d_mbps
5,0.5179792857,mean_lat_ms


In [10]:
data.scree

In [11]:
data.total_kmo

0.5891717365269227

In [12]:
data.weights


native_citizen          -0.4074733019
work_from_home          -0.6330107281
no_computer              0.9094260191
internet_subscription   -0.1301317560
mean_d_mbps             -0.1624212828
mean_lat_ms              0.1133194251
dtype: float64

In [13]:
data.weights

native_citizen          -0.4074733019
work_from_home          -0.6330107281
no_computer              0.9094260191
internet_subscription   -0.1301317560
mean_d_mbps             -0.1624212828
mean_lat_ms              0.1133194251
dtype: float64

In [14]:
data.percent_explained

Unnamed: 0,explained_variance,cumulative_explained_variance
pc1,42.34,42.34
pc2,26.84,69.18
pc3,18.02,87.2
pc4,8.3,95.5
pc5,2.62,98.12
pc6,1.88,100.0


In [15]:
data.eigenvalues


Unnamed: 0,eigenvalues
pc1,2.554640589
pc2,1.6192338067
pc3,1.087013658
pc4,0.5006663731
pc5,0.1577884397
pc6,0.1136241664


In [16]:
data.percent_explained


Unnamed: 0,explained_variance,cumulative_explained_variance
pc1,42.34,42.34
pc2,26.84,69.18
pc3,18.02,87.2
pc4,8.3,95.5
pc5,2.62,98.12
pc6,1.88,100.0


In [17]:
data.scree


In [18]:
# X = all the independent variables
X = data.scaled.iloc[:, :-1]

# y = index, or dependent variable
y = data.scaled.iloc[:, -1]

lr = lm.LinearRegression()

sfs = SFS( # defaults to forward 
    lr,
    n_features_to_select=5,
    n_jobs=-1, # since scoring is null, defaults to model.score
) # lr.score() will give us the r2

sfs.fit(X, y)
selected_variables = list(sfs.get_feature_names_out())

In [19]:
selected_variables

['native_citizen',
 'work_from_home',
 'no_computer',
 'internet_subscription',
 'mean_d_mbps']

In [20]:
X_new = data.scaled[selected_variables]
model = lr
model.fit(X_new, y)

In [21]:
model.feature_names_in_

array(['native_citizen', 'work_from_home', 'no_computer',
       'internet_subscription', 'mean_d_mbps'], dtype=object)

In [22]:
model.coef_

array([-0.38597394, -0.63552653,  0.90413644, -0.14045984, -0.25823065])

In [23]:
r2 = model.score(X_new, y)
observations = X_new.shape[0]
predictors = X_new.shape[1]
adj_r2 = 1 - (1 - r2) * (observations - 1) / (observations - predictors - 1)
print(f"adjr2: {adj_r2}")


adjr2: 0.9979151735655256


In [24]:
# results = pd.DataFrame.from_dict(sfs.get_metric_dict()).T
# results


In [25]:
# largest_before_1 = results[results["avg_score"] != 1].tail(1).index.to_list()
# largest_before_1 = largest_before_1[0] - 1
# model_vars = list(results.iloc[largest_before_1, 3])
# model_vars

In [26]:
# model = lr
# model.fit(final_X, final_y)
# r2 = model.score(final_X, final_y)
# observations = final_X.shape[0]
# predictors = final_X.shape[1]
# adj_r2 = 1 - (1 - r2) * (observations - 1) / (observations - predictors - 1)
# print(f"adjr2: {adj_r2}")


In [27]:
fin_df = pd.DataFrame(-model.coef_, model.feature_names_in_)
fin_df["vif"] = [VIF(X_new.values, i) for i in range(X_new.shape[1])]
fin_df = fin_df.rename(columns={0: "coefficient"})
fin_df


Unnamed: 0,coefficient,vif
native_citizen,0.3859739398,4.658647343
work_from_home,0.6355265332,1.4293249699
no_computer,-0.9041364371,1.1324644851
internet_subscription,0.1404598355,4.8183229193
mean_d_mbps,0.2582306521,1.1855445756


In [28]:
coefficients = -model.coef_
final_vars = list(model.feature_names_in_)
final_data = data.scaled[final_vars]
final_data["index"] = final_data.mul(coefficients).sum(axis=1)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [29]:
final_data

Unnamed: 0,native_citizen,work_from_home,no_computer,internet_subscription,mean_d_mbps,index
0,-0.7793647066,-0.5115079191,-0.3341503606,-0.7430128489,-1.6263184336,-0.8481025365
1,-1.4274799394,-0.8676949403,-0.4867380487,-1.5525370030,-1.6343848222,-1.3024529588
2,-1.8482655551,-1.3154729099,1.0264231913,-1.8587399927,-1.3253157117,-3.0807423372
3,-1.3928535739,-1.0814071531,-0.6011788147,-1.4989514798,-1.3380529792,-1.2373892204
4,-1.2633810767,-0.2265583020,-0.9953636755,-1.4185731950,-1.4903884747,-0.3157879620
...,...,...,...,...,...,...
178,0.2112504459,-1.0407000649,-0.6647570181,0.6119353806,0.1229907201,0.1388880212
179,-0.1320022209,0.0990984031,0.2380534696,-0.1535720937,-0.8491271755,-0.4440439438
180,0.0622065248,-0.8269878522,0.4923662831,-0.3870518734,-1.8089348075,-1.4682165795
181,0.2534042822,0.2313964396,0.5305132051,0.0301497001,-0.3223962253,-0.3138080585


In [30]:
cronbach_alpha(final_data)[0]

0.6903825896668394

In [31]:
final_data

Unnamed: 0,native_citizen,work_from_home,no_computer,internet_subscription,mean_d_mbps,index
0,-0.7793647066,-0.5115079191,-0.3341503606,-0.7430128489,-1.6263184336,-0.8481025365
1,-1.4274799394,-0.8676949403,-0.4867380487,-1.5525370030,-1.6343848222,-1.3024529588
2,-1.8482655551,-1.3154729099,1.0264231913,-1.8587399927,-1.3253157117,-3.0807423372
3,-1.3928535739,-1.0814071531,-0.6011788147,-1.4989514798,-1.3380529792,-1.2373892204
4,-1.2633810767,-0.2265583020,-0.9953636755,-1.4185731950,-1.4903884747,-0.3157879620
...,...,...,...,...,...,...
178,0.2112504459,-1.0407000649,-0.6647570181,0.6119353806,0.1229907201,0.1388880212
179,-0.1320022209,0.0990984031,0.2380534696,-0.1535720937,-0.8491271755,-0.4440439438
180,0.0622065248,-0.8269878522,0.4923662831,-0.3870518734,-1.8089348075,-1.4682165795
181,0.2534042822,0.2313964396,0.5305132051,0.0301497001,-0.3223962253,-0.3138080585


In [32]:
final_data = dataset1[["GEOID", "tract"]].join(final_data)

In [33]:
final_data.to_csv("../app/data/index_data4.csv", index=False)