# Eastern Washington Digital Equity

## Nicholas Tran

# Preparation

## Import The Modules

In [30]:
import numpy as np  # matrix and array manipulation
import pandas as pd  # dataframe manipulation
import plotly.express as px
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from pingouin import cronbach_alpha
from scipy.stats import pearsonr
from sklearn import linear_model as lm
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler  # scale the data
from statsmodels.stats.outliers_influence import variance_inflation_factor as VIF
from factor_analyzer.factor_analyzer import (
    calculate_kmo,
)  # get measure of sampling adequacy

# initialize the scaler
scaler = StandardScaler()


# use this as a method in corr() to get the pearson p values
def pearsonr_pval(x, y):
    return pearsonr(x, y)[1]


# turn scientific notation into decimals
pd.options.display.float_format = "{:.10f}".format

## Import The Dataset

In [31]:
dataset1 = pd.read_csv("../app/data/combined_data.csv")
dataset = dataset1.drop(
    columns=[
        "GEOID",
        "tract",
        "has_computer",
        "smartphone",
        "desktop_or_laptop",
        "tablet_or_portable",
        "foreign_born",
        "broadband",
        "median_income",
        "naturalized_citizen",
        # "native_citizen",
        "desktop_or_laptop_only",
        "satellite",
        "dial_up",
        "other_internet_service",
        "not_citizen",
        "no_internet_access",
        "number_providers",
        "mean_income",
        # "no_computer",
        # "internet_subscription",
        # "work_from_home",
        # "mean_lat_ms",
        "mean_u_mbps",
        "access_with_no_subscription",
        "sixty_five_and_older",
        "lowest_cost",
        "smartphone_only"
    ]
)
dataset.head()

Unnamed: 0,native_citizen,work_from_home,no_computer,internet_subscription,mean_d_mbps,mean_lat_ms
0,2484,82,76,910,62.6586413043,42.6413043478
1,1623,47,64,487,61.9772073171,60.9024390244
2,1064,3,183,327,88.0868093023,60.1674418605
3,1669,26,55,515,87.0107878788,44.4242424242
4,1841,110,24,557,74.1417594937,37.8924050633


## Create A Class for The Data

In [32]:
class PCA:
    """Input a df and get many things back.
    https://stackoverflow.com/questions/13224362/principal-component-analysis-pca-in-python
    """

    def __init__(self, df):
        import numpy as np
        from scipy import linalg as LA

        self.data = df

        # scale data
        self.scaled = pd.DataFrame(scaler.fit_transform(df), columns=df.columns)

        # kmo, total kmo
        self.kmo, self.total_kmo = calculate_kmo(self.scaled)
        self.kmo = pd.DataFrame(self.data.columns, self.kmo).reset_index()
        self.kmo = self.kmo.rename(columns={"index": "KMO", 0: "Variables"})

        # center data
        self.center = self.scaled.apply(lambda x: x - x.mean())

        # covariance
        self.cov = pd.DataFrame(
            np.cov(self.center, rowvar=False),
            columns=self.scaled.columns,
            index=self.scaled.columns,
        )

        # eigenvalues and loadings(eigenvectors)
        self.eigenvalues, self.loadings = LA.eigh(self.cov)

        # sort eigenvalues and loadings from
        sorter = np.argsort(self.eigenvalues)[::-1]
        self.loadings = self.loadings[:, sorter]
        self.eigenvalues = self.eigenvalues[sorter]

        pc_list = ["pc" + str(i + 1) for i in range(len(self.eigenvalues))]

        # turn into dataframe
        self.loadings = pd.DataFrame(
            self.loadings, index=self.scaled.columns, columns=pc_list
        )
        self.eigenvalues = pd.DataFrame(self.eigenvalues, index=pc_list, columns=["eigenvalues"])

        # pca scores - scaled data * loadings
        self.scores = self.scaled @ self.loadings

        # percent explained
        explained_variance = self.eigenvalues / self.eigenvalues.sum() * 100
        self.percent_explained = pd.DataFrame(explained_variance).round(2)

        self.percent_explained[
            "cumulative_explained_variance"
        ] = self.percent_explained.cumsum().round(2)
        self.percent_explained.columns.values[0] = "explained_variance"

        # scree plot
        self.scree = (
            px.line(
                self.percent_explained,
                x=pc_list,
                y="cumulative_explained_variance",
                text="cumulative_explained_variance",
                color=px.Constant("cumulative explained variance"),
            )
            .update_traces(textposition="top left")
            .add_bar(
                x=pc_list,
                y=self.percent_explained.explained_variance,
                name="explained variance",
                text=self.percent_explained.explained_variance,
            )
        )

    def calculate_weights(self, number_of_components):
        """calculate coefficients using your eigenvalues. Multiplies each row by the respective
        eigenvalue. Row 1 of loadings will be multiplied by eigenvalue 1. Row 2 with 2. You may
        only use this when you have at least 2 pcs.

        Args:
            number_of_components (int): number of pcs you want to use.

        Returns:
            DataFrame: Returns a dataframe of weights.
        """
        weights = (
            self.loadings.iloc[:, 0:number_of_components]
            .mul(
                [
                    float(self.eigenvalues.iloc[i, :])
                    for i in range(len(self.eigenvalues))
                ],
                axis=0,
            )
            .sum(axis=1)
        )
        return weights


data = PCA(dataset)
data.weights = data.calculate_weights(3)
data.scaled["index"] = data.scaled @ data.weights


In [33]:
pd.DataFrame(data.weights).reset_index().rename(columns={0:"coefficients"})

Unnamed: 0,index,coefficients
0,native_citizen,-0.4074733019
1,work_from_home,-0.6330107281
2,no_computer,0.9094260191
3,internet_subscription,-0.130131756
4,mean_d_mbps,-0.1624212828
5,mean_lat_ms,0.1133194251


In [34]:
data.percent_explained

Unnamed: 0,explained_variance,cumulative_explained_variance
pc1,42.34,42.34
pc2,26.84,69.18
pc3,18.02,87.2
pc4,8.3,95.5
pc5,2.62,98.12
pc6,1.88,100.0


In [35]:
data.eigenvalues


Unnamed: 0,eigenvalues
pc1,2.554640589
pc2,1.6192338067
pc3,1.087013658
pc4,0.5006663731
pc5,0.1577884397
pc6,0.1136241664


In [36]:
data.percent_explained


Unnamed: 0,explained_variance,cumulative_explained_variance
pc1,42.34,42.34
pc2,26.84,69.18
pc3,18.02,87.2
pc4,8.3,95.5
pc5,2.62,98.12
pc6,1.88,100.0


In [37]:
data.scree


https://stackoverflow.com/questions/52741236/how-to-calculate-p-values-for-pairwise-correlation-of-columns-in-pandas

In [38]:
# set y as index. we want to predict the index
y = data.scaled["index"]

# set x as the data we will use to predict y
X = data.scaled.drop("index", axis=1)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.5, random_state=55
)
X_test, X_valid, y_test, y_valid = train_test_split(
    X_test, y_test, test_size=0.4, random_state=55
)


In [39]:
lr = lm.LinearRegression()
sfs = SFS(
    lr,
    k_features="parsimonious",
    verbose=0,
    forward=True,
    scoring="r2",  # picks model on r2
    cv=5,
    n_jobs=-1,
)

sfs.fit(X_train, y_train)


In [40]:
results = pd.DataFrame.from_dict(sfs.get_metric_dict()).T
results


Unnamed: 0,feature_idx,cv_scores,avg_score,feature_names,ci_bound,std_dev,std_err
1,"(1,)","[0.5533209790367973, 0.3333039813572155, 0.479...",0.4426359694,"(work_from_home,)",0.1012366728,0.0787655708,0.0393827854
2,"(1, 2)","[0.9195581946154544, 0.6251966287094386, 0.878...",0.8264507723,"(work_from_home, no_computer)",0.1417874606,0.1103154613,0.0551577307
3,"(1, 2, 3)","[0.9769294531619434, 0.9024075290011476, 0.978...",0.9416578317,"(work_from_home, no_computer, internet_subscri...",0.0536605075,0.0417496979,0.0208748489
4,"(1, 2, 3, 4)","[0.9911452066176184, 0.9785763887263819, 0.991...",0.9791717726,"(work_from_home, no_computer, internet_subscri...",0.0203138001,0.0158048266,0.0079024133
5,"(0, 1, 2, 3, 4)","[0.9987663456357969, 0.9921699592514267, 0.998...",0.9971712704,"(native_citizen, work_from_home, no_computer, ...",0.0032711932,0.0025450994,0.0012725497
6,"(0, 1, 2, 3, 4, 5)","[1.0, 1.0, 1.0, 1.0, 1.0]",1.0,"(native_citizen, work_from_home, no_computer, ...",0.0,0.0,0.0


In [41]:
largest_before_1 = results[results["avg_score"] != 1].tail(1).index.to_list()

In [42]:
largest_before_1 = largest_before_1[0] - 1

In [43]:
model_vars = list(results.iloc[largest_before_1, 3])
model_vars

['native_citizen',
 'work_from_home',
 'no_computer',
 'internet_subscription',
 'mean_d_mbps']

In [44]:
final_X = data.scaled[model_vars]
final_y = data.scaled["index"]
model = lr
model.fit(final_X, final_y)
r2 = model.score(final_X, final_y)
observations = final_X.shape[0]
predictors = final_X.shape[1]
adj_r2 = 1 - (1 - r2) * (observations - 1) / (observations - predictors - 1)
print(f"adjr2: {adj_r2}")


adjr2: 0.9979151735655256


In [45]:
fin_df = pd.DataFrame(-model.coef_, model.feature_names_in_)
fin_df["vif"] = [VIF(final_X.values, i) for i in range(final_X.shape[1])]
fin_df = fin_df.rename(columns={0:"coefficient"})
fin_df

Unnamed: 0,coefficient,vif
native_citizen,0.3859739398,4.658647343
work_from_home,0.6355265332,1.4293249699
no_computer,-0.9041364371,1.1324644851
internet_subscription,0.1404598355,4.8183229193
mean_d_mbps,0.2582306521,1.1855445756


In [46]:
coefficients = -model.coef_
final_vars = list(model.feature_names_in_)
final_data = data.scaled[final_vars]
final_data["index"] = final_data.mul(coefficients).sum(axis=1)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [47]:
final_data

Unnamed: 0,native_citizen,work_from_home,no_computer,internet_subscription,mean_d_mbps,index
0,-0.7793647066,-0.5115079191,-0.3341503606,-0.7430128489,-1.6263184336,-0.8481025365
1,-1.4274799394,-0.8676949403,-0.4867380487,-1.5525370030,-1.6343848222,-1.3024529588
2,-1.8482655551,-1.3154729099,1.0264231913,-1.8587399927,-1.3253157117,-3.0807423372
3,-1.3928535739,-1.0814071531,-0.6011788147,-1.4989514798,-1.3380529792,-1.2373892204
4,-1.2633810767,-0.2265583020,-0.9953636755,-1.4185731950,-1.4903884747,-0.3157879620
...,...,...,...,...,...,...
178,0.2112504459,-1.0407000649,-0.6647570181,0.6119353806,0.1229907201,0.1388880212
179,-0.1320022209,0.0990984031,0.2380534696,-0.1535720937,-0.8491271755,-0.4440439438
180,0.0622065248,-0.8269878522,0.4923662831,-0.3870518734,-1.8089348075,-1.4682165795
181,0.2534042822,0.2313964396,0.5305132051,0.0301497001,-0.3223962253,-0.3138080585


In [48]:
cronbach_alpha(final_data)[0]

0.6903825896668394

In [49]:
final_data

Unnamed: 0,native_citizen,work_from_home,no_computer,internet_subscription,mean_d_mbps,index
0,-0.7793647066,-0.5115079191,-0.3341503606,-0.7430128489,-1.6263184336,-0.8481025365
1,-1.4274799394,-0.8676949403,-0.4867380487,-1.5525370030,-1.6343848222,-1.3024529588
2,-1.8482655551,-1.3154729099,1.0264231913,-1.8587399927,-1.3253157117,-3.0807423372
3,-1.3928535739,-1.0814071531,-0.6011788147,-1.4989514798,-1.3380529792,-1.2373892204
4,-1.2633810767,-0.2265583020,-0.9953636755,-1.4185731950,-1.4903884747,-0.3157879620
...,...,...,...,...,...,...
178,0.2112504459,-1.0407000649,-0.6647570181,0.6119353806,0.1229907201,0.1388880212
179,-0.1320022209,0.0990984031,0.2380534696,-0.1535720937,-0.8491271755,-0.4440439438
180,0.0622065248,-0.8269878522,0.4923662831,-0.3870518734,-1.8089348075,-1.4682165795
181,0.2534042822,0.2313964396,0.5305132051,0.0301497001,-0.3223962253,-0.3138080585


In [50]:
final_data = dataset1[["GEOID", "tract"]].join(final_data)

In [51]:
# final_data.to_csv("../app/data/index_data4.csv", index=False)