# Eastern Washington Digital Equity

## Nicholas Tran

# Preparation

## Import The Modules

In [1271]:
import numpy as np  # matrix and array manipulation
import pandas as pd  # dataframe manipulation
import plotly.express as px
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from pingouin import cronbach_alpha
from scipy.stats import pearsonr
from sklearn import linear_model as lm
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler  # scale the data
from statsmodels.stats.outliers_influence import variance_inflation_factor as VIF
from factor_analyzer.factor_analyzer import (
    calculate_kmo,
)  # get measure of sampling adequacy

# initialize the scaler
scaler = StandardScaler()


# use this as a method in corr() to get the pearson p values
def pearsonr_pval(x, y):
    return pearsonr(x, y)[1]


# turn scientific notation into decimals
pd.options.display.float_format = "{:.10f}".format

## Import The Dataset

In [1272]:
dataset = pd.read_csv("../data/combined_data.csv")
dataset = dataset.drop(
    columns=[
        "GEOID",
        "tract",
        # "has_computer",
        "smartphone",
        "desktop_or_laptop",
        "tablet_or_portable",
        "foreign_born",
        "broadband",
        # "median_income",
        "naturalized_citizen",
        "native_citizen",
        "desktop_or_laptop_only",
        "satellite",
        "dial_up",
        "other_internet_service",
        "not_citizen",
        "no_internet_access",
        "number_providers",
        "mean_income",
        "no_computer",
        # "internet_subscription",
        "work_from_home",
        "mean_lat_ms",
        "mean_u_mbps",
        "access_with_no_subscription",
        "sixty_five_and_older"
    ]
)
dataset.head()

Unnamed: 0,smartphone_only,has_computer,internet_subscription,median_income,mean_d_mbps,lowest_cost
0,153,1023,910,52589.0,62.6586413043,30.0
1,68,548,487,59886.0,61.9772073171,30.0
2,76,318,327,39928.0,88.0868093023,30.0
3,132,566,515,58884.0,87.0107878788,30.0
4,97,608,557,50915.0,74.1417594937,30.0


In [1273]:
dataset.columns

Index(['smartphone_only', 'has_computer', 'internet_subscription',
       'median_income', 'mean_d_mbps', 'lowest_cost'],
      dtype='object')

## Create A Class for The Data

In [1274]:
class PCA:
    """Input a df and get many things back.
    https://stackoverflow.com/questions/13224362/principal-component-analysis-pca-in-python
    """

    def __init__(self, df):
        import numpy as np
        from scipy import linalg as LA

        self.data = df

        # scale data
        self.scaled = pd.DataFrame(scaler.fit_transform(df), columns=df.columns)

        # kmo, total kmo
        self.kmo, self.total_kmo = calculate_kmo(self.scaled)

        # center data
        self.center = self.scaled.apply(lambda x: x - x.mean())

        # covariance
        self.cov = pd.DataFrame(
            np.cov(self.center, rowvar=False),
            columns=self.scaled.columns,
            index=self.scaled.columns,
        )

        # eigenvalues and loadings(eigenvectors)
        self.eigenvalues, self.loadings = LA.eigh(self.cov)

        # sort eigenvalues and loadings from
        sorter = np.argsort(self.eigenvalues)[::-1]
        self.loadings = self.loadings[:, sorter]
        self.eigenvalues = self.eigenvalues[sorter]

        pc_list = ["pc" + str(i + 1) for i in range(len(self.eigenvalues))]

        # turn into dataframe
        self.loadings = pd.DataFrame(
            self.loadings, index=self.scaled.columns, columns=pc_list
        )
        self.eigenvalues = pd.DataFrame(self.eigenvalues, index=pc_list)

        # pca scores - scaled data * loadings
        self.scores = self.scaled @ self.loadings

        # percent explained
        explained_variance = self.eigenvalues / self.eigenvalues.sum() * 100
        self.percent_explained = pd.DataFrame(explained_variance).round(2)

        self.percent_explained[
            "cumulative_explained_variance"
        ] = self.percent_explained.cumsum().round(2)
        self.percent_explained.columns.values[0] = "explained_variance"

        # scree plot
        self.scree = (
            px.line(
                self.percent_explained,
                x=pc_list,
                y="cumulative_explained_variance",
                text="cumulative_explained_variance",
                color=px.Constant("cumulative explained variance"),
            )
            .update_traces(textposition="top left")
            .add_bar(
                x=pc_list,
                y=self.percent_explained.explained_variance,
                name="explained variance",
                text=self.percent_explained.explained_variance,
            )
        )

    def calculate_weights(self, number_of_components):
        """calculate coefficients using your eigenvalues. Multiplies each row by the respective
        eigenvalue. Row 1 of loadings will be multiplied by eigenvalue 1. Row 2 with 2. You may
        only use this when you have at least 2 pcs.

        Args:
            number_of_components (int): number of pcs you want to use.

        Returns:
            DataFrame: Returns a dataframe of weights.
        """
        weights = (
            self.loadings.iloc[:, 0:number_of_components]
            .mul(
                [
                    float(self.eigenvalues.iloc[i, :])
                    for i in range(len(self.eigenvalues))
                ],
                axis=0,
            )
            .sum(axis=1)
        )
        return weights


data = PCA(dataset)
# data.weights = data.calculate_weights(5)
data.scaled["index"] = data.scores.pc1


https://stackoverflow.com/questions/52741236/how-to-calculate-p-values-for-pairwise-correlation-of-columns-in-pandas

In [1275]:
data.scaled.corr(pearsonr_pval).iloc[:, -1:]


Unnamed: 0,index
smartphone_only,1e-10
has_computer,0.0
internet_subscription,0.0
median_income,0.2116759784
mean_d_mbps,0.0
lowest_cost,2e-09
index,1.0


In [1276]:
pvals = data.scaled.corr(pearsonr_pval).loc[
    data.scaled.corr(pearsonr_pval)["index"] >= 0.05
]
pvals.loc[:, ["index"]]


Unnamed: 0,index
median_income,0.2116759784
index,1.0


In [1277]:
insig_vars = list(pvals.index)
# remove index col
del insig_vars[-1]
insig_vars

['median_income']

In [1278]:
data2 = PCA(dataset.drop(insig_vars, axis=1))
# data2.weights = -data2.calculate_weights(5)
data.scaled["index"] = data2.scores.pc1


In [1279]:
# set y as index. we want to predict the index
y = data.scaled["index"]

# set x as the data we will use to predict y
X = data.scaled.drop("index", axis=1)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.5, random_state=55
)
X_test, X_valid, y_test, y_valid = train_test_split(
    X_test, y_test, test_size=0.4, random_state=55
)


In [1280]:
print(X_train.shape, X_test.shape, X_valid.shape)


(91, 6) (55, 6) (37, 6)


In [1281]:
lr = lm.LinearRegression()
sfs = SFS(
    lr,
    k_features="parsimonious",
    verbose=0,
    forward=True,
    scoring="r2",  # picks model on r2
    cv=5,
    n_jobs=-1,
)

sfs.fit(X_train, y_train)


In [1282]:
results = pd.DataFrame.from_dict(sfs.get_metric_dict()).T
results


Unnamed: 0,feature_idx,cv_scores,avg_score,feature_names,ci_bound,std_dev,std_err
1,"(2,)","[0.8636428605026578, 0.8625727695477095, 0.792...",0.7826071473,"(internet_subscription,)",0.0979585021,0.0762150426,0.0381075213
2,"(2, 4)","[0.9145560770839453, 0.9308349949542637, 0.901...",0.9050131344,"(internet_subscription, mean_d_mbps)",0.0283268459,0.0220392485,0.0110196242
3,"(0, 2, 4)","[0.9705331012079685, 0.9742483151059728, 0.974...",0.964792554,"(smartphone_only, internet_subscription, mean_...",0.0160334716,0.0124745856,0.0062372928
4,"(0, 2, 4, 5)","[0.9930161087428032, 0.9986653748346885, 0.997...",0.9966379223,"(smartphone_only, internet_subscription, mean_...",0.0024756792,0.0019261625,0.0009630813
5,"(0, 1, 2, 4, 5)","[1.0, 1.0, 1.0, 1.0, 1.0]",1.0,"(smartphone_only, has_computer, internet_subsc...",0.0,0.0,0.0
6,"(0, 1, 2, 3, 4, 5)","[1.0, 1.0, 1.0, 1.0, 1.0]",1.0,"(smartphone_only, has_computer, internet_subsc...",0.0,0.0,0.0


In [1283]:
largest_before_1 = results[results["avg_score"] != 1].tail(1).index.to_list()

In [1284]:
largest_before_1 = largest_before_1[0] - 1

In [1285]:
model_vars = list(results.iloc[largest_before_1, 3])
model_vars

['smartphone_only', 'internet_subscription', 'mean_d_mbps', 'lowest_cost']

In [1286]:
final_X = data.scaled[model_vars]
final_y = data.scaled["index"]
model = lr
model.fit(final_X, final_y)
r2 = model.score(final_X, final_y)
observations = final_X.shape[0]
predictors = final_X.shape[1]
adj_r2 = 1 - (1 - r2) * (observations - 1) / (observations - predictors - 1)
print(f"adjr2: {adj_r2}")


adjr2: 0.9977256954801569


In [1287]:
fin_df = pd.DataFrame(-model.coef_, model.feature_names_in_)
fin_df["vif"] = [VIF(final_X.values, i) for i in range(final_X.shape[1])]
fin_df

Unnamed: 0,0,vif
smartphone_only,0.3404676945,1.1133086998
internet_subscription,1.1745197719,1.2547049713
mean_d_mbps,0.3574268631,1.4083506853
lowest_cost,0.2393861889,1.2807586769


In [1288]:
coefficients = -model.coef_
final_vars = list(model.feature_names_in_)
final_data = data.scaled[final_vars]
final_data["index"] = final_data.mul(coefficients).sum(axis=1)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [1289]:
final_data

Unnamed: 0,smartphone_only,internet_subscription,mean_d_mbps,lowest_cost,index
0,0.5485540934,-0.7430128489,-1.6263184336,-0.9471266092,-1.4939372599
1,-0.4644767653,-1.5525370030,-1.6343848222,-0.9471266092,-2.7925268096
2,-0.3691326845,-1.8587399927,-1.3253157117,-0.9471266092,-3.0092370933
3,0.2982758812,-1.4989514798,-1.3380529792,-0.9471266092,-2.3639799570
4,-0.1188544723,-1.4185731950,-1.4903884747,-0.9471266092,-2.4660422804
...,...,...,...,...,...
178,-0.0354284016,0.6119353806,0.1229907201,-2.1620799491,0.2330560855
179,-0.7743450279,-0.1535720937,-0.8491271755,-2.1620799491,-1.2650858688
180,-0.3333786542,-0.3870518734,-1.8089348075,-2.1620799491,-1.7322387128
181,-0.1903625329,0.0301497001,-0.3223962253,-2.1620799491,-0.6622060245


In [1290]:
cronbach_alpha(final_data)

(0.772119544523206, array([0.715, 0.82 ]))