# Eastern Washington Digital Equity

## Nicholas Tran

# Preparation

## Import The Modules

In [9]:
import numpy as np  # matrix and array manipulation
import pandas as pd  # dataframe manipulation
import plotly.express as px # plotting
from pingouin import cronbach_alpha # cronbach alpha
from scipy.stats import pearsonr # pearson correlation
from sklearn import linear_model as lm # linear regression 
from sklearn.feature_selection import SequentialFeatureSelector as SFS # to select features 
from sklearn.preprocessing import StandardScaler  # scale the data
from statsmodels.stats.outliers_influence import variance_inflation_factor as VIF
from factor_analyzer.factor_analyzer import (
    calculate_kmo,
)  # get measure of sampling adequacy

# initialize the scaler
scaler = StandardScaler()

# use this as a method in corr() to get the pearson p values
def pearsonr_pval(x, y):
    return pearsonr(x, y)[1]


# turn scientific notation into decimals
pd.options.display.float_format = "{:.10f}".format


## Import The Dataset

In [13]:
dataset1 = pd.read_csv("../app/data/combined_data_new.csv")

dataset = dataset1[
    [
        "native_citizen",
        "work_from_home",
        "no_computer",
        "internet_subscription",
        "mean_d_mbps",
        "mean_lat_ms",
    ]
]


dataset.head()

Unnamed: 0,native_citizen,work_from_home,no_computer,internet_subscription,mean_d_mbps,mean_lat_ms
0,2484,82,76,910,62.6586413043,42.6413043478
1,1623,47,64,487,61.9772073171,60.9024390244
2,1064,3,183,327,88.0868093023,60.1674418605
3,1669,26,55,515,87.0107878788,44.4242424242
4,1841,110,24,557,74.1417594937,37.8924050633


## Create A Class for The Data

In [14]:
class PCA:
    """Input a df and get many things back.
    https://stackoverflow.com/questions/13224362/principal-component-analysis-pca-in-python
    """

    def __init__(self, df):
        import numpy as np
        from scipy import linalg as LA

        self.data = df

        # scale data
        self.scaled = pd.DataFrame(scaler.fit_transform(df), columns=df.columns)

        # kmo, total kmo
        self.kmo, self.total_kmo = calculate_kmo(self.scaled)
        self.kmo = pd.DataFrame(self.data.columns, self.kmo).reset_index()
        self.kmo = self.kmo.rename(columns={"index": "KMO", 0: "Variables"})

        # center data
        self.center = self.scaled.apply(lambda x: x - x.mean())

        # covariance
        self.cov = pd.DataFrame(
            np.cov(self.center, rowvar=False),
            columns=self.scaled.columns,
            index=self.scaled.columns,
        )

        # eigenvalues and loadings(eigenvectors)
        self.eigenvalues, self.loadings = LA.eigh(self.cov)

        # sort eigenvalues and loadings from
        sorter = np.argsort(self.eigenvalues)[::-1]
        self.loadings = self.loadings[:, sorter]
        self.eigenvalues = self.eigenvalues[sorter]

        pc_list = ["pc" + str(i + 1) for i in range(len(self.eigenvalues))]

        # turn into dataframe
        self.loadings = pd.DataFrame(
            self.loadings, index=self.scaled.columns, columns=pc_list
        )
        self.eigenvalues = pd.DataFrame(
            self.eigenvalues, index=pc_list, columns=["eigenvalues"]
        )

        # pca scores - scaled data * loadings
        self.scores = self.scaled @ self.loadings

        # percent explained
        explained_variance = self.eigenvalues / self.eigenvalues.sum() * 100
        self.percent_explained = pd.DataFrame(explained_variance).round(2)

        self.percent_explained[
            "cumulative_explained_variance"
        ] = self.percent_explained.cumsum().round(2)
        self.percent_explained.columns.values[0] = "explained_variance"

        # scree plot
        self.scree = (
            px.line(
                self.percent_explained,
                x=pc_list,
                y="cumulative_explained_variance",
                text="cumulative_explained_variance",
                color=px.Constant("cumulative explained variance"),
            )
            .update_traces(textposition="top left")
            .add_bar(
                x=pc_list,
                y=self.percent_explained.explained_variance,
                name="explained variance",
                text=self.percent_explained.explained_variance,
            )
        )

    def calculate_weights(self, number_of_components):
        """calculate coefficients using your eigenvalues. Multiplies each row by the respective
        eigenvalue. Row 1 of loadings will be multiplied by eigenvalue 1. Row 2 with 2. You may
        only use this when you have at least 2 pcs.

        Args:
            number_of_components (int): number of pcs you want to use.

        Returns:
            DataFrame: Returns a dataframe of weights.
        """
        weights = (
            self.loadings.iloc[:, 0:number_of_components]
            .mul(
                [
                    float(self.eigenvalues.iloc[i, :])
                    for i in range(len(self.eigenvalues))
                ],
                axis=0,
            )
            .sum(axis=1)
        )
        return weights


data = PCA(dataset)

data.weights = data.calculate_weights(3)

data.scaled["index"] = data.scaled @ data.weights

# data.weights = (
#     pd.DataFrame(data.weights).reset_index().rename(columns={0: "coefficients"})
# )

In [18]:
data.weights

native_citizen          -0.4074733019
work_from_home          -0.6330107281
no_computer              0.9094260191
internet_subscription   -0.1301317560
mean_d_mbps             -0.1624212828
mean_lat_ms              0.1133194251
dtype: float64

In [None]:
data.scaled

In [None]:
data.loadings

In [None]:
data.eigenvalues

In [None]:
data.percent_explained

In [None]:
data.kmo

In [None]:
data.scree

In [None]:
data.total_kmo

In [None]:
data.weights


In [None]:
data.weights

In [None]:
data.percent_explained

In [None]:
data.eigenvalues


In [None]:
data.percent_explained


In [None]:
data.scree


In [None]:
# X = all the independent variables
X = data.scaled.iloc[:, :-1]

# y = index, or dependent variable
y = data.scaled.iloc[:, -1]

lr = lm.LinearRegression()

sfs = SFS( # defaults to forward 
    lr,
    n_features_to_select=5,
    n_jobs=-1, # since scoring is null, defaults to model.score
) # lr.score() will give us the r2

sfs.fit(X, y)
selected_variables = list(sfs.get_feature_names_out())

In [None]:
selected_variables

In [None]:
X_new = data.scaled[selected_variables]
model = lr
model.fit(X_new, y)

In [None]:
model.feature_names_in_

In [None]:
model.coef_

In [None]:
r2 = model.score(X_new, y)
observations = X_new.shape[0]
predictors = X_new.shape[1]
adj_r2 = 1 - (1 - r2) * (observations - 1) / (observations - predictors - 1)
print(f"adjr2: {adj_r2}")


In [None]:
# results = pd.DataFrame.from_dict(sfs.get_metric_dict()).T
# results


In [None]:
# largest_before_1 = results[results["avg_score"] != 1].tail(1).index.to_list()
# largest_before_1 = largest_before_1[0] - 1
# model_vars = list(results.iloc[largest_before_1, 3])
# model_vars

In [None]:
# model = lr
# model.fit(final_X, final_y)
# r2 = model.score(final_X, final_y)
# observations = final_X.shape[0]
# predictors = final_X.shape[1]
# adj_r2 = 1 - (1 - r2) * (observations - 1) / (observations - predictors - 1)
# print(f"adjr2: {adj_r2}")


In [None]:
fin_df = pd.DataFrame(-model.coef_, model.feature_names_in_)
fin_df["vif"] = [VIF(X_new.values, i) for i in range(X_new.shape[1])]
fin_df = fin_df.rename(columns={0: "coefficient"})
fin_df


In [None]:
coefficients = -model.coef_
final_vars = list(model.feature_names_in_)
final_data = data.scaled[final_vars]
final_data["index"] = final_data.mul(coefficients).sum(axis=1)

In [None]:
final_data

In [None]:
cronbach_alpha(final_data)[0]

In [None]:
final_data

In [None]:
final_data = dataset1[["GEOID", "tract"]].join(final_data)

In [None]:
final_data.to_csv("../app/data/index_data4.csv", index=False)