In [20]:
pip install -r requirements.txt

Collecting scikit-learn (from -r requirements.txt (line 5))
  Using cached scikit_learn-1.4.1.post1-cp311-cp311-macosx_12_0_arm64.whl.metadata (11 kB)
Collecting statsmodels (from -r requirements.txt (line 6))
  Using cached statsmodels-0.14.1-cp311-cp311-macosx_11_0_arm64.whl.metadata (9.5 kB)
Collecting scipy (from -r requirements.txt (line 7))
  Using cached scipy-1.13.0-cp311-cp311-macosx_12_0_arm64.whl.metadata (60 kB)
Collecting matplotlib (from -r requirements.txt (line 8))
  Using cached matplotlib-3.8.4-cp311-cp311-macosx_11_0_arm64.whl.metadata (5.8 kB)
Collecting joblib>=1.2.0 (from scikit-learn->-r requirements.txt (line 5))
  Using cached joblib-1.3.2-py3-none-any.whl.metadata (5.4 kB)
Collecting threadpoolctl>=2.0.0 (from scikit-learn->-r requirements.txt (line 5))
  Using cached threadpoolctl-3.4.0-py3-none-any.whl.metadata (13 kB)
Collecting patsy>=0.5.4 (from statsmodels->-r requirements.txt (line 6))
  Using cached patsy-0.5.6-py2.py3-none-any.whl.metadata (3.5 kB)
Co

In [21]:
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.functions import *
from functools import reduce
from pyspark.ml.regression import LinearRegression
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.evaluation import RegressionEvaluator
import delta
import numpy
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

In [30]:
# r-squared of two series

def r_squared(series_1, series_2):
    """Return the squared Pearson correlation coefficient of two series.

    Parameters
    ----------
    series_1, series_2 : array-like
        Numeric sequences handed unchanged to ``numpy.corrcoef``.

    Returns
    -------
    The square of the off-diagonal entry of the resulting correlation matrix.
    """
    # Entry [0, 1] of the matrix is the correlation between the two inputs.
    pearson_r = numpy.corrcoef(series_1, series_2)[0, 1]
    return pearson_r ** 2

In [31]:
# r-squared of each column in a df to every other column

def r_squared_metrics(list_of_columns, df):
    """Compute r-squared for every unordered pair of columns of a Spark DataFrame.

    Parameters
    ----------
    list_of_columns : iterable of str
        Names of the columns to compare. A name listed more than once is
        only considered once.
    df : Spark DataFrame
        Source of the data; every listed column is cast to ``int`` before
        being compared.

    Returns
    -------
    dict
        Maps the label ``"r_{i}_{j}"`` to ``r_squared`` of columns ``i`` and
        ``j``. Each pair appears once, with ``i`` being the column that comes
        first in ``list_of_columns``. Labels are plain concatenations, so two
        different pairs whose names concatenate to the same text share a key
        (the later pair wins).
    """
    # dict.fromkeys drops repeated names while keeping first-seen order.
    columns = list(dict.fromkeys(list_of_columns))
    if len(columns) < 2:
        return {}  # no pairs to score, so do not trigger a Spark job

    # Pull every column in ONE collect. This replaces the per-pair collects
    # (a Spark job for each inner-loop iteration) and guarantees that the
    # values being correlated come from the same row, instead of relying on
    # separate jobs returning rows in the same order.
    rows = df.select(*[col(name).cast('int') for name in columns]).collect()
    series_by_column = {name: [row[name] for row in rows] for name in columns}

    r_squared_dict = {}
    # Pair each column only with the columns after it: this yields every
    # unordered pair exactly once without comparing label strings, which can
    # collide when column names themselves contain underscores.
    for position, i in enumerate(columns):
        for j in columns[position + 1:]:
            r_squared_dict[f"r_{i}_{j}"] = r_squared(series_by_column[i], series_by_column[j])

    return r_squared_dict

In [32]:
def split_by_middle_underscore(string_list):
    """Split each string into two halves around its middle underscore.

    Every string is broken into underscore-separated tokens. The first
    ``len(tokens) // 2`` tokens are re-joined as the left half and the rest
    as the right half, so an odd token count leaves the extra token on the
    right and a string without underscores produces an empty left half.

    Returns a list of ``(left, right)`` tuples, one per input string, in
    input order.
    """
    def halve(text):
        tokens = text.split('_')
        cut = len(tokens) // 2  # number of tokens that go to the left half
        return '_'.join(tokens[:cut]), '_'.join(tokens[cut:])

    return [halve(text) for text in string_list]

In [34]:
def r_squared_metrics_current_iteration(list_of_columns, df):
    """Compute r-squared for column pairs that involve an underscore-named column.

    Only columns whose name contains ``'_'`` are used as the first member of
    a pair; each of them is compared with every other listed column. A pair
    is scored once: if ``(j, i)`` was already scored, ``(i, j)`` is skipped.

    Parameters
    ----------
    list_of_columns : iterable of str
        Names of the columns to compare. A name listed more than once is
        only considered once.
    df : Spark DataFrame
        Source of the data; every listed column is cast to ``double`` before
        being compared.

    Returns
    -------
    dict
        Maps the label ``"r_{i}_{j}"`` to ``r_squared`` of columns ``i`` and
        ``j``. Labels are plain concatenations, so two different pairs whose
        names concatenate to the same text share a key (the later pair wins).
    """
    # dict.fromkeys drops repeated names while keeping first-seen order.
    columns = list(dict.fromkeys(list_of_columns))
    if len(columns) < 2 or not any('_' in name for name in columns):
        return {}  # nothing can be paired, so do not trigger a Spark job

    # Pull every column in ONE collect. This replaces the per-pair collects
    # (a Spark job for each inner-loop iteration) and guarantees that the
    # values being correlated come from the same row, instead of relying on
    # separate jobs returning rows in the same order.
    rows = df.select(*[col(name).cast('double') for name in columns]).collect()
    series_by_column = {name: [row[name] for row in rows] for name in columns}

    r_squared_dict = {}
    # Track scored pairs as tuples rather than by label text: labels of
    # different pairs can collide when names contain underscores, which would
    # make a string-based mirror check skip a pair that was never scored.
    scored_pairs = set()
    for i in columns:
        if '_' not in i:  # only underscore-named columns lead a pair
            continue
        for j in columns:
            if i == j or (j, i) in scored_pairs:  # skip self-pairs and mirrors
                continue
            scored_pairs.add((i, j))
            r_squared_dict[f"r_{i}_{j}"] = r_squared(series_by_column[i], series_by_column[j])

    return r_squared_dict

In [35]:
# split a dictionary into values above and below a certain threshold

def split_dict_by_threshold(threshold, dictionary):
    """Partition a dictionary's items by whether the value exceeds ``threshold``.

    Parameters
    ----------
    threshold : number
        Cut-off; only values strictly greater than it count as "above".
    dictionary : dict
        Mapping whose values are compared against ``threshold``. It is not
        modified.

    Returns
    -------
    tuple of (dict, dict)
        ``(original_dictionary, values_above_threshold)``: first the items
        that are not above the threshold, then the items that are. Every
        input item lands in exactly one of the two, in input order.
    """
    original_dictionary = {}
    values_above_threshold = {}
    for key, value in dictionary.items():
        if value > threshold:
            values_above_threshold[key] = value
        else:
            # "Not above" rather than "<=": NaN fails both comparisons, so a
            # `<=` test would silently drop NaN entries from both results.
            original_dictionary[key] = value

    return (original_dictionary, values_above_threshold)

In [38]:
# multinomial logistic regression with maximum likelihood estimation
# look into using the scikit-learn package

def multi_log_reg_w_mle(df, dependent_variable):
    """Fit a multinomial logit model with statsmodels and summarise the fit.

    Parameters
    ----------
    df : Spark DataFrame or pandas.DataFrame
        Input data. Anything that is not already a pandas DataFrame is
        converted with ``toPandas()``. Every column is then coerced to
        numeric; values that cannot be parsed become NaN.
    dependent_variable : str
        Name of the column to model; all remaining columns are used as
        independent variables.

    Returns
    -------
    dict
        ``'coefficients'`` and ``'p_values'`` (``result.params`` /
        ``result.pvalues`` converted with ``to_dict()``), plus ``'AIC'`` and
        ``'BIC'`` of the fitted model.

    Side effects
    ------------
    Prints the statsmodels result summary.
    """
    # The notebook's import cell never binds `sm`, so without this import the
    # calls below fail with NameError.
    import statsmodels.api as sm

    # Accept an already-converted pandas frame as well as a Spark frame.
    final_pandas_df = df if isinstance(df, pd.DataFrame) else df.toPandas()
    final_pandas_df = final_pandas_df.apply(pd.to_numeric, errors='coerce')

    # The dependent variable is the target; every other column is a regressor.
    X = final_pandas_df.drop(dependent_variable, axis=1)
    y = final_pandas_df[dependent_variable]

    # Add a constant column so the model includes an intercept.
    X = sm.add_constant(X)

    # Fit the model
    model = sm.MNLogit(y, X)
    result = model.fit()

    print(result.summary())

    # Create the summary dictionary for this dependent variable
    summary = {
        'coefficients': result.params.to_dict(),
        'p_values': result.pvalues.to_dict(),
        'AIC': result.aic,
        'BIC': result.bic
    }
    return summary

In [40]:
## alternative to function above

def multi_log_reg_sklearn(df, dependent_variable):
    """Fit a multinomial logistic regression with scikit-learn.

    Parameters
    ----------
    df : pandas.DataFrame or Spark DataFrame
        Input data; anything that is not a pandas DataFrame is converted with
        ``toPandas()``. Every column is coerced to numeric, with values that
        cannot be parsed becoming NaN.
    dependent_variable : str
        Name of the target column; all remaining columns are the features.

    Returns
    -------
    dict
        ``'coefficients'`` (the fitted model's ``coef_``) and ``'intercept'``
        (its ``intercept_``).
    """
    # Work on pandas data regardless of what was passed in.
    pandas_frame = df if isinstance(df, pd.DataFrame) else df.toPandas()
    numeric_frame = pandas_frame.apply(pd.to_numeric, errors='coerce')

    # Target is the named column; features are everything else.
    target = numeric_frame[dependent_variable]
    features = numeric_frame.drop(dependent_variable, axis=1)

    classifier = LogisticRegression(multi_class='multinomial', solver='lbfgs', max_iter=1000)
    classifier.fit(features, target)

    # Further metrics can be added to this dictionary as needed.
    return {
        'coefficients': classifier.coef_,
        'intercept': classifier.intercept_,
    }

# Solver: The 'lbfgs' solver is used for multinomial logistic regression. You can experiment with other solvers like 'newton-cg', 'sag', and 'saga'.
# Maximum Iterations: max_iter is set to 1000. Depending on your data, you might need to increase this number if the algorithm does not converge.
# Return Values: This function returns the coefficients and intercept of the model. You can modify it to return additional metrics or information as needed.
# Model Evaluation: Additional code can be added for model evaluation, such as generating a classification report or confusion matrix.

In [41]:
def combine_columns(df, col1, col2):
    """Build the average of two columns as a column expression.

    ``df`` is accepted but not used: the result is constructed only from
    ``col(col1)`` and ``col(col2)``, as their sum divided by 2.
    """
    # Example combination logic: the mean of the two columns.
    column_sum = col(col1) + col(col2)
    return column_sum / 2