In [1]:
!pip install handyspark
!pip install gcsfs
!pip install openpyxl

Collecting handyspark
  Downloading handyspark-0.2.2a1-py2.py3-none-any.whl.metadata (1.2 kB)
Collecting findspark (from handyspark)
  Downloading findspark-2.0.1-py2.py3-none-any.whl.metadata (352 bytes)
Downloading handyspark-0.2.2a1-py2.py3-none-any.whl (39 kB)
Downloading findspark-2.0.1-py2.py3-none-any.whl (4.4 kB)
Installing collected packages: findspark, handyspark
Successfully installed findspark-2.0.1 handyspark-0.2.2a1


[0mCollecting openpyxl
  Downloading openpyxl-3.1.5-py2.py3-none-any.whl.metadata (2.5 kB)
Collecting et-xmlfile (from openpyxl)
  Downloading et_xmlfile-2.0.0-py3-none-any.whl.metadata (2.7 kB)
Downloading openpyxl-3.1.5-py2.py3-none-any.whl (250 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m250.9/250.9 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25hDownloading et_xmlfile-2.0.0-py3-none-any.whl (18 kB)
Installing collected packages: et-xmlfile, openpyxl
Successfully installed et-xmlfile-2.0.0 openpyxl-3.1.5
[0m

In [2]:
spark

In [2]:
import numpy as np
import gcsfs
from matplotlib import pyplot as plt
import pandas as pd
import seaborn as sns
np.bool = np.bool_

In [4]:
from handyspark import *

# Cleaning PPP data

In [5]:
#Get data from the csv's that were downloaded
ppp_df = spark.read.csv("gs://ppp-loans-bucket/landing/ppp_loans*", header=True, inferSchema=True)

                                                                                

In [6]:
ppp_df.printSchema()

root
 |-- LoanNumber: long (nullable = true)
 |-- DateApproved: string (nullable = true)
 |-- SBAOfficeCode: integer (nullable = true)
 |-- ProcessingMethod: string (nullable = true)
 |-- BorrowerName: string (nullable = true)
 |-- BorrowerAddress: string (nullable = true)
 |-- BorrowerCity: string (nullable = true)
 |-- BorrowerState: string (nullable = true)
 |-- BorrowerZip: string (nullable = true)
 |-- LoanStatusDate: string (nullable = true)
 |-- LoanStatus: string (nullable = true)
 |-- Term: string (nullable = true)
 |-- SBAGuarantyPercentage: string (nullable = true)
 |-- InitialApprovalAmount: string (nullable = true)
 |-- CurrentApprovalAmount: string (nullable = true)
 |-- UndisbursedAmount: string (nullable = true)
 |-- FranchiseName: string (nullable = true)
 |-- ServicingLenderLocationID: string (nullable = true)
 |-- ServicingLenderName: string (nullable = true)
 |-- ServicingLenderAddress: string (nullable = true)
 |-- ServicingLenderCity: string (nullable = true)
 |--

In [7]:
num_records = ppp_df.count()
print("This dataset contains {} records.".format(num_records))



This dataset contains 11468210 records.


                                                                                

In [8]:
#Count nulls in each row, so we can eventually drop them
hsdf = HandyFrame(ppp_df)
hsdf.isnull()

24/11/27 21:23:31 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
                                                                                

LoanNumber                            0
DateApproved                          0
SBAOfficeCode                        28
ProcessingMethod                      0
BorrowerName                          0
BorrowerAddress                      32
BorrowerCity                         28
BorrowerState                       165
BorrowerZip                         197
LoanStatusDate                   169089
LoanStatus                            0
Term                                  0
SBAGuarantyPercentage                 0
InitialApprovalAmount                 0
CurrentApprovalAmount                 0
UndisbursedAmount                  1143
FranchiseName                  11318087
ServicingLenderLocationID            61
ServicingLenderName                  31
ServicingLenderAddress               28
ServicingLenderCity                  28
ServicingLenderState                 29
ServicingLenderZip                   28
RuralUrbanIndicator                   0
HubzoneIndicator                      0


In [9]:
#Drop nulls, and columns with mostly nulls.
#The exception to dropping is NonProfit and FranchiseName. These are to be used in dimensions later despite their nulls.
to_drop_columns = ["MORTGAGE_INTEREST_PROCEED","RENT_PROCEED","REFINANCE_EIDL_PROCEED",
               "HEALTH_CARE_PROCEED","DEBT_INTEREST_PROCEED","UTILITIES_PROCEED"]
ignore_columns = ["NonProfit","FranchiseName"]

In [18]:
#function takes in a spark dataframe, then drops nulls in ALL rows. Accepts a list of columns to skip in this process
#This function is meant for dfs with a large amount of columns with small amount of nulls in each column.
#
#BEFORE USING THIS FUNCTION, DROP COLUMNS THAT HAVE A HIGH PERCENTAGE OF NULLS
#OTHERWISE THE WHOLE DF WILL PRACTICALLY BE DROPPED.

def drop_nulls_in_rows(df, list_of_columns_to_ignore=None):
    #get a extremely small sample just so we can find the rows we are dealing with using pandas
    sample_sdf = df.limit(1)
    pandas_df = sample_sdf.toPandas()
 
    #Now we get our list of columns that we will drop nulls in
    columns = pandas_df.columns
    
    #If any value of the list_of_columns_to_ignore is in the columns, we take that column out.
    if list_of_columns_to_ignore is not None:
        # Filter out the columns that are to be ignored
        columns = [col for col in pandas_df.columns if col not in list_of_columns_to_ignore]
    
    
    #With the columns extracted, loop through every column and filter it so we dont get nulls.
    for column in columns:
        print("Removing nulls in "+column+"...")
        df = df.filter(column+" is not NULL")
    
    return df

In [11]:
#function that drops a list of columns in a pyspark df
def drop_columns(df, list_of_columns):
    for column in list_of_columns:
        print("Removing column "+column+"...")
        df = df.drop(column)
    
    return df

In [12]:
#Dropping columns
ppp_df = drop_columns(ppp_df,to_drop_columns)

Removing column MORTGAGE_INTEREST_PROCEED...
Removing column RENT_PROCEED...
Removing column REFINANCE_EIDL_PROCEED...
Removing column HEALTH_CARE_PROCEED...
Removing column DEBT_INTEREST_PROCEED...
Removing column UTILITIES_PROCEED...


In [19]:
#Dropping all rows with a null, bar a few from the list that was created earlier
ppp_df = drop_nulls_in_rows(ppp_df,ignore_columns)

Removing nulls in LoanNumber...
Removing nulls in DateApproved...
Removing nulls in SBAOfficeCode...
Removing nulls in ProcessingMethod...
Removing nulls in BorrowerName...
Removing nulls in BorrowerAddress...
Removing nulls in BorrowerCity...
Removing nulls in BorrowerState...
Removing nulls in BorrowerZip...
Removing nulls in LoanStatusDate...
Removing nulls in LoanStatus...
Removing nulls in Term...
Removing nulls in SBAGuarantyPercentage...
Removing nulls in InitialApprovalAmount...
Removing nulls in CurrentApprovalAmount...
Removing nulls in UndisbursedAmount...
Removing nulls in ServicingLenderLocationID...
Removing nulls in ServicingLenderName...
Removing nulls in ServicingLenderAddress...
Removing nulls in ServicingLenderCity...
Removing nulls in ServicingLenderState...
Removing nulls in ServicingLenderZip...
Removing nulls in RuralUrbanIndicator...
Removing nulls in HubzoneIndicator...
Removing nulls in LMIIndicator...
Removing nulls in BusinessAgeDescription...
Removing nulls

In [20]:
#Turn date columns into datatime datatypes.
#Date Columns to transform: ("ForgivenessDate","DateApproved","LoanStatusDate")
from pyspark.sql.functions import to_date

# Convert the date columns of type string to type datetime
ppp_df = ppp_df.withColumn("forgiveness_date", to_date(ppp_df["ForgivenessDate"], "MM/dd/yyyy"))
ppp_df.select("ForgivenessDate","forgiveness_date").show()
ppp_df = ppp_df.drop("ForgivenessDate")

ppp_df = ppp_df.withColumn("date_approved", to_date(ppp_df["DateApproved"], "MM/dd/yyyy"))
ppp_df.select("DateApproved","date_approved").show()
ppp_df = ppp_df.drop("DateApproved")

ppp_df = ppp_df.withColumn("loan_status_date", to_date(ppp_df["LoanStatusDate"], "MM/dd/yyyy"))
ppp_df.select("LoanStatusDate","loan_status_date").show()
ppp_df = ppp_df.drop("LoanStatusDate")

+---------------+----------------+
|ForgivenessDate|forgiveness_date|
+---------------+----------------+
|     06/11/2021|      2021-06-11|
|     07/13/2021|      2021-07-13|
|     09/13/2022|      2022-09-13|
|     05/18/2021|      2021-05-18|
|     07/07/2021|      2021-07-07|
|     06/10/2021|      2021-06-10|
|     07/22/2021|      2021-07-22|
|     07/28/2021|      2021-07-28|
|     11/09/2021|      2021-11-09|
|     06/11/2021|      2021-06-11|
|     06/14/2021|      2021-06-14|
|     06/22/2021|      2021-06-22|
|     01/12/2022|      2022-01-12|
|     07/27/2021|      2021-07-27|
|     01/17/2023|      2023-01-17|
|     06/28/2021|      2021-06-28|
|     06/11/2021|      2021-06-11|
|     06/11/2021|      2021-06-11|
|     06/11/2021|      2021-06-11|
|     06/11/2021|      2021-06-11|
+---------------+----------------+
only showing top 20 rows

+------------+-------------+
|DateApproved|date_approved|
+------------+-------------+
|  08/08/2020|   2020-08-08|
|  04/14/2020|   2

DataFrame[LoanNumber: bigint, DateApproved: string, SBAOfficeCode: int, ProcessingMethod: string, BorrowerName: string, BorrowerAddress: string, BorrowerCity: string, BorrowerState: string, BorrowerZip: string, LoanStatus: string, Term: string, SBAGuarantyPercentage: string, InitialApprovalAmount: string, CurrentApprovalAmount: string, UndisbursedAmount: string, FranchiseName: string, ServicingLenderLocationID: string, ServicingLenderName: string, ServicingLenderAddress: string, ServicingLenderCity: string, ServicingLenderState: string, ServicingLenderZip: string, RuralUrbanIndicator: string, HubzoneIndicator: string, LMIIndicator: string, BusinessAgeDescription: string, ProjectCity: string, ProjectCountyName: string, ProjectState: string, ProjectZip: string, CD: string, JobsReported: string, NAICSCode: string, Race: string, Ethnicity: string, PAYROLL_PROCEED: string, BusinessType: string, OriginatingLenderLocationID: string, OriginatingLender: string, OriginatingLenderCity: string, Or

In [21]:
#Count nulls again to see if all nulls are dropped. Theoretically, there should be no nulls.
hsdf = HandyFrame(ppp_df)
hsdf.isnull()

                                                                                

LoanNumber                            0
DateApproved                          0
SBAOfficeCode                         0
ProcessingMethod                      0
BorrowerName                          0
BorrowerAddress                       0
BorrowerCity                          0
BorrowerState                         0
BorrowerZip                           0
LoanStatusDate                        0
LoanStatus                            0
Term                                  0
SBAGuarantyPercentage                 0
InitialApprovalAmount                 0
CurrentApprovalAmount                 0
UndisbursedAmount                     0
FranchiseName                  10258886
ServicingLenderLocationID             0
ServicingLenderName                   0
ServicingLenderAddress                0
ServicingLenderCity                   0
ServicingLenderState                  0
ServicingLenderZip                    0
RuralUrbanIndicator                   0
HubzoneIndicator                      0


In [22]:
#Count records to see how many records are dropped compared to before
new_num_records = ppp_df.count()
difference = num_records - new_num_records
print("Dropped "+str(difference)+" records. New count is "+str(new_num_records))
print("Roughly "+str(round(new_num_records/num_records,2))+"% of rows were dropped.")



Dropped 1062142 records. New count is 10406068
Roughly 0.91% of rows were dropped.


                                                                                

In [23]:
#With our cleaned data, we write it back to the cleaned folder
ppp_df.write.parquet("gs://ppp-loans-bucket/cleaned",mode="overwrite")

                                                                                

# Cleaning GPD data

In [24]:
fs = gcsfs.GCSFileSystem(project='cis4400-group-project')
with fs.open('ppp-loans-bucket/landing/gdp.csv') as f:
    gdp_df = pd.read_csv(f)
gdp_df

Unnamed: 0,GeoFIPS,GeoName,Region,TableName,LineCode,IndustryClassification,Description,Unit,2017,2018,2019,2020,2021,2022
0,0,United States,,CAGDP1,1,...,Real GDP (thousands of chained 2017 dollars),Thousands of chained 2017 dollars,19612102000,20193896000,20692087000,20234074000,21407692000,21822037000
1,0,United States,,CAGDP1,2,...,Chain-type quantity indexes for real GDP,Quantity index,100,102.967,105.507,103.171,109.156,111.268
2,0,United States,,CAGDP1,3,...,Current-dollar GDP (thousands of current dolla...,Thousands of dollars,19612102000,20656516000,21521395000,21322950000,23594031000,25744108000
3,1000,Alabama,5.0,CAGDP1,1,...,Real GDP (thousands of chained 2017 dollars),Thousands of chained 2017 dollars,216615470,220808767,224944577,222081439,231892626,235807320
4,1000,Alabama,5.0,CAGDP1,2,...,Chain-type quantity indexes for real GDP,Quantity index,100,101.936,103.845,102.523,107.053,108.86
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9529,97000,Rocky Mountain,7.0,CAGDP1,2,...,Chain-type quantity indexes for real GDP,Quantity index,100,104.559,109.552,109.329,116.661,119.324
9530,97000,Rocky Mountain,7.0,CAGDP1,3,...,Current-dollar GDP (thousands of current dolla...,Thousands of dollars,681310123,730567674,776281078,781272363,880142487,974682556
9531,98000,Far West,8.0,CAGDP1,1,...,Real GDP (thousands of chained 2017 dollars),Thousands of chained 2017 dollars,3797440495,3956948041,4108525822,4048649569,4342903004,4385657757
9532,98000,Far West,8.0,CAGDP1,2,...,Chain-type quantity indexes for real GDP,Quantity index,100,104.2,108.192,106.615,114.364,115.49


In [25]:
gdp_df.isnull().sum()

GeoFIPS                   0
GeoName                   0
Region                    3
TableName                 0
LineCode                  0
IndustryClassification    0
Description               0
Unit                      0
2017                      0
2018                      0
2019                      0
2020                      0
2021                      0
2022                      0
dtype: int64

In [26]:
gdp_df.dropna(inplace=True)

In [27]:
# Initialize GCS file system
fs = gcsfs.GCSFileSystem(project='cis4400-group-project')

# Define the GCS path for saving the file
gcs_path = 'gs://ppp-loans-bucket/cleaned/GDP.csv'

# Save DataFrame to the "cleaned" folder in the GCS bucket
with fs.open(gcs_path, 'w') as f:
    gdp_df.to_csv(f, index=False)

print(f"DataFrame saved to {gcs_path}")

DataFrame saved to gs://ppp-loans-bucket/cleaned/GDP.csv


# Cleaning NAICS data

In [3]:
fs = gcsfs.GCSFileSystem(project='cis4400-group-project')
with fs.open('ppp-loans-bucket/landing/NAICS_2-6-digit_2022_Codes.xlsx') as f:
    naics_df = pd.read_excel(f)
naics_df

Unnamed: 0,Seq. No.,2022 NAICS US Code,2022 NAICS US Title,Unnamed: 3,Unnamed: 4
0,,,,,
1,1.0,11,"Agriculture, Forestry, Fishing and Hunting",,
2,2.0,111,Crop Production,,
3,3.0,1111,Oilseed and Grain Farming,,
4,4.0,11111,Soybean Farming,,
...,...,...,...,...,...
2121,2121.0,9281,National Security and International Affairs,,
2122,2122.0,92811,National Security,,
2123,2123.0,928110,National Security,,
2124,2124.0,92812,International Affairs,,


In [4]:
naics_df.isnull().sum()

Seq. No.                   1
2022 NAICS US   Code       1
2022 NAICS US Title        1
Unnamed: 3              2126
Unnamed: 4              2125
dtype: int64

In [5]:
# Dropping the "Unnamed: 2" column. It needs a bit of extra work to drop

# Check the columns to verify the exact name
print(naics_df.columns)

# Strip extra spaces from column names
naics_df.columns = naics_df.columns.str.strip()

# Drop unnecessary columns
naics_df.drop('Unnamed: 3', axis=1, errors='ignore', inplace=True)
naics_df.drop('Unnamed: 4', axis=1, errors='ignore', inplace=True)
naics_df.drop('Seq. No.', axis=1, inplace=True)
# Display the first few rows
naics_df.head()

Index(['Seq. No.', '2022 NAICS US   Code', '2022 NAICS US Title', 'Unnamed: 3',
       'Unnamed: 4'],
      dtype='object')


Unnamed: 0,2022 NAICS US Code,2022 NAICS US Title
0,,
1,11.0,"Agriculture, Forestry, Fishing and Hunting"
2,111.0,Crop Production
3,1111.0,Oilseed and Grain Farming
4,11111.0,Soybean Farming


In [6]:
#Drop null rows
naics_df.dropna(inplace=True)

In [33]:
def naics_create_industry_type_col(df):
    
    if '2022 NAICS US   Code' in df.columns:
        df.rename(columns={'2022 NAICS US   Code': '2022 NAICS US Code'}, inplace=True)
    
    # Step 1: Extract the 2-digit NAICS code
    df["2_digit_code"] = df["2022 NAICS US Code"].astype(str).str[:2]

    # Step 2: Create a mapping for 2-digit code to title
    mapping = df.loc[df["2022 NAICS US Code"].astype(str).str.len() == 2, 
                     ["2022 NAICS US Code", "2022 NAICS US Title"]]
    mapping = mapping.set_index("2022 NAICS US Code")["2022 NAICS US Title"].to_dict()

    # Step 3: Use the 2-digit code to map the industry type
    df["industry_type"] = df["2_digit_code"].astype(int).map(mapping)

    # Drop the helper column if no longer needed
    df = df.drop(columns=["2_digit_code"])
    
    return df

In [38]:
naics_df = naics_create_industry_type_col(naics_df)
naics_df

Unnamed: 0,2022 NAICS US Code,2022 NAICS US Title,industry_type
1,11,"Agriculture, Forestry, Fishing and Hunting","Agriculture, Forestry, Fishing and Hunting"
2,111,Crop Production,"Agriculture, Forestry, Fishing and Hunting"
3,1111,Oilseed and Grain Farming,"Agriculture, Forestry, Fishing and Hunting"
4,11111,Soybean Farming,"Agriculture, Forestry, Fishing and Hunting"
5,111110,Soybean Farming,"Agriculture, Forestry, Fishing and Hunting"
...,...,...,...
2121,9281,National Security and International Affairs,Public Administration
2122,92811,National Security,Public Administration
2123,928110,National Security,Public Administration
2124,92812,International Affairs,Public Administration


In [39]:
# Initialize GCS file system
fs = gcsfs.GCSFileSystem(project='cis4400-group-project')

# Define the GCS path for saving the file
gcs_path = 'gs://ppp-loans-bucket/cleaned/NAICS.csv'

# Save DataFrame to the "cleaned" folder in the GCS bucket
with fs.open(gcs_path, 'w') as f:
    naics_df.to_csv(f, index=False)

print(f"DataFrame saved to {gcs_path}")

DataFrame saved to gs://ppp-loans-bucket/cleaned/NAICS.csv
