<h2>Data Cleaning and Wrangling

In [16]:
# Initializing spark session 
from pyspark.sql import SparkSession 
spark = SparkSession.builder.appName("Test").getOrCreate() 

# Import relevant packages 
import pandas as pd
from pyspark.sql import functions as F
from pyspark.sql.functions import * 
from pyspark.sql import DataFrame 
from pyspark.sql.types import StructType, StructField, StringType, DoubleType
import builtins

In [17]:
# define data path
data_path = "/Users/lanzia/Desktop/Study/FS 2025/01.1st Semester/Introduction to Data in Business Analytics/Project/Data/Saved_Tables/"

In [18]:
# read the data
AI_usage = pd.read_excel(data_path + "01_AI_usage_wth_fin_data.xlsx", engine="openpyxl")
AI_acceptance = pd.read_excel(data_path + "01_AI_acceptance_wth_fin_data.xlsx", engine="openpyxl")

AI_usage = spark.createDataFrame(AI_usage)
AI_acceptance = spark.createDataFrame(AI_acceptance)

In [19]:
# define the columns needed to be cleaned later
ranges = [(202319, 202326), (202401, 202426), (202501, 202518)]
numeric_fields = [StructField(str(i), DoubleType(), True) 
                  for start, end in ranges for i in range(start, end + 1)
                  ]
cols_to_clean = ['avg_' + str(i) for start, end in ranges for i in range(start, end + 1)]

In [20]:
# define financial data columns
fin_data_cols = ['avg_Closing_Price',
                 'avg_Market_Value',
                 'avg_Sales',
                 'avg_1_Year_Growth_Total_Sales',
                 'avg_FE_Sales_Mean_FY1_Roll',
                 'avg_EBITDA_Oper_Income',
                 'avg_FE_Ebitda_Mean_FY1_Roll',
                 'avg_Earns_Per_Share',
                 'avg_FE_Eps_Mean_FY1_Roll',
                 'avg_FE_Val_Roe_Mean_FY1_Roll',
                 'avg_Return_on_Avg_Total_Assets',
                 'avg_FE_Val_Pe_Mean_FY1_Roll',
                 'avg_Resrch_and_Develop_Expense'
                 ]

all_columns = cols_to_clean + fin_data_cols

In [21]:
# round the digits of usage rate
for c in all_columns:
    if c in AI_usage.columns:
        AI_usage = (AI_usage
                    .withColumn(c, F.when(F.isnan(F.col(c)), None).otherwise(F.col(c)))
                    .withColumn(c, round(c, 1))
                    )

AI_usage.show()

+--------------------+-----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+-----------------+----------------+---------+-----------------+-----------------------------+--------------------------+----------------------+---------------------------+-------------------+------------------------+----------------------------+------------------------------+---------------------------+------------------------------+
|              Sector|Question ID|avg_202319|avg_202320|av

In [22]:
# round the digits of acceptance rate
for c in all_columns:
    if c in AI_acceptance.columns:
        AI_acceptance = (AI_acceptance
                    .withColumn(c, F.when(F.isnan(F.col(c)), None).otherwise(F.col(c)))
                    .withColumn(c, round(c, 1))
                    )

AI_acceptance.show()

+--------------------+-----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+-----------------+----------------+---------+-----------------+-----------------------------+--------------------------+----------------------+---------------------------+-------------------+------------------------+----------------------------+------------------------------+---------------------------+------------------------------+
|              Sector|Question ID|avg_202319|avg_202320|av

In [23]:
# create average usage rate
AI_usage_wth_avg = (AI_usage
                    .withColumn('two_year_avg', round((builtins.sum((F.col(c) for c in cols_to_clean), F.lit(0.0)) / len(cols_to_clean)),1))
                    .select('Sector', 'two_year_avg', *cols_to_clean, *fin_data_cols)
                    .fillna(0.0)
                    )

AI_usage_wth_avg.show()

+--------------------+------------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+-----------------+----------------+---------+-----------------------------+--------------------------+----------------------+---------------------------+-------------------+------------------------+----------------------------+------------------------------+---------------------------+------------------------------+
|              Sector|two_year_avg|avg_202319|avg_202320|avg_202321|avg_202

In [24]:
# create average acceptance rate
AI_acceptance_wth_avg = (AI_acceptance
                    .withColumn('two_year_avg', round((builtins.sum((F.col(c) for c in cols_to_clean), F.lit(0.0)) / len(cols_to_clean)),1))
                    .select('Sector', 'two_year_avg', *cols_to_clean, *fin_data_cols)
                    .fillna(0.0)
                    )

AI_acceptance_wth_avg.show()

+--------------------+------------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+-----------------+----------------+---------+-----------------------------+--------------------------+----------------------+---------------------------+-------------------+------------------------+----------------------------+------------------------------+---------------------------+------------------------------+
|              Sector|two_year_avg|avg_202319|avg_202320|avg_202321|avg_202

In [25]:
# save the tables
AI_usage_wth_avg_pd = AI_usage_wth_avg.toPandas()
AI_acceptance_wth_avg_pd = AI_acceptance_wth_avg.toPandas()

In [26]:
# adjust for one missing value: Utilities sector did not disclose R&D
mean_value_usage = AI_usage_wth_avg_pd.loc[AI_usage_wth_avg_pd['avg_Resrch_and_Develop_Expense'] != 0, 'avg_Resrch_and_Develop_Expense'].mean()
AI_usage_wth_avg_pd.loc[(AI_usage_wth_avg_pd['Sector'] == 'Utilities') & (AI_usage_wth_avg_pd['avg_Resrch_and_Develop_Expense'] == 0.0),
                        'avg_Resrch_and_Develop_Expense'] = mean_value_usage

AI_usage_wth_avg_pd.loc[AI_usage_wth_avg_pd['Sector'] == 'Utilities', 'avg_Resrch_and_Develop_Expense'] = \
    AI_usage_wth_avg_pd.loc[AI_usage_wth_avg_pd['Sector'] == 'Utilities', 'avg_Resrch_and_Develop_Expense'].round(1)

In [27]:
AI_usage_wth_avg_pd

Unnamed: 0,Sector,two_year_avg,avg_202319,avg_202320,avg_202321,avg_202322,avg_202323,avg_202324,avg_202325,avg_202326,...,avg_1_Year_Growth_Total_Sales,avg_FE_Sales_Mean_FY1_Roll,avg_EBITDA_Oper_Income,avg_FE_Ebitda_Mean_FY1_Roll,avg_Earns_Per_Share,avg_FE_Eps_Mean_FY1_Roll,avg_FE_Val_Roe_Mean_FY1_Roll,avg_Return_on_Avg_Total_Assets,avg_FE_Val_Pe_Mean_FY1_Roll,avg_Resrch_and_Develop_Expense
0,Business Services,10.7,5.9,7.9,6.2,6.3,6.8,8.1,8.1,8.2,...,9.3,11367.0,2871.9,3258.0,6.6,8.4,32.0,11.5,48.0,328.9
1,Consumer Cyclicals,3.9,2.6,2.3,2.6,2.7,3.4,3.4,3.3,3.4,...,3.0,42980.8,5537.3,5129.3,32.4,29.6,32.8,11.8,27.8,1545.9
2,Consumer Non-Cyclicals,0.0,0.0,0.0,0.0,0.0,0.0,2.5,0.0,0.0,...,1.0,76896.6,8872.7,10220.3,3.9,5.5,58.0,8.4,19.7,2864.8
3,Consumer Services,5.5,3.3,3.3,3.8,3.4,4.1,3.9,4.4,4.0,...,10.6,22256.1,4115.3,4762.3,10.1,13.4,31.6,7.6,35.8,833.6
4,Energy,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2.5,59302.6,11270.3,12851.2,5.0,5.4,15.7,5.8,16.9,471.3
5,Finance,8.8,5.6,4.7,5.7,6.5,6.3,6.1,7.2,7.3,...,11.2,23605.9,5258.4,4592.8,7.8,9.1,17.6,4.3,29.4,2018.7
6,Healthcare,6.4,3.7,3.7,3.5,4.1,4.7,4.4,4.7,4.8,...,6.0,55474.1,5777.9,7014.4,7.8,11.8,39.9,6.5,19.7,2825.1
7,Industrials,2.7,1.5,1.5,1.8,1.5,1.9,2.0,2.0,2.1,...,5.1,23594.0,3733.1,4089.5,10.2,11.3,26.0,9.1,26.4,677.5
8,Technology,19.7,13.9,13.8,11.7,13.8,16.6,14.2,18.7,22.0,...,11.7,38362.8,12271.2,15816.3,6.6,9.3,47.7,12.5,34.5,4589.1
9,Utilities,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2.6,14953.4,5303.0,5863.3,4.0,4.5,13.1,3.0,19.6,1795.0


In [28]:
# adjust for one value
mean_value_acceptance = AI_acceptance_wth_avg_pd.loc[AI_acceptance_wth_avg_pd['avg_Resrch_and_Develop_Expense'] != 0, 'avg_Resrch_and_Develop_Expense'].mean()
AI_acceptance_wth_avg_pd.loc[(AI_acceptance_wth_avg_pd['Sector'] == 'Utilities') & (AI_acceptance_wth_avg_pd['avg_Resrch_and_Develop_Expense'] == 0.0),
                        'avg_Resrch_and_Develop_Expense'] = mean_value_acceptance

AI_acceptance_wth_avg_pd.loc[AI_acceptance_wth_avg_pd['Sector'] == 'Utilities', 'avg_Resrch_and_Develop_Expense'] = \
    AI_acceptance_wth_avg_pd.loc[AI_acceptance_wth_avg_pd['Sector'] == 'Utilities', 'avg_Resrch_and_Develop_Expense'].round(1)

In [29]:
AI_acceptance_wth_avg_pd

Unnamed: 0,Sector,two_year_avg,avg_202319,avg_202320,avg_202321,avg_202322,avg_202323,avg_202324,avg_202325,avg_202326,...,avg_1_Year_Growth_Total_Sales,avg_FE_Sales_Mean_FY1_Roll,avg_EBITDA_Oper_Income,avg_FE_Ebitda_Mean_FY1_Roll,avg_Earns_Per_Share,avg_FE_Eps_Mean_FY1_Roll,avg_FE_Val_Roe_Mean_FY1_Roll,avg_Return_on_Avg_Total_Assets,avg_FE_Val_Pe_Mean_FY1_Roll,avg_Resrch_and_Develop_Expense
0,Business Services,14.1,10.1,10.8,10.4,10.7,10.5,11.7,10.6,10.7,...,9.3,11367.0,2871.9,3258.0,6.6,8.4,32.0,11.5,48.0,328.9
1,Consumer Cyclicals,5.5,4.0,3.9,4.6,4.7,5.3,4.7,3.7,4.7,...,3.0,42980.8,5537.3,5129.3,32.4,29.6,32.8,11.8,27.8,1545.9
2,Consumer Non-Cyclicals,0.0,0.0,0.0,5.4,0.0,0.0,2.5,0.0,0.0,...,1.0,76896.6,8872.7,10220.3,3.9,5.5,58.0,8.4,19.7,2864.8
3,Consumer Services,7.7,5.4,4.9,6.0,5.3,6.4,5.8,5.6,5.5,...,10.6,22256.1,4115.3,4762.3,10.1,13.4,31.6,7.6,35.8,833.6
4,Energy,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2.5,59302.6,11270.3,12851.2,5.0,5.4,15.7,5.8,16.9,471.3
5,Finance,12.9,10.2,10.3,10.2,10.5,10.0,11.0,9.1,10.3,...,11.2,23605.9,5258.4,4592.8,7.8,9.1,17.6,4.3,29.4,2018.7
6,Healthcare,9.4,5.8,5.9,6.3,6.4,7.2,6.9,6.9,6.8,...,6.0,55474.1,5777.9,7014.4,7.8,11.8,39.9,6.5,19.7,2825.1
7,Industrials,4.3,3.0,3.0,2.9,2.8,2.8,4.0,3.3,3.3,...,5.1,23594.0,3733.1,4089.5,10.2,11.3,26.0,9.1,26.4,677.5
8,Technology,25.4,21.2,18.6,19.3,21.8,23.9,21.8,21.5,24.0,...,11.7,38362.8,12271.2,15816.3,6.6,9.3,47.7,12.5,34.5,4589.1
9,Utilities,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2.6,14953.4,5303.0,5863.3,4.0,4.5,13.1,3.0,19.6,1795.0


In [30]:
# save the updated tables:
AI_usage_wth_avg_pd.to_excel(data_path + "02_AI_usage_upd.xlsx", index=False)
AI_acceptance_wth_avg_pd.to_excel(data_path + "02_AI_acceptance_upd.xlsx", index=False)