In [17]:
import pandas as pd
import numpy as np

import statsmodels.api as sm
from scipy.stats import ttest_ind

In [2]:
# Load the project dataset; the relative path is resolved against the
# notebook's current working directory.
df = pd.read_csv('Project3.csv')

In [4]:
# Preview the first rows to check that the columns were parsed as expected.
df.head()

Unnamed: 0,Company,Year,Revenue,NetIncome,TotalAssets,TotalLiabilities,CurrentRatio
0,Gamma Ltd,2021,13851084,2414553,88130995,2127521,1.27
1,Beta Inc,2019,55017531,8184290,60537038,32978206,2.31
2,Gamma Ltd,2019,88505817,8887291,169739863,47960672,2.32
3,Alpha Corp,2021,51942854,941461,40082404,123357354,1.33
4,Beta Inc,2020,94993736,4275812,144433035,57153878,1.16


In [5]:
# (rows, columns) of the loaded frame.
df.shape

(50, 7)

In [6]:
# Count missing values in each column.
df.isnull().sum()

Company             0
Year                0
Revenue             0
NetIncome           0
TotalAssets         0
TotalLiabilities    0
CurrentRatio        0
dtype: int64

In [7]:
# Min-max scale revenue, net income, total assets and total liabilities to
# [0, 1] within each Year, so every value is expressed relative to the other
# rows of the same year. The scaled values are stored in new '<column>_norm'
# columns on a copy; the original frame `df` is left unchanged.
columns_to_normalize = ['Revenue', 'NetIncome', 'TotalAssets', 'TotalLiabilities']


def _min_max_scale(values):
    """Scale a Series to [0, 1] using its own min and max.

    A group whose values are all identical (for example a year with a single
    row) has a zero range; it is mapped to all zeros instead of the NaN that
    a 0/0 division would produce.
    """
    lowest = values.min()
    value_range = values.max() - lowest
    if value_range == 0:
        # Multiplying by 0.0 keeps the index and yields a float result,
        # matching the dtype of the regular scaled output.
        return values * 0.0
    return (values - lowest) / value_range


df_normalized = df.copy()

for col in columns_to_normalize:
    df_normalized[col + '_norm'] = df.groupby('Year')[col].transform(_min_max_scale)

In [9]:
# Preview the copy that carries the added *_norm columns.
df_normalized.head()

Unnamed: 0,Company,Year,Revenue,NetIncome,TotalAssets,TotalLiabilities,CurrentRatio,Revenue_norm,NetIncome_norm,TotalAssets_norm,TotalLiabilities_norm
0,Gamma Ltd,2021,13851084,2414553,88130995,2127521,1.27,0.046255,0.227532,0.493737,0.0
1,Beta Inc,2019,55017531,8184290,60537038,32978206,2.31,0.478279,0.918773,0.313233,0.205482
2,Gamma Ltd,2019,88505817,8887291,169739863,47960672,2.32,1.0,1.0,0.990448,0.311948
3,Alpha Corp,2021,51942854,941461,40082404,123357354,1.33,0.48335,0.069917,0.168903,0.944731
4,Beta Inc,2020,94993736,4275812,144433035,57153878,1.16,1.0,0.680579,0.699893,0.352784


In [11]:
# Select the regression inputs (four raw financial columns) and the response
# (NetIncome) from the un-normalized frame.
# NOTE(review): the *_norm columns built in df_normalized are not used here.
# The unscaled magnitudes are a likely cause of the very large condition
# number reported in the OLS summary — confirm whether the normalized columns
# were meant to be the features.
feature_columns = ['Revenue', 'TotalAssets', 'TotalLiabilities', 'CurrentRatio']
target_column = 'NetIncome'

X = df.loc[:, feature_columns]
y = df.loc[:, target_column]



In [12]:
# Add a constant column to X so the fitted model includes an intercept term
# (sm.OLS does not add one on its own).
X = sm.add_constant(X)


In [15]:
# Fit an ordinary least squares regression of y (NetIncome) on the columns
# of X, which include the added constant.
model = sm.OLS(y, X).fit()

In [16]:
# Display the regression report: coefficients, fit statistics and diagnostics.
model.summary()

0,1,2,3
Dep. Variable:,NetIncome,R-squared:,0.103
Model:,OLS,Adj. R-squared:,0.024
Method:,Least Squares,F-statistic:,1.295
Date:,"Sat, 05 Jul 2025",Prob (F-statistic):,0.286
Time:,00:51:28,Log-Likelihood:,-816.64
No. Observations:,50,AIC:,1643.0
Df Residuals:,45,BIC:,1653.0
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,1.712e+06,1.72e+06,0.995,0.325,-1.75e+06,5.18e+06
Revenue,0.0308,0.017,1.817,0.076,-0.003,0.065
TotalAssets,0.0057,0.008,0.675,0.503,-0.011,0.023
TotalLiabilities,-0.0037,0.012,-0.303,0.763,-0.028,0.021
CurrentRatio,4.829e+05,6.7e+05,0.721,0.475,-8.66e+05,1.83e+06

0,1,2,3
Omnibus:,6.809,Durbin-Watson:,2.05
Prob(Omnibus):,0.033,Jarque-Bera (JB):,2.973
Skew:,0.307,Prob(JB):,0.226
Kurtosis:,1.975,Cond. No.,512000000.0


In [18]:
# Hypothesis test: do Alpha Corp and Beta Inc differ in mean revenue?
# equal_var=False selects Welch's t-test, which does not assume the two
# companies have equal revenue variance.
revenue_alpha = df.loc[df['Company'] == 'Alpha Corp', 'Revenue']
revenue_beta = df.loc[df['Company'] == 'Beta Inc', 'Revenue']

t_stat, p_value = ttest_ind(a=revenue_alpha, b=revenue_beta, equal_var=False)
print(f"T-statistic: {t_stat}, P-value: {p_value}")

# Report the decision at the 5% significance level.
significance_level = 0.05
is_significant = p_value < significance_level
if is_significant:
    print("Significant difference in revenue between Alpha Corp and Beta Inc")
else:
    print("No significant difference in revenue between Alpha Corp and Beta Inc")

T-statistic: -1.0850156974479308, P-value: 0.28656167316550574
No significant difference in revenue between Alpha Corp and Beta Inc


In [19]:
# Add a 'Cost' column to df, computed as Revenue minus NetIncome.
# This modifies df in place; df_normalized (copied earlier) does not get it.
df['Cost'] = df['Revenue'].sub(df['NetIncome'])

In [20]:
# Preview the frame again to confirm the new Cost column.
df.head()

Unnamed: 0,Company,Year,Revenue,NetIncome,TotalAssets,TotalLiabilities,CurrentRatio,Cost
0,Gamma Ltd,2021,13851084,2414553,88130995,2127521,1.27,11436531
1,Beta Inc,2019,55017531,8184290,60537038,32978206,2.31,46833241
2,Gamma Ltd,2019,88505817,8887291,169739863,47960672,2.32,79618526
3,Alpha Corp,2021,51942854,941461,40082404,123357354,1.33,51001393
4,Beta Inc,2020,94993736,4275812,144433035,57153878,1.16,90717924


In [21]:
# Compare company performance: do Alpha Corp and Beta Inc differ in mean
# net income? Uses Welch's t-test (equal_var=False).
net_income_alpha = df.loc[df['Company'] == 'Alpha Corp', 'NetIncome']
net_income_beta = df.loc[df['Company'] == 'Beta Inc', 'NetIncome']

t_stat, p_value = ttest_ind(a=net_income_alpha, b=net_income_beta, equal_var=False)

print(f"T-statistic: {t_stat}, P-value: {p_value}")

# Report the decision at the 5% significance level.
significance_level = 0.05
is_significant = p_value < significance_level
if is_significant:
    print("Significant difference in net income between Alpha Corp and Beta Inc")
else:
    print("No significant difference in net income between Alpha Corp and Beta Inc")

T-statistic: -1.0565821565191842, P-value: 0.30014118972195486
No significant difference in net income between Alpha Corp and Beta Inc


In [22]:
# Compare across years: does mean net income differ between 2021 and 2023?
# Rows from all companies are pooled within each year; Welch's t-test
# (equal_var=False) is used.
net_income_2021 = df.loc[df['Year'] == 2021, 'NetIncome']
net_income_2023 = df.loc[df['Year'] == 2023, 'NetIncome']

t_stat, p_value = ttest_ind(a=net_income_2021, b=net_income_2023, equal_var=False)

print(f"T-statistic: {t_stat}, P-value: {p_value}")

# Report the decision at the 5% significance level.
significance_level = 0.05
is_significant = p_value < significance_level
if is_significant:
    print("Significant difference in net income between 2021 and 2023")
else:
    print("No significant difference in net income between 2021 and 2023")

T-statistic: 0.48406653006200023, P-value: 0.6343224754824511
No significant difference in net income between 2021 and 2023


The null hypothesis for this test is that the mean revenues of Alpha Corp and Beta Inc are the same.
Since p = 0.287 > 0.05, we fail to reject the null hypothesis.
There is no significant difference in average revenue between Alpha Corp and Beta Inc.
Similarly, the tests find no statistically significant difference in mean net income between Alpha Corp and Beta Inc (p = 0.300), or between 2021 and 2023 (p = 0.634).
Overall, no major differences are seen in average net income, whether compared by company or by year.