In [1]:
# Data Handling
import pandas as pd
import numpy as np

# Visulaization
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

In [3]:
df = pd.read_csv('datasets/Fish.csv')
df

Unnamed: 0,Species,Weight,Length1,Length2,Length3,Height,Width
0,Bream,242.0,23.2,25.4,30.0,11.5200,4.0200
1,Bream,290.0,24.0,26.3,31.2,12.4800,4.3056
2,Bream,340.0,23.9,26.5,31.1,12.3778,4.6961
3,Bream,363.0,26.3,29.0,33.5,12.7300,4.4555
4,Bream,430.0,26.5,29.0,34.0,12.4440,5.1340
...,...,...,...,...,...,...,...
154,Smelt,12.2,11.5,12.2,13.4,2.0904,1.3936
155,Smelt,13.4,11.7,12.4,13.5,2.4300,1.2690
156,Smelt,12.2,12.1,13.0,13.8,2.2770,1.2558
157,Smelt,19.7,13.2,14.3,15.2,2.8728,2.0672


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 159 entries, 0 to 158
Data columns (total 7 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   Species  159 non-null    object 
 1   Weight   159 non-null    float64
 2   Length1  159 non-null    float64
 3   Length2  159 non-null    float64
 4   Length3  159 non-null    float64
 5   Height   159 non-null    float64
 6   Width    159 non-null    float64
dtypes: float64(6), object(1)
memory usage: 8.8+ KB


In [4]:
dff = df.drop('Species', axis = 1)

    Assumption : 
    linearity
    normality
    no multicollinarity
    homoscedasticiy
    autocorrelation

In [8]:
# linearity : correlation with dependent variable
from scipy.stats import pearsonr

correlation_df = pd.DataFrame(columns = ['Feature', 'Correlation', 'P-Value', 'Significance'])

X = dff.drop('Weight', axis = 1)
y = dff['Weight']

for feature in X.columns:
    # Calculate the Pearson correlation coefficient and its p-value
    correlation, p_value = pearsonr(X[feature], df['Weight'])

    # Determine whether to accept or reject the linearity assumption based on correlation and p-value
    if p_value < 0.05:#abs(correlation) >= 0.5 and p_value < 0.05
        acceptance_status = 'Accept'
    else:
        acceptance_status = 'Reject'

    # Append the results to the correlation DataFrame
    correlation_df = correlation_df.append({'Feature': feature, 'Correlation': correlation, 'P-Value': p_value, 'Significance': acceptance_status}, ignore_index=True)

# Display the correlation DataFrame
print(correlation_df)

   Feature  Correlation       P-Value Significance
0  Length1     0.915712  4.749620e-64       Accept
1  Length2     0.918618  3.395113e-65       Accept
2  Length3     0.923044  5.023191e-67       Accept
3   Height     0.724345  3.842342e-27       Accept
4    Width     0.886507  2.038195e-54       Accept


In [9]:
# normality
from scipy.stats import anderson

anderson_stat, anderson_critical_values, anderson_significance_levels = anderson(dff['Weight'])
print(f"Anderson-Darling Test\n")
print(f"Statistic: {anderson_stat}")
print(f"Anderson_Critical_Values : {anderson_critical_values}")
print("Anderson_Significance_Levels:", anderson_significance_levels)

# Perform the Anderson-Darling test
for crit_val, sig_level in zip(anderson_critical_values, anderson_significance_levels):
    if anderson_stat > crit_val:
        print(f"At {sig_level}% significance level : Data not normally distributed")
    else:
        print(f"At {sig_level}% significance level : Data normally distributed")

Anderson-Darling Test

Statistic: 5.634848337876235
Anderson_Critical_Values : [0.562 0.641 0.768 0.896 1.066]
Anderson_Significance_Levels: [15.  10.   5.   2.5  1. ]
At 15.0% significance level : Data not normally distributed
At 10.0% significance level : Data not normally distributed
At 5.0% significance level : Data not normally distributed
At 2.5% significance level : Data not normally distributed
At 1.0% significance level : Data not normally distributed


In [10]:
# no multicollinearity

from statsmodels.stats.outliers_influence import variance_inflation_factor

#Independent Variable
X = dff.drop('Weight', axis = 1)

x1_vif = [variance_inflation_factor(X.values, i) for i in range(len(X.columns))]

vif_df = pd.DataFrame({'Variable': (X.columns), 'VIF': x1_vif})
vif_df

Unnamed: 0,Variable,VIF
0,Length1,12782.53583
1,Length2,16598.744318
2,Length3,3380.815907
3,Height,76.064364
4,Width,92.658672


In [11]:
X.corr()

Unnamed: 0,Length1,Length2,Length3,Height,Width
Length1,1.0,0.999517,0.992031,0.625378,0.86705
Length2,0.999517,1.0,0.994103,0.640441,0.873547
Length3,0.992031,0.994103,1.0,0.703409,0.87852
Height,0.625378,0.640441,0.703409,1.0,0.792881
Width,0.86705,0.873547,0.87852,0.792881,1.0


In [13]:
import statsmodels.api as sm

vif_col = list(vif_df.Variable)

X1 = df[vif_col]
y1 = dff['Weight']

model1 = sm.OLS(y1,X1).fit()
model1.summary()

0,1,2,3
Dep. Variable:,Weight,R-squared (uncentered):,0.854
Model:,OLS,Adj. R-squared (uncentered):,0.849
Method:,Least Squares,F-statistic:,179.7
Date:,"Fri, 29 Sep 2023",Prob (F-statistic):,2.2399999999999998e-62
Time:,10:40:11,Log-Likelihood:,-1071.7
No. Observations:,159,AIC:,2153.0
Df Residuals:,154,BIC:,2169.0
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Length1,202.0690,66.391,3.044,0.003,70.914,333.224
Length2,-89.5220,69.967,-1.279,0.203,-227.742,48.698
Length3,-82.6718,28.784,-2.872,0.005,-139.534,-25.809
Height,55.7740,14.470,3.854,0.000,27.188,84.360
Width,-51.1129,33.577,-1.522,0.130,-117.444,15.218

0,1,2,3
Omnibus:,59.832,Durbin-Watson:,0.423
Prob(Omnibus):,0.0,Jarque-Bera (JB):,143.316
Skew:,1.628,Prob(JB):,7.570000000000001e-32
Kurtosis:,6.322,Cond. No.,310.0


In [16]:
X = dff.drop(['Weight'], axis = 1)
y = dff['Weight']

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y,test_size = 0.2, random_state = 0)

from sklearn.linear_model import LinearRegression

linear_model = LinearRegression().fit(X_train, y_train)

ytest_predicted = linear_model.predict(X_test)

from sklearn.metrics import r2_score

ytrain_predicted = linear_model.predict(X_train)
train_accuracy = r2_score(y_train, ytrain_predicted)

test_accuracy = r2_score(y_test, ytest_predicted)

print(f"train_accuracy = {train_accuracy}")
print(f"test_accuracy = {test_accuracy}")

train_accuracy = 0.8865221536639928
test_accuracy = 0.8632643733814438


In [17]:
X = dff.drop(['Weight','Length2','Width'], axis = 1)
y = dff['Weight']

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y,test_size = 0.2, random_state = 0)

from sklearn.linear_model import LinearRegression

linear_model = LinearRegression().fit(X_train, y_train)

ytest_predicted = linear_model.predict(X_test)

from sklearn.metrics import r2_score

ytrain_predicted = linear_model.predict(X_train)
train_accuracy = r2_score(y_train, ytrain_predicted)

test_accuracy = r2_score(y_test, ytest_predicted)

print(f"train_accuracy = {train_accuracy}")
print(f"test_accuracy = {test_accuracy}")

train_accuracy = 0.885279754118748
test_accuracy = 0.8638707806596198


In [20]:
from scipy.stats import yeojohnson

transformed_data, lambda_value = yeojohnson(dff['Weight'])

print(f'Lambda Value: {lambda_value} \nTransformed data:{transformed_data}')

Lambda Value: 0.3341248882092676 
Transformed data:[15.76489425 16.92939186 18.01330476 18.47645667 19.72330258 20.0702035
 20.89479698 18.99592806 20.0702035  20.89479698 20.48971343 20.89479698
 20.89479698 18.01330476 22.39240686 22.39240686 23.73202554 23.73202554
 22.53276132 23.07936282 22.0345816  23.53957527 22.67159438 23.47480298
 23.73202554 24.04677183 23.98440667 23.90918717 25.52080609 27.11018306
 26.28393982 26.65107497 26.33695009 26.85686135 26.59918122  7.35763261
  9.38322251  9.89362596 10.36663864 11.86651837  0.         11.44435739
 11.86651837 13.00790364 12.82888668 13.35442932 12.64573945 13.35442932
 13.65424858 13.38828517 14.61252341 14.00667081 16.92939186 16.50886434
 18.99592806 16.461011   16.461011   17.28888421 21.51581436 24.94972317
 27.11018306  8.49404579  8.8270208  10.51711678 11.86651837 13.00790364
 12.64573945 13.68690368 12.82888668 14.61252341 16.53270351 17.15557033
  2.71362959  6.63351976  7.35763261  8.24899393  9.44201758 10.99604779
 

In [21]:
dff['transform_weight'] = transformed_data
dff

Unnamed: 0,Weight,Length1,Length2,Length3,Height,Width,transform_weight
0,242.0,23.2,25.4,30.0,11.5200,4.0200,15.764894
1,290.0,24.0,26.3,31.2,12.4800,4.3056,16.929392
2,340.0,23.9,26.5,31.1,12.3778,4.6961,18.013305
3,363.0,26.3,29.0,33.5,12.7300,4.4555,18.476457
4,430.0,26.5,29.0,34.0,12.4440,5.1340,19.723303
...,...,...,...,...,...,...,...
154,12.2,11.5,12.2,13.4,2.0904,1.3936,4.094766
155,13.4,11.7,12.4,13.5,2.4300,1.2690,4.303848
156,12.2,12.1,13.0,13.8,2.2770,1.2558,4.094766
157,19.7,13.2,14.3,15.2,2.8728,2.0672,5.244497


In [22]:
X = dff.drop(['Weight','transform_weight'], axis = 1)
y = dff['transform_weight']

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y,test_size = 0.2, random_state = 0)

from sklearn.linear_model import LinearRegression

linear_model = LinearRegression().fit(X_train, y_train)

ytest_predicted = linear_model.predict(X_test)

from sklearn.metrics import r2_score

ytrain_predicted = linear_model.predict(X_train)
train_accuracy = r2_score(y_train, ytrain_predicted)

test_accuracy = r2_score(y_test, ytest_predicted)

print(f"train_accuracy = {train_accuracy}")
print(f"test_accuracy = {test_accuracy}")

train_accuracy = 0.9921439137538045
test_accuracy = 0.9211020140574373


In [24]:
X1 = dff.drop(['Weight','transform_weight'], axis = 1)
y1 = dff['transform_weight']

model1 = sm.OLS(y1,X1).fit()
model1.summary()

0,1,2,3
Dep. Variable:,transform_weight,R-squared (uncentered):,0.994
Model:,OLS,Adj. R-squared (uncentered):,0.993
Method:,Least Squares,F-statistic:,4745.0
Date:,"Fri, 29 Sep 2023",Prob (F-statistic):,1.1200000000000001e-166
Time:,10:56:21,Log-Likelihood:,-286.96
No. Observations:,159,AIC:,583.9
Df Residuals:,154,BIC:,599.3
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Length1,0.8425,0.477,1.765,0.080,-0.100,1.785
Length2,-0.0034,0.503,-0.007,0.995,-0.997,0.990
Length3,-0.4732,0.207,-2.287,0.024,-0.882,-0.064
Height,0.6447,0.104,6.197,0.000,0.439,0.850
Width,0.9220,0.241,3.820,0.000,0.445,1.399

0,1,2,3
Omnibus:,163.066,Durbin-Watson:,1.208
Prob(Omnibus):,0.0,Jarque-Bera (JB):,6221.469
Skew:,-3.51,Prob(JB):,0.0
Kurtosis:,32.83,Cond. No.,310.0


In [25]:
X = dff.drop(['Weight','Length2','transform_weight'], axis = 1)
y = dff['transform_weight']

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y,test_size = 0.2, random_state = 0)

from sklearn.linear_model import LinearRegression

linear_model = LinearRegression().fit(X_train, y_train)

ytest_predicted = linear_model.predict(X_test)

from sklearn.metrics import r2_score

ytrain_predicted = linear_model.predict(X_train)
train_accuracy = r2_score(y_train, ytrain_predicted)

test_accuracy = r2_score(y_test, ytest_predicted)

print(f"train_accuracy = {train_accuracy}")
print(f"test_accuracy = {test_accuracy}")

train_accuracy = 0.9919740894991007
test_accuracy = 0.9207647194007211
