In [1]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import OLSInfluence

# Read the table ('flow.csv' must be in the working directory) and discard the
# 'id' column right away so it never ends up among the predictors.
df = pd.read_csv('flow.csv').drop(columns=['id'])

# Response is the 'flow' column; every other remaining column is a predictor.
y = df['flow']
X = df.drop(columns=['flow'])

# Design matrix: the predictors plus the constant column used as the intercept.
X_with_const = sm.add_constant(X)

# Ordinary least squares fit of `flow` on the design matrix.
model = sm.OLS(endog=y, exog=X_with_const).fit()

# Influence / outlier diagnostics for the fitted model.
influence = OLSInfluence(model)

# Per-observation statistics. `dffits` and `cooks_distance` each yield a pair;
# only the first element (the statistic itself) is used below.
leverage = influence.hat_matrix_diag
studentized = influence.resid_studentized_external
dffits_vals, _dffits_extra = influence.dffits
cooks_d, _cooks_extra = influence.cooks_distance

# One row per observation, one column per diagnostic.
diagnostics = pd.DataFrame(
    dict(
        Leverage=leverage,
        Studentized=studentized,
        DFFITS=dffits_vals,
        Cooks_Distance=cooks_d,
    )
)

# Cut-offs used for flagging.
#   n = number of rows in `df`, p = number of predictor columns in `X`;
#   the leverage cut-off is 2 * (p + 1) / n, where the +1 accounts for the
#   constant column added to the design matrix.
n, p = len(df), X.shape[1]
studentized_threshold = 2
cooks_threshold = 1
leverage_threshold = 2 * (p + 1) / n

# Boolean masks, one per criterion (the studentized residual is compared in
# absolute value). DFFITS is reported in `diagnostics` but not used to flag.
exceeds_leverage = diagnostics['Leverage'].gt(leverage_threshold)
exceeds_studentized = diagnostics['Studentized'].abs().gt(studentized_threshold)
exceeds_cooks = diagnostics['Cooks_Distance'].gt(cooks_threshold)

# Index labels of the observations failing each criterion.
high_leverage = diagnostics.index[exceeds_leverage].tolist()
high_studentized = diagnostics.index[exceeds_studentized].tolist()
high_cooks = diagnostics.index[exceeds_cooks].tolist()

# An observation is suspicious if it fails at least one criterion; labels are
# de-duplicated and sorted before selecting the rows.
suspicious_indices = sorted({*high_leverage, *high_studentized, *high_cooks})
flagged_rows = diagnostics.loc[suspicious_indices]

# Report the cut-offs that were applied; the trailing "\n" on the last entry
# leaves a blank line before the next section.
threshold_report = "\n".join([
    "Thresholds:",
    f"  Leverage > {leverage_threshold:.4f}",
    f"  |Studentized Residual| > {studentized_threshold}",
    f"  Cook's Distance > {cooks_threshold}\n",
])
print(threshold_report)

# Rows that exceeded at least one cut-off.
print("Flagged Points:", flagged_rows, sep="\n")

# Optional: a quick look at the first ten rows of the full diagnostics table.
print("\nFirst 10 rows of diagnostics:", diagnostics.head(10), sep="\n")


Thresholds:
  Leverage > 0.0686
  |Studentized Residual| > 2
  Cook's Distance > 1

Flagged Points:
     Leverage  Studentized    DFFITS  Cooks_Distance
3    0.042423    -3.299669 -0.694520        0.075949
7    0.075275    -1.881333 -0.536766        0.047309
21   0.079909     0.148221  0.043681        0.000320
25   0.081299    -0.400593 -0.119168        0.002379
30   0.100308     0.824091  0.275166        0.012643
35   0.037802    -2.273651 -0.450657        0.033034
46   0.071479    -0.729753 -0.202474        0.006852
54   0.085756     0.509955  0.156183        0.004083
65   0.083208     2.427973  0.731462        0.086663
68   0.039954    -3.303441 -0.673912        0.071499
70   0.020308    -2.250250 -0.323984        0.017084
73   0.150386    -0.457556 -0.192503        0.006205
77   0.054574     2.834389  0.680988        0.074203
85   0.129382    -0.051723 -0.019939        0.000067
92   0.088681    -2.192532 -0.683955        0.076248
122  0.070125     0.826973  0.227098        0.008612