In [1]:
import pandas as pd
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant


# Load the dataset; the first CSV column becomes the row index.
file_path = "MachineLearinningDataSet.csv"
df = pd.read_csv(file_path, index_col=0)

# Candidate predictors to screen for multicollinearity. This list is mutated
# by the following cells as high-VIF features are dropped one at a time.
variables = [
    'Surface area (sq. km)',
    'Agricultural land (sq. km)',
    'Land area (sq. km)',
    'Forest area (sq. km)',
    'Agricultural methane emissions (thousand metric tons of CO2 equivalent)',
    'Rural population',
    'Arable land (hectares)',
    'Agriculture, forestry, and fishing, value added (current US$)',
    'Agricultural nitrous oxide emissions (thousand metric tons of CO2 equivalent)',
    'Arable land (% of land area)',
    'Average precipitation in depth (mm per year)',
    'Agricultural land (% of land area)',
    'Land under cereal production (hectares)',
]
# Name of the response column; not used by the VIF screening in this cell.
target_column = 'Cereal production (metric tons)'

# Design matrix for the VIF computation: the selected predictors plus the
# 'const' intercept column added by add_constant.
X_selected = add_constant(df[variables])

# One VIF per design-matrix column (the intercept included), keyed by column name.
vif_data = pd.DataFrame({
    "Variable": X_selected.columns,
    "VIF": [
        variance_inflation_factor(X_selected.to_numpy(), col_idx)
        for col_idx in range(len(X_selected.columns))
    ],
})

print("\nVariance Inflation Factors:")
print(vif_data.sort_values(by='VIF', ascending=False))



Variance Inflation Factors:
                                             Variable         VIF
1                               Surface area (sq. km)  926.460388
3                                  Land area (sq. km)  921.652463
0                                               const   13.950538
13            Land under cereal production (hectares)   12.613350
7                              Arable land (hectares)   11.534940
9   Agricultural nitrous oxide emissions (thousand...    9.698696
5   Agricultural methane emissions (thousand metri...    9.284582
2                          Agricultural land (sq. km)    6.019943
6                                    Rural population    3.598688
8   Agriculture, forestry, and fishing, value adde...    2.499671
10                       Arable land (% of land area)    2.462800
12                 Agricultural land (% of land area)    2.362758
4                                Forest area (sq. km)    2.296228
11       Average precipitation in depth (mm per

In [2]:
# Drop 'Surface area (sq. km)' from the candidate list. A ValueError means it
# was already removed (e.g. the cell was re-run), in which case nothing is printed.
try:
    variables.remove('Surface area (sq. km)')
except ValueError:
    pass
else:
    print("\n'Surface area (sq. km)' removed.")

# Rebuild the design matrix (remaining predictors + 'const' intercept column).
X_selected = add_constant(df[variables])

# Recompute one VIF per design-matrix column.
vif_data = pd.DataFrame({
    "Variable": X_selected.columns,
    "VIF": [
        variance_inflation_factor(X_selected.to_numpy(), col_idx)
        for col_idx in range(len(X_selected.columns))
    ],
})

print("\nVariance Inflation Factors (after removing 'Surface area (sq. km)'):")
print(vif_data.sort_values(by='VIF', ascending=False))

# Flag predictors whose VIF exceeds 5; the intercept is not a candidate
# feature, so 'const' is excluded from the list.
high_vif_features = [
    name
    for name in vif_data.loc[vif_data['VIF'] > 5, 'Variable'].tolist()
    if name != 'const'
]

print("\nFeatures with high VIF (> 5):")
print(high_vif_features)



'Surface area (sq. km)' removed.

Variance Inflation Factors (after removing 'Surface area (sq. km)'):
                                             Variable        VIF
0                                               const  13.877116
12            Land under cereal production (hectares)  12.611774
6                              Arable land (hectares)  11.516046
8   Agricultural nitrous oxide emissions (thousand...   9.693730
4   Agricultural methane emissions (thousand metri...   9.279304
2                                  Land area (sq. km)   7.112401
1                          Agricultural land (sq. km)   6.013953
5                                    Rural population   3.594188
7   Agriculture, forestry, and fishing, value adde...   2.491698
9                        Arable land (% of land area)   2.462475
11                 Agricultural land (% of land area)   2.354741
3                                Forest area (sq. km)   2.239440
10       Average precipitation in depth (mm per yea

In [3]:
# Drop the agricultural nitrous oxide emissions feature from the candidate list.
# A ValueError means it was already removed (e.g. the cell was re-run), in which
# case nothing is printed.
try:
    variables.remove('Agricultural nitrous oxide emissions (thousand metric tons of CO2 equivalent)')
except ValueError:
    pass
else:
    print("\n'Agricultural nitrous oxide emissions (thousand metric tons of CO2 equivalent)' removed from selected features.")

# Rebuild the design matrix (remaining predictors + 'const' intercept column).
X_selected = add_constant(df[variables])

# Recompute one VIF per design-matrix column.
vif_data = pd.DataFrame({
    "Variable": X_selected.columns,
    "VIF": [
        variance_inflation_factor(X_selected.to_numpy(), col_idx)
        for col_idx in range(len(X_selected.columns))
    ],
})

print("\nVariance Inflation Factors (after removing 'Agricultural nitrous oxide emissions (thousand metric tons of CO2 equivalent)'):")
print(vif_data.sort_values(by='VIF', ascending=False))

# Flag predictors whose VIF exceeds 5; the intercept is not a candidate
# feature, so 'const' is excluded from the list.
high_vif_features = [
    name
    for name in vif_data.loc[vif_data['VIF'] > 5, 'Variable'].tolist()
    if name != 'const'
]

print("\nFeatures with high VIF (> 5):")
print(high_vif_features)



'Agricultural nitrous oxide emissions (thousand metric tons of CO2 equivalent)' removed from selected features.

Variance Inflation Factors (after removing 'Agricultural nitrous oxide emissions (thousand metric tons of CO2 equivalent)'):
                                             Variable        VIF
0                                               const  13.379096
11            Land under cereal production (hectares)  12.610243
6                              Arable land (hectares)  11.240952
2                                  Land area (sq. km)   7.098922
1                          Agricultural land (sq. km)   6.008371
5                                    Rural population   3.423741
4   Agricultural methane emissions (thousand metri...   2.895913
8                        Arable land (% of land area)   2.462348
7   Agriculture, forestry, and fishing, value adde...   2.358572
10                 Agricultural land (% of land area)   2.354704
3                                Forest area (

In [4]:
# Drop 'Arable land (hectares)' from the candidate list. A ValueError means it
# was already removed (e.g. the cell was re-run), in which case nothing is printed.
try:
    variables.remove('Arable land (hectares)')
except ValueError:
    pass
else:
    print("\n'Arable land (hectares)' removed from selected features.")

# Rebuild the design matrix (remaining predictors + 'const' intercept column).
X_selected = add_constant(df[variables])

# Recompute one VIF per design-matrix column.
vif_data = pd.DataFrame({
    "Variable": X_selected.columns,
    "VIF": [
        variance_inflation_factor(X_selected.to_numpy(), col_idx)
        for col_idx in range(len(X_selected.columns))
    ],
})

print("\nVariance Inflation Factors (after removing 'Arable land (hectares)'):")
print(vif_data.sort_values(by='VIF', ascending=False))

# Flag predictors whose VIF exceeds 5; the intercept is not a candidate
# feature, so 'const' is excluded from the list.
high_vif_features = [
    name
    for name in vif_data.loc[vif_data['VIF'] > 5, 'Variable'].tolist()
    if name != 'const'
]

print("\nFeatures with high VIF (> 5):")
print(high_vif_features)



'Arable land (hectares)' removed from selected features.

Variance Inflation Factors (after removing 'Arable land (hectares)'):
                                             Variable        VIF
0                                               const  13.376962
2                                  Land area (sq. km)   7.053345
1                          Agricultural land (sq. km)   5.786443
10            Land under cereal production (hectares)   4.496713
5                                    Rural population   3.188949
4   Agricultural methane emissions (thousand metri...   2.875748
7                        Arable land (% of land area)   2.405569
9                  Agricultural land (% of land area)   2.345233
6   Agriculture, forestry, and fishing, value adde...   2.340639
3                                Forest area (sq. km)   2.152282
8        Average precipitation in depth (mm per year)   1.493583

Features with high VIF (> 5):
['Agricultural land (sq. km)', 'Land area (sq. km)']


In [5]:
# Drop 'Land area (sq. km)' from the candidate list. A ValueError means it
# was already removed (e.g. the cell was re-run), in which case nothing is printed.
try:
    variables.remove('Land area (sq. km)')
except ValueError:
    pass
else:
    print("\n'Land area (sq. km)' removed from selected features.")

# Rebuild the design matrix (remaining predictors + 'const' intercept column).
X_selected = add_constant(df[variables])

# Recompute one VIF per design-matrix column.
vif_data = pd.DataFrame({
    "Variable": X_selected.columns,
    "VIF": [
        variance_inflation_factor(X_selected.to_numpy(), col_idx)
        for col_idx in range(len(X_selected.columns))
    ],
})

print("\nVariance Inflation Factors (after removing 'Land area (sq. km)'):")
print(vif_data.sort_values(by='VIF', ascending=False))

# Flag predictors whose VIF exceeds 5; the intercept is not a candidate
# feature, so 'const' is excluded from the list.
high_vif_features = [
    name
    for name in vif_data.loc[vif_data['VIF'] > 5, 'Variable'].tolist()
    if name != 'const'
]

print("\nFeatures with high VIF (> 5):")
print(high_vif_features)

# Report the surviving predictors (defensively skipping any 'const' entry).
print("\nFinal selected features:")
print([name for name in variables if name != 'const'])



'Land area (sq. km)' removed from selected features.

Variance Inflation Factors (after removing 'Land area (sq. km)'):
                                            Variable        VIF
0                                              const  10.154430
9            Land under cereal production (hectares)   4.474723
4                                   Rural population   3.160876
3  Agricultural methane emissions (thousand metri...   2.873508
1                         Agricultural land (sq. km)   2.488640
6                       Arable land (% of land area)   2.384182
5  Agriculture, forestry, and fishing, value adde...   2.332830
8                 Agricultural land (% of land area)   2.129998
2                               Forest area (sq. km)   1.476139
7       Average precipitation in depth (mm per year)   1.309663

Features with high VIF (> 5):
[]

Final selected features:
['Agricultural land (sq. km)', 'Forest area (sq. km)', 'Agricultural methane emissions (thousand metric tons of CO2