In [33]:
import numpy as np
import pandas as pd

from sklearn import preprocessing
from sklearn.preprocessing import Normalizer, PolynomialFeatures
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
from sklearn.linear_model import LinearRegression
from sklearn.impute import SimpleImputer  
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt

In [4]:
def load_df(file):
    """Load the dataset from a headerless CSV and mean-impute missing values.

    Parameters
    ----------
    file : str or path-like
        Location of the CSV, passed straight to ``pd.read_csv``. The file is
        read without a header row and ``'?'`` entries are treated as missing.

    Returns
    -------
    pandas.DataFrame
        The columns from position 5 onward, with every missing entry replaced
        by the mean of its column. Columns are labelled with consecutive
        integers starting at 0.
    """
    df = pd.read_csv(file, header=None, na_values='?')

    # Keep only the columns from position 5 onward.
    # NOTE(review): presumably the first five are non-numeric / identifier
    # fields that the mean imputer could not handle -- confirm against the data.
    df = df.iloc[:, 5:]

    # Relabel the remaining columns 0..n-1. Assigning to `.columns` avoids
    # `set_axis(..., inplace=True)`, whose `inplace` keyword no longer exists in
    # pandas 2.x, and sizes the labels from the frame instead of assuming 123.
    df.columns = range(df.shape[1])

    # The imputation statistics are computed over every row that was loaded.
    imp = SimpleImputer(strategy="mean")
    df = pd.DataFrame(imp.fit_transform(df))

    return df

In [5]:
# Load and impute the dataset from the local CSV, then preview the first rows.
df = load_df('communities.csv')
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,113,114,115,116,117,118,119,120,121,122
0,0.19,0.33,0.02,0.9,0.12,0.17,0.34,0.47,0.29,0.32,...,0.12,0.26,0.2,0.06,0.04,0.9,0.5,0.32,0.14,0.2
1,0.0,0.16,0.12,0.74,0.45,0.07,0.26,0.59,0.35,0.27,...,0.02,0.12,0.45,0.163103,0.076708,0.698589,0.440439,0.0,0.195078,0.67
2,0.0,0.42,0.49,0.56,0.17,0.04,0.39,0.47,0.28,0.32,...,0.01,0.21,0.02,0.163103,0.076708,0.698589,0.440439,0.0,0.195078,0.43
3,0.04,0.77,1.0,0.08,0.12,0.1,0.51,0.5,0.34,0.21,...,0.02,0.39,0.28,0.163103,0.076708,0.698589,0.440439,0.0,0.195078,0.12
4,0.01,0.55,0.02,0.95,0.09,0.05,0.38,0.38,0.23,0.36,...,0.04,0.09,0.02,0.163103,0.076708,0.698589,0.440439,0.0,0.195078,0.03


K-fold cross-validation

array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1.])

In [18]:
# Number of rows (samples) in the loaded frame.
df.shape[0]

1994

In [36]:
# Evaluate a LinearRegression model with k-fold cross-validation.
# The last column of `df` is used as the target; every other column is a feature.
#
# NOTE(review): the recorded run shows fold 5 with MSE ~0.87 and R2 ~-17 while
# folds 1-4 have R2 between ~0.61 and ~0.70, so the averages printed at the end
# are dominated by a single fold. The cause is not established here -- TODO
# investigate (candidates: row-order effects from the unshuffled split, or an
# unstable unregularised fit on the imputed features).
# NOTE(review): load_df fits its mean imputer on the whole dataset before this
# split, so test-fold rows contribute to the imputed values -- TODO decide
# whether imputation should be fitted on each training fold instead.
k = 5
kf = KFold(n_splits=k)  # shuffle is left at its default (off)

mse_values = []
score_values = []
# enumerate(..., start=1) supplies the 1-based fold number used in the report,
# replacing a hand-maintained counter.
for it, (train_index, test_index) in enumerate(kf.split(df), start=1):
    df_train = df.iloc[train_index]
    df_test = df.iloc[test_index]

    # `-1:` (rather than `-1`) keeps the target as a single-column DataFrame.
    X_train = df_train.iloc[:, :-1]
    y_train = df_train.iloc[:, -1:]

    X_test = df_test.iloc[:, :-1]
    y_test = df_test.iloc[:, -1:]

    # A fresh model is fitted for every fold.
    reg = LinearRegression().fit(X_train, y_train)
    y_pred = reg.predict(X_test)

    mse = mean_squared_error(y_test, y_pred)
    score = r2_score(y_test, y_pred)

    print(f'mse for fold {it}: ', mse)
    print(f'R2 for fold {it}: ', score)
    print()

    mse_values.append(mse)
    score_values.append(score)

# Unweighted mean of the per-fold metrics.
avg_mse = np.mean(mse_values)
avg_r2 = np.mean(score_values)

print("Average MSE:", avg_mse)
print("Average R2:", avg_r2)

mse for fold 1:  0.02026701900612295
R2 for fold 1:  0.6126658537433746

mse for fold 2:  0.024893793345130047
R2 for fold 2:  0.6086467428530515

mse for fold 3:  0.017155916075585812
R2 for fold 3:  0.7024591548826215

mse for fold 4:  0.016230526419174066
R2 for fold 4:  0.6685735292031314

mse for fold 5:  0.866840126850282
R2 for fold 5:  -17.056490304180016

Average MSE: 0.18907747633925898
Average R2: -2.8928290046995673
