In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.metrics import r2_score
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.impute import KNNImputer
from sklearn.feature_selection import f_regression
from scipy import stats

# Load the feature table and give the life-expectancy column the short name
# 'target', which the rest of the notebook uses as the regression label.
data = pd.read_csv('Final Features.csv').rename(
    columns={'Life Expectancy at Birth, both sexes (years)': 'target'}
)

# Preprocessing: replace outliers with NaN so the imputation step can fill
# them in. Skewed columns (|skew| > 1) use the 1.5 * IQR fences; the remaining
# columns use the mean +/- 3 standard deviations rule.
for feature in data.columns:
    # The target is deliberately left untouched: masking it would make the
    # imputer invent labels that the model is then trained and scored on.
    # Non-numeric columns are skipped because skew/quantile are undefined there.
    if feature == 'target' or not pd.api.types.is_numeric_dtype(data[feature]):
        continue

    if abs(data[feature].skew()) > 1:
        # Use IQR to manage non-normal distribution
        Q1 = data[feature].quantile(0.25)
        Q3 = data[feature].quantile(0.75)
        IQR = Q3 - Q1
        lower_limit = Q1 - 1.5 * IQR
        upper_limit = Q3 + 1.5 * IQR
    else:
        # Use standard deviation to manage normal distribution
        mean = data[feature].mean()
        std = data[feature].std()
        lower_limit = mean - 3 * std
        upper_limit = mean + 3 * std

    # Single masking step shared by both rules: values outside the limits
    # become NaN, everything else is kept as-is.
    is_outlier = (data[feature] < lower_limit) | (data[feature] > upper_limit)
    data[feature] = np.where(is_outlier, np.nan, data[feature])
    
# Fill the missing entries (including the outliers masked as NaN earlier) with
# a 5-nearest-neighbour imputer, then wrap the resulting array back into a
# DataFrame that carries the original column names.
imputer = KNNImputer(n_neighbors=5)
imputed_values = imputer.fit_transform(data)
data = pd.DataFrame(imputed_values, columns=data.columns)

# Remove features with high correlation
corr_matrix = data.corr()
# Only feature-vs-feature correlations decide what gets dropped: the target
# must never be a drop candidate, and a feature must not be discarded merely
# because it correlates strongly with the target.
feature_corr = corr_matrix.drop(index='target', columns='target')
# Keep the strict upper triangle so every pair is inspected once and the
# diagonal is ignored. The `np.bool` alias was removed in NumPy 1.24, so the
# builtin `bool` dtype is used for the mask.
upper_tri = feature_corr.where(np.triu(np.ones(feature_corr.shape, dtype=bool), k=1))
# A column is dropped when its absolute correlation with any column that
# precedes it exceeds 0.90, so the first member of each correlated group stays.
to_drop = [column for column in upper_tri.columns if (upper_tri[column].abs() > 0.90).any()]
# Reassign instead of mutating in place so re-running the cell is predictable.
data = data.drop(columns=to_drop)

# Separate the regression label from the predictors.
y = data['target']
X = data.drop(columns='target')

# Baseline: robust scaling followed by ordinary least squares, fitted on the
# full feature matrix. `clf` stays bound to the regressor inside the pipeline.
clf = LinearRegression()
pipeline = make_pipeline(RobustScaler(), clf).fit(X, y)

# Split the data into training and testing sets
# The hold-out split happens before the search so that hyperparameter
# selection never sees the rows used for the final test score.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# K-fold cross validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Define the hyperparameters to tune
# `normalize` is intentionally absent: it was deprecated in scikit-learn 1.0
# and removed from LinearRegression in 1.2, so listing it makes the search fail
# with an invalid-parameter error on current releases.
param_grid = {
    'fit_intercept': [True, False]
}

# Find the best hyperparameters using GridSearchCV (training rows only)
grid_search = GridSearchCV(LinearRegression(), param_grid, cv=kf, scoring='r2')
grid_search.fit(X_train, y_train)

# Per-fold R-squared of the winning configuration, read from cv_results_.
r2_scores = [
    grid_search.cv_results_[f'split{fold}_test_score'][grid_search.best_index_]
    for fold in range(kf.get_n_splits())
]

# Print the best hyperparameters and R-squared score
print("Best parameters: ", grid_search.best_params_)
print("Best R-squared score: ", grid_search.best_score_)

# Train the model with the best hyperparameters
# With the default refit=True, GridSearchCV has already refitted the best
# configuration on all of X_train, so that estimator is reused directly.
best_model = grid_search.best_estimator_

# Evaluate the model on the testing set
y_pred = best_model.predict(X_test)
r2 = r2_score(y_test, y_pred)
print("R-squared on test set: ", r2)

In [None]:
# Load the new data and preprocess it
new_data = pd.read_csv('2.csv')
ls = X_train.columns.values.tolist()

# Fail early, naming the offending columns, instead of letting a silently
# narrowed frame break imputation or prediction further down.
missing_features = [column for column in ls if column not in new_data.columns]
if missing_features:
    raise ValueError(f"'2.csv' is missing model features: {missing_features}")
# Select the model features in training order; copy so the assignments below
# never write into a view of the raw frame.
new_data = new_data[ls].copy()

# NOTE(review): the outlier limits below are recomputed from this new batch,
# not taken from the training data -- confirm that this is intended.
for feature in new_data.columns:
    if pd.api.types.is_numeric_dtype(new_data[feature]):
        if abs(new_data[feature].skew()) > 1:
            # Skewed column: 1.5 * IQR fences
            Q1 = new_data[feature].quantile(0.25)
            Q3 = new_data[feature].quantile(0.75)
            IQR = Q3 - Q1
            lower_limit = Q1 - 1.5 * IQR
            upper_limit = Q3 + 1.5 * IQR
        else:
            # Otherwise: mean +/- 3 standard deviations
            mean = new_data[feature].mean()
            std = new_data[feature].std()
            lower_limit = mean - 3 * std
            upper_limit = mean + 3 * std
        is_outlier = (new_data[feature] < lower_limit) | (new_data[feature] > upper_limit)
        new_data[feature] = np.where(is_outlier, np.nan, new_data[feature])
    else:
        # NOTE(review): factorize assigns integer codes from this file alone
        # (missing values become -1); verify they match the training encoding.
        new_data[feature] = pd.factorize(new_data[feature])[0]

# Handle missing data using KNN imputation
# The fitted imputer only accepts the exact columns it saw during fit, which
# may include columns that are not model features. Columns absent here are
# added as all-NaN, imputed along with the real gaps, and then discarded.
imputer_columns = list(getattr(imputer, 'feature_names_in_', ls))
imputer_input = new_data.reindex(columns=imputer_columns)
imputed_new = pd.DataFrame(imputer.transform(imputer_input), columns=imputer_columns)
new_data = imputed_new[ls]

In [None]:
# Score the preprocessed new observations with the fitted regression model.
y_new_pred = best_model.predict(X=new_data)