In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import cross_val_score


# Loading the Dataset

In [2]:
# Read the dataset from the working directory and preview its first five rows.
df = pd.read_csv(filepath_or_buffer='dataset.csv')
print(df.head(n=5))

     Country  Year  Hospital_density           gdp  Communicable_disease  \
0  Australia  2000             6.648  4.158450e+11                  35.6   
1  Australia  2001             6.656  3.793580e+11                  53.3   
2  Australia  2002             6.586  3.955730e+11                  32.2   
3  Australia  2003             6.521  4.674980e+11                  29.3   
4  Australia  2004             6.477  6.143260e+11                  46.4   

   Hospital_Expenditure  Total_Health_Expenditure  Injury Count  \
0                 2.987                     7.594        138.30   
1                 3.024                     7.678        141.76   
2                 3.123                     7.876        142.60   
3                 3.157                     7.881        143.47   
4                 3.250                     8.087        144.10   

   life_expectancy  Mortality  Population  
0             79.2      676.8  19028802.0  
1             79.6      667.5  19274701.0  
2       

# Checking for missing values

In [3]:
# Count the missing entries in every column (isnull is pandas' alias of isna).
df.isnull().sum()

Country                     0
Year                        0
Hospital_density            0
gdp                         0
Communicable_disease        0
Hospital_Expenditure        0
Total_Health_Expenditure    0
Injury Count                0
life_expectancy             0
Mortality                   0
Population                  0
dtype: int64

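#### Checking the correlation of each feature with the target variable. Note that this measures each feature's linear relationship with the target; it does not show whether the features are independent of one another (that would require the feature-to-feature correlation matrix).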

In [4]:
# Candidate predictor columns; Country and the target (Hospital_density) are left out.
features = df.loc[:, [
    'Year',
    'gdp',
    'Communicable_disease',
    'Hospital_Expenditure',
    'Total_Health_Expenditure',
    'Injury Count',
    'life_expectancy',
    'Mortality',
    'Population',
]]
target_variable = df.loc[:, 'Hospital_density']

# Correlation of each feature column with the target (pandas' default method is Pearson).
cor_of_var = features.corrwith(other=target_variable)

In [5]:
# Show the per-feature correlations with Hospital_density; the bare expression
# is the cell's last statement, so the notebook renders it as the cell output.
cor_of_var

Year                       -0.042157
gdp                        -0.021798
Communicable_disease        0.089968
Hospital_Expenditure        0.500842
Total_Health_Expenditure   -0.069142
Injury Count               -0.081538
life_expectancy            -0.167741
Mortality                  -0.202806
Population                  0.074867
dtype: float64

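As seen above, none of the features is strongly correlated with the target variable: the largest correlation is Hospital_Expenditure at about 0.50 (moderate), and every other feature has an absolute correlation below about 0.21.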

# Separating the features and target variable

In [6]:
# Design matrix: the nine predictor columns used by the regression.
X = df[
    [
        'Year', 'gdp', 'Communicable_disease',
        'Hospital_Expenditure', 'Total_Health_Expenditure', 'Injury Count',
        'life_expectancy', 'Mortality', 'Population',
    ]
]
# Response vector: the hospital density column.
y = df.loc[:, 'Hospital_density']

# Splitting the Data into Training and Testing Sets

In [7]:
# Hold out 20% of the rows for testing; the fixed random_state makes the
# shuffled split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


# Training the model

In [8]:
# Linear regression estimator, fitted on the training split only so the
# test split stays unseen until evaluation.
model = LinearRegression()
model.fit(X=X_train, y=y_train)


# Making Predictions

In [9]:
# Predict hospital density for the held-out test rows.
y_pred = model.predict(X=X_test)

# Evaluating the model

In [10]:
# Score the test-set predictions against the true values:
# r2  - coefficient of determination
# mse - mean squared error
r2 = r2_score(y_true=y_test, y_pred=y_pred)
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)

In [11]:
# Render the hold-out R-squared as the cell output.
f"R-squared: {r2}"

'R-squared: 0.6134290803082998'

In [12]:
# Render the hold-out mean squared error as the cell output.
f'Mean Squared Error: {mse}'

'Mean Squared Error: 5.41548821223706'

##### Estimating the MSE with 5-fold cross-validation

In [13]:
# Five-fold cross-validation over the full dataset. scikit-learn reports MSE
# as a negated score ('neg_mean_squared_error') so that larger is always better.
# NOTE(review): with an integer `cv` and a regressor, scikit-learn uses KFold
# without shuffling, whereas the hold-out split above is shuffled. If the rows
# are ordered by country (the preview suggests so - TODO confirm), each fold
# holds out contiguous rows, so this figure is not directly comparable to the
# hold-out MSE.
cv_scores = cross_val_score(
    estimator=model,
    X=X,
    y=y,
    scoring='neg_mean_squared_error',
    cv=5,
)

# Flip the sign of the mean fold score to recover a positive average MSE.
average_mse = -cv_scores.mean()

print(f'Cross-Validated MSE: {average_mse}')

Cross-Validated MSE: 17.855392150029378


#### Making the hospital density prediction for a particular year

In [14]:
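# Template (kept commented out): to predict the hospital density for the year 2025,
# replace each `your_value` below with the projected value for that year, then uncomment.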
# future_data = pd.DataFrame({
#     'Year': [2025],
#     'gdp': [your_value],
#     'Communicable_disease': [your_value],
#     'Hospital_Expenditure': [your_value],
#     'Total_Health_Expenditure': [your_value],
#     'Injury Count': [your_value],
#     'life_expectancy': [your_value],
#     'Mortality': [your_value],
#     'Population': [your_value]
# })

# future_prediction = model.predict(future_data)
# f'Predicted Hospital Density for 2025: {future_prediction[0]}'