In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler

In [2]:
data = pd.read_csv('Developed_nations.csv')
data.head()

Unnamed: 0,Country,Year,Happiness Score,Log GDP per capita,Social support,Healthy life expectancy at birth,Freedom to make life choices,Generosity,Perceptions of corruption,CPI %,Unemployment Rate,region,Development_Status
0,Argentina,2010,6.441,10.066,0.927,66.3,0.73,-0.129,0.855,0.0,7.71,South America,Developed
1,Argentina,2011,6.776,10.112,0.889,66.42,0.816,-0.178,0.755,0.0,7.18,South America,Developed
2,Argentina,2012,6.468,10.091,0.902,66.54,0.747,-0.151,0.817,0.0,7.22,South America,Developed
3,Argentina,2013,6.582,10.103,0.91,66.66,0.737,-0.134,0.823,0.0,7.1,South America,Developed
4,Argentina,2014,6.671,10.067,0.918,66.78,0.745,-0.168,0.854,0.0,7.27,South America,Developed


In [3]:
# Identify and handle missing values
data = data.replace('Null', np.nan)

In [4]:
data.dtypes


Country                              object
Year                                  int64
Happiness Score                     float64
Log GDP per capita                  float64
Social support                      float64
Healthy life expectancy at birth    float64
Freedom to make life choices        float64
Generosity                          float64
Perceptions of corruption           float64
CPI %                               float64
Unemployment Rate                   float64
region                               object
Development_Status                   object
dtype: object

In [5]:
# Create a dictionary to specify the columns and their new data types
column_data_types = {'Log GDP per capita': float, 'Social support': float, 'Healthy life expectancy at birth': float, 'Freedom to make life choices': float,
                     'Generosity': float, 'Perceptions of corruption': float, 'CPI %': float,'Unemployment Rate': float}

In [6]:
# Use the dictionary to change the data types of multiple columns at once
df = data.astype(column_data_types)


In [7]:
# Define your independent variables (features) and the dependent variable (happiness scores)
X = data[['Log GDP per capita', 'Social support', 'Healthy life expectancy at birth', 'Freedom to make life choices', 'Generosity', 'Perceptions of corruption', 'CPI %', 'Unemployment Rate']]


y = data['Happiness Score']


In [8]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [9]:
# Standardize the features (optional, but can be helpful)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [10]:
# Create and fit the linear regression model
model = LinearRegression()
model.fit(X_train, y_train)

In [11]:
print("Intercept: ", model.intercept_)
print("Coefficients:")
list(zip(X, model.coef_))

Intercept:  6.5638190709046444
Coefficients:


[('Log GDP per capita', 0.09725067105707524),
 ('Social support', 0.22989481811653611),
 ('Healthy life expectancy at birth', 0.22385563339809206),
 ('Freedom to make life choices', 0.09501987409569261),
 ('Generosity', 0.1934337912752156),
 ('Perceptions of corruption', -0.17615328599824617),
 ('CPI %', -0.03369280626368953),
 ('Unemployment Rate', -0.11420103628261372)]

In [12]:
#Prediction of test set
y_pred = model.predict(X_test)
#Predicted values
print("Prediction for test set: {}".format(y_pred))

Prediction for test set: [7.16941936 7.20231142 6.0104075  6.81220057 7.01989435 4.72335709
 7.2042049  6.87679935 7.33034956 6.44702902 5.93964791 5.47827974
 7.38363173 6.31923064 5.74497654 7.2008884  6.31753235 6.85944079
 6.02685087 7.02533014 6.74912845 7.15085444 7.07427363 7.17374456
 6.51756951 6.49212874 6.23823471 6.4958494  6.22128749 7.24435232
 6.66195921 7.46058717 5.38545989 5.60804562 6.19559768 6.10264688
 7.1152091  6.19075679 6.19876941 5.67316556 6.850107   5.78670279
 5.52972624 7.27000205 6.9328598  6.83294449 4.59238834 6.7052849
 7.30001628 6.97831219 7.41129212 6.42386335 7.18189527 7.09714499
 7.28044719 5.88625348 6.09577019 7.00033915 6.28806768 4.99608747
 6.5095767  7.24061478 5.19480894 6.10668477 7.14450269 7.4063683
 6.9404341  6.20166657 6.19972607 6.13305055 5.86311476 6.31525401
 7.10967152 7.04633738 7.0799333  7.33458758 6.19735371 5.87721162
 6.13440892 7.38225183 5.00539045 6.19263658 6.80912245 4.84848111
 6.6006827  7.35222894 6.80411612 5.885

In [13]:
#Actual value and the predicted value
#model_diff = pd.DataFrame({'Actual value': y_test, 'Predicted value': y_pred})
#model_diff.head(30)

In [14]:
# Create a copy of the original data
df_happiness_predicted =data.copy()

# Add a column with the predicted values
df_happiness_predicted['Predicted Happiness Score'] = model.predict(X)

# Display sample data
print(df_happiness_predicted)


           Country  Year  Happiness Score  Log GDP per capita  Social support  \
0        Argentina  2010            6.441              10.066           0.927   
1        Argentina  2011            6.776              10.112           0.889   
2        Argentina  2012            6.468              10.091           0.902   
3        Argentina  2013            6.582              10.103           0.910   
4        Argentina  2014            6.671              10.067           0.918   
..             ...   ...              ...                 ...             ...   
507  United States  2018            6.883              11.024           0.904   
508  United States  2019            6.944              11.043           0.917   
509  United States  2020            7.028              11.005           0.937   
510  United States  2021            6.959              11.061           0.920   
511  United States  2022            6.693              11.079           0.900   

     Healthy life expectanc



In [15]:
# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)


In [16]:
print("Mean Squared Error:", mse)
print("R-squared:", r2)

Mean Squared Error: 0.13565691586692505
R-squared: 0.7638657227464156


In [17]:
# Display the coefficients of the model to identify influential variables
coefficients = pd.DataFrame({'Feature': X.columns, 'Coefficient': model.coef_})
print("Influential variables:\n", coefficients)

Influential variables:
                             Feature  Coefficient
0                Log GDP per capita     0.097251
1                    Social support     0.229895
2  Healthy life expectancy at birth     0.223856
3      Freedom to make life choices     0.095020
4                        Generosity     0.193434
5         Perceptions of corruption    -0.176153
6                             CPI %    -0.033693
7                 Unemployment Rate    -0.114201
