In [11]:
import pandas as pd

# Loading our dataset
data = pd.read_csv('global_unemployment_data_clean.csv')

# Group the data by country_name
grouped_data = data.groupby('country_name')

#each country group and print the data for each country
for country, country_data in grouped_data:
    print("Country:", country)
    
    # Filter the data to include only columns from 2014 to 2024
    country_data_filtered = country_data[['2014', '2015', '2016', '2017', '2018', '2019', '2020', '2021', '2022', '2023', '2024']]
    
    # Print the filtered data
    print(country_data_filtered)
    print("\n")  # cleanliness!


Country: Afghanistan
   2014  2015  2016  2017  2018  2019  2020  2021  2022  2023  2024
0  13.3  16.0  18.6  21.1  20.6  20.2  21.2  21.6  30.6  32.2  33.3
1   8.6   9.0   9.5   9.9  11.2  12.6  14.1  14.4  23.8  26.2  28.3
2  10.3  11.6  12.8  14.0  14.7  15.4  16.8  17.1  26.7  29.2  31.0
3   9.2  11.5  13.8  16.0  15.2  14.4  14.5  15.1  16.7  18.5  19.8
4   6.5   6.9   7.3   7.7   7.8   8.0   8.7   9.2  11.4  12.3  13.1
5   7.3   8.4   9.4  10.4  10.1   9.9  10.5  11.0  13.2  14.4  15.3


Country: Albania
    2014  2015  2016  2017  2018  2019  2020  2021  2022  2023  2024
6   32.6  40.3  34.1  27.4  25.8  26.0  29.8  28.7  27.0  25.8  25.2
7   13.6  14.5  12.1  10.9  10.3   9.7  10.7  10.7  10.1   9.7   9.5
8   15.5  17.1  14.5  12.5  11.9  11.3  12.5  12.3  11.5  10.9  10.7
9   43.6  39.6  37.6  34.0  29.9  28.0  31.4  30.4  29.0  27.7  27.3
10  16.3  13.9  13.3  12.0  10.2   9.4  10.8  10.7   9.9   9.5   9.3
11  19.9  17.2  16.1  14.5  12.6  11.6  13.1  12.8  11.8  11.2  11.0



In [16]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Extract features (years 2014-2023) and target (year 2024) columns
X = data[['2014', '2015', '2016', '2017', '2018', '2019', '2020', '2021', '2022', '2023']]
y = data['2024']

# Drop rows with missing values
data = data.dropna()

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train a linear regression model
model = LinearRegression()
model.fit(X_train, y_train)

# Make predictions on the testing set
y_pred = model.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)


Mean Squared Error: 0.11385538386933304


In [23]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error

# Loading our dataset
data = pd.read_csv('global_unemployment_data_clean.csv')

# Extract features (X) and target variable (y)
X = data.drop(columns=['country_name', 'sex', 'age_categories'])
y = data['2014']  # Assuming you want to predict unemployment for 2014

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and fit SimpleImputer to impute missing values with mean
imputer = SimpleImputer(strategy='mean')
X_train_imputed = imputer.fit_transform(X_train)

# Initialize and train a linear regression model
model = LinearRegression()
model.fit(X_train_imputed, y_train)

# Impute missing values in the testing set
X_test_imputed = imputer.transform(X_test)

# Make predictions on the testing set
y_pred = model.predict(X_test_imputed)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)

# Predict unemployment rates for the next ten years
future_years = pd.DataFrame(columns=X.columns, index=[0])  # Use the same columns as in the training data
future_years_imputed = imputer.transform(future_years)
future_predictions = model.predict(future_years_imputed)

# Print the predicted unemployment rates
print("Predictions for the next ten years (2025 to 2034):")
print(future_predictions)


Mean Squared Error: 8.470512825989428e-28
Predictions for the next ten years (2025 to 2034):
[11.56835722]


In [54]:
# Select only the columns containing the unemployment rates
unemployment_columns = ['2014', '2015', '2016', '2017', '2018', '2019', '2020', '2021', '2022', '2023', '2024']

# Calculate the mean unemployment rate for each country
average_unemployment_rate_by_country = data[unemployment_columns].mean(axis=1)

# Add the average unemployment rate as a new column in the DataFrame
data['average_unemployment_rate'] = average_unemployment_rate_by_country

# Display the DataFrame with the average unemployment rate for each country
print(data[['country_name', 'average_unemployment_rate']])


     country_name  average_unemployment_rate
0     Afghanistan                  22.609091
1     Afghanistan                  15.236364
2     Afghanistan                  18.145455
3     Afghanistan                  14.972727
4     Afghanistan                   8.990909
...           ...                        ...
1129     Zimbabwe                   5.981818
1130     Zimbabwe                   7.618182
1131     Zimbabwe                  10.890909
1132     Zimbabwe                   5.890909
1133     Zimbabwe                   7.336364

[1134 rows x 2 columns]


In [43]:

print(data.columns)

Index(['country_name', 'sex', 'age_categories', '2014', '2015', '2016', '2017',
       '2018', '2019', '2020', '2021', '2022', '2023', '2024', 'continent'],
      dtype='object')


In [53]:
print(data.columns)



Index(['country_name', 'sex', 'age_categories', '2014', '2015', '2016', '2017',
       '2018', '2019', '2020', '2021', '2022', '2023', '2024'],
      dtype='object')
