**This module aims to predict the socio-economic status (SES) of individuals based on various features such as GDP per capita, years of education, population share, and categorical information like country and a unique identifier ('wbid').**

**Features and Target**
**Features (X):**

**gdppc: GDP per capita**
**yrseduc: Years of education**
**popshare: Population share**
**wbid: A unique identifier (assumed to represent individuals or groups)**
**country: The country of the individual or group**
**Target (y):SES: Socio-economic status categorized into three classes: Low, Medium, and High**

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import os
import numpy as np
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay, accuracy_score,roc_auc_score
from sklearn.svm import SVC
import joblib
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the dataset
file_path = r'C:\Users\Amasaman\Desktop\Blossom\GLOB.SES.csv'
data = pd.read_csv(file_path, encoding='latin1')


In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2086 entries, 0 to 2085
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   unid      2086 non-null   int64  
 1   wbid      2086 non-null   object 
 2   country   2086 non-null   object 
 3   year      2086 non-null   int64  
 4   SES       2086 non-null   float64
 5   gdppc     2086 non-null   float64
 6   yrseduc   1036 non-null   float64
 7   popshare  2086 non-null   float64
dtypes: float64(4), int64(2), object(2)
memory usage: 130.5+ KB


In [4]:
isnull = data.isnull().sum()
isnull

unid           0
wbid           0
country        0
year           0
SES            0
gdppc          0
yrseduc     1050
popshare       0
dtype: int64

In [5]:
# Preprocessing
# Impute missing values in 'yrseduc' column with the mean
imputer = SimpleImputer(strategy='mean')
data['yrseduc'] = imputer.fit_transform(data[['yrseduc']])

In [6]:
#Checking the data set for corrections made
isnull = data.isnull().sum()
isnull

unid        0
wbid        0
country     0
year        0
SES         0
gdppc       0
yrseduc     0
popshare    0
dtype: int64

In [7]:
# Check for 'wbid' column existence
print("Columns in the dataset:", data.columns)

Columns in the dataset: Index(['unid', 'wbid', 'country', 'year', 'SES', 'gdppc', 'yrseduc',
       'popshare'],
      dtype='object')


In [8]:
# Feature selection
features = ['gdppc', 'yrseduc', 'popshare', 'wbid', 'country']
X = data[features]
y = data['SES']

In [9]:
# Ensure target variable is numeric before categorization
y = pd.to_numeric(y, errors='coerce')

In [10]:
# Drop rows where the target variable is missing
X = X.dropna(subset=['yrseduc'])
y = y[X.index]

In [11]:
# Categorize SES into classes (e.g., low, medium, high)
bins = [0, 33, 66, 99]
labels = ['Low', 'Medium', 'High']
y = pd.cut(y, bins=bins, labels=labels)

In [12]:
# Drop any remaining missing values in target variable
X = X.dropna()
y = y.dropna()

In [13]:
# Ensure the indices of X and y are the same
X = X.loc[y.index]

In [14]:
# Check columns in X before splitting
print("Columns in X before splitting:", X.columns)

Columns in X before splitting: Index(['gdppc', 'yrseduc', 'popshare', 'wbid', 'country'], dtype='object')


In [15]:
# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [16]:
# Check columns in X_train after splitting
print("Columns in X_train after splitting:", X_train.columns)


Columns in X_train after splitting: Index(['gdppc', 'yrseduc', 'popshare', 'wbid', 'country'], dtype='object')


In [17]:
# Preprocessing for numerical data
numeric_features = ['gdppc', 'yrseduc', 'popshare']
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

In [18]:
# Preprocessing for categorical data
categorical_features = ['wbid', 'country']
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

In [19]:
# Combine preprocessing steps
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

In [20]:
# Model building pipeline
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42))
])

In [21]:
# Define the model
model = RandomForestClassifier(random_state=42)


In [22]:
# Model building pipeline
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42))
])

In [23]:
# Hyperparameter tuning (example with grid search)
param_grid = {
    'classifier__n_estimators': [100, 200, 300],
    'classifier__max_depth': [None, 10, 20, 30],
    'classifier__min_samples_split': [2, 5, 10]
}
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, scoring='accuracy')


In [24]:
# Fit the grid search
grid_search.fit(X_train, y_train)

In [25]:
# Best model
best_model = grid_search.best_estimator_

In [26]:
# Model evaluation
y_pred = best_model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.8776978417266187
Classification Report:
               precision    recall  f1-score   support

        High       0.91      0.95      0.93       171
         Low       0.91      0.80      0.85       109
      Medium       0.82      0.85      0.83       137

    accuracy                           0.88       417
   macro avg       0.88      0.87      0.87       417
weighted avg       0.88      0.88      0.88       417



In [27]:
# Interpret results
importances = best_model.named_steps['classifier'].feature_importances_
feature_names = numeric_features + list(best_model.named_steps['preprocessor'].transformers_[1][1]['onehot'].get_feature_names_out(categorical_features))
feature_importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': importances})
print(feature_importance_df.sort_values(by='Importance', ascending=False))

                    Feature  Importance
0                     gdppc    0.229295
1                   yrseduc    0.108786
2                  popshare    0.104419
148                wbid_YEM    0.005794
53                 wbid_GMB    0.005544
..                      ...         ...
249     country_New Zealand    0.000077
279          country_Sweden    0.000072
106                wbid_NLD    0.000069
293  country_United Kingdom    0.000055
159         country_Austria    0.000048

[301 rows x 2 columns]
