# Azure assignment

The datasets that were used in this notebook can be found below:
    
Main dataset: https://www.kaggle.com/datasets/unsdsn/world-happiness<br>
Secondary dataset: https://www.kaggle.com/datasets/theworldbank/education-statistics

The main dataset contains the world's happiness score per country, which is determined based on various factors. As a secondary table, I chose to pick one column from the education statistics dataset: 'Income Group'. I wanted to see whether the education level leading to the income group has any significant relationship with the world's happiness score per country.

In [None]:
try:
    import numpy as np
    print('NumPy already installed, only imported')
except:
    !pip install numpy
    import numpy as np
    print('NumPy was not installed, installed and imported')
    
try:
    import pandas as pd
    print('pandas already installed, only imported')
except:
    !pip install pandas
    import pandas as pd
    print('pandas was not installed, installed and imported')

try:
    import sklearn
    print('sklearn already installed, only imported')
except:
    !pip install scikit-learn
    import sklearn
    print('sklearn was not installed, installed and imported')

try: 
    import time    
    from sklearn.pipeline import Pipeline
    from sklearn.preprocessing import StandardScaler
    from sklearn.impute import SimpleImputer
    from sklearn.preprocessing import OneHotEncoder
    from sklearn.compose import ColumnTransformer
    from sklearn.ensemble import RandomForestRegressor
    from sklearn.model_selection import train_test_split
    from sklearn.feature_selection import mutual_info_regression
    from sklearn.metrics import mean_squared_error
    from sklearn.metrics import mean_absolute_error
    from sklearn.svm import SVR
    print('All of the remaning libraries have been imported')
except: 
    print("Not all libraries have been imported correctly, please check again")

In [91]:
def execute_pipeline(happiness_DF, education_DF, new_data=None, random_state=101):
    """Merge the two datasets, rank the features three ways and evaluate an SVR model.

    The function prints its findings (missing-value counts, three top-3 feature
    rankings, R-squared, MSE and MAE) and returns nothing.

    Parameters
    ----------
    happiness_DF : pandas.DataFrame
        Happiness data. The columns 'Country or region', 'Overall rank', 'Score',
        'GDP per capita', 'Social support', 'Healthy life expectancy' and
        'Freedom to make life choices' are referenced by name.
    education_DF : pandas.DataFrame
        Education data. Only 'Table Name' (used as the country key) and
        'Income Group' are used.
    new_data : pandas.DataFrame, optional
        Extra rows appended to the cleaned, merged frame before ranking and
        training. Presumably expected to carry the same columns as the merged
        frame ('Score', the feature columns and 'Income_Group') - it is appended
        after the missing-value cleanup and is not validated here.
    random_state : int or None, default 101
        Seed shared by the mutual-information estimate, the random forest and the
        train/test split so the printed results are repeatable between runs.
        The default keeps the split seed used previously.
    """
    # Rename the columns in both datasets so they share a join key ('Country')
    # and a space-free name for the new feature.
    happiness_DF = happiness_DF.rename(columns={'Country or region': 'Country'})
    education_DF = education_DF.rename(columns={'Table Name': 'Country', 'Income Group': 'Income_Group'})

    # Left-merge so every happiness row is kept; only Income_Group is brought over.
    merged_DF = pd.merge(happiness_DF, education_DF[['Country', 'Income_Group']], on='Country', how='left')

    # The rank and the country name are not used as model inputs.
    merged_DF = merged_DF.drop(['Overall rank', 'Country'], axis=1)

    # Report missing values, then drop every row that contains one.
    print(f"There are {merged_DF.isna().sum().sum()} missing values in your dataset")
    merged_DF = merged_DF.dropna()
    print(f"There are {merged_DF.isna().sum().sum()} missing left in the dataset")

    # Optionally append the caller-supplied rows to the cleaned frame.
    if new_data is not None:
        merged_DF = pd.concat([merged_DF, new_data], ignore_index=True)

    # Temporary one-hot encoded copy: the ranking methods below need numeric inputs.
    encoded_DF = pd.get_dummies(merged_DF, columns=['Income_Group'])
    X_ranking = encoded_DF.drop('Score', axis=1)
    y = merged_DF['Score']

    print("\nTop 3 Features using mutual information scores")
    # Seeded: the estimator is randomised, so unseeded runs can reorder the top 3.
    mi_scores = pd.Series(
        mutual_info_regression(X_ranking, y, random_state=random_state),
        index=X_ranking.columns,
    )
    print(mi_scores.sort_values(ascending=False).head(3))

    print("\nTop 3 Features using feature importance")
    # Seeded for the same reason: forest importances vary from fit to fit otherwise.
    model = RandomForestRegressor(random_state=random_state)
    model.fit(X_ranking, y)
    sorted_indices = np.argsort(model.feature_importances_)[::-1]
    print(X_ranking.columns[sorted_indices][:3])

    print("\nTop 3 Features using correlation matrix")
    correlation_values = merged_DF.corr(numeric_only=True)['Score'].abs()
    sorted_correlation_values = correlation_values.sort_values(ascending=False)
    # Position 0 is 'Score' itself (correlation 1 with itself), so skip it.
    print(sorted_correlation_values.index[1:4])

    # Numeric features: median imputation followed by standardisation.
    numeric_features = ["GDP per capita", "Social support", "Healthy life expectancy", "Freedom to make life choices"]
    numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())])

    # Categorical features: most-frequent imputation followed by one-hot encoding.
    # Categories unseen during fitting are ignored instead of raising.
    categorical_features = ["Income_Group"]
    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('encoder', OneHotEncoder(handle_unknown='ignore'))])

    # Route each group of columns to its transformer.
    preprocessor = ColumnTransformer(
        transformers=[
            ('numerical', numeric_transformer, numeric_features),
            ('categorical', categorical_transformer, categorical_features)])

    # Pre-processing followed by a Support Vector Regressor (the step keeps its
    # original name 'classifier').
    pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('classifier', SVR())])

    # Split the data into X (independent) and y (dependent); the column lists are
    # reused so the model inputs always match what the preprocessor expects.
    X = merged_DF[numeric_features + categorical_features]
    y = merged_DF['Score']

    # Hold out 20% of the rows for evaluation.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=random_state)

    # Train the model on the training split.
    pipeline.fit(X_train, y_train)

    # Evaluate the model on the held-out split.
    print("\nEvalution of the score/accuracy:")
    score = pipeline.score(X_test, y_test)
    print(f'The R-Squared score is: {score}')
    # NOTE: this is R-squared expressed as a percentage, not a classification
    # accuracy; the label is kept so the printed output stays unchanged.
    print("Accuracy: %.2f%%" % (score * 100))

    y_pred = pipeline.predict(X_test)

    mse = mean_squared_error(y_test, y_pred)
    print("Mean Squared Error: ", mse)

    mae = mean_absolute_error(y_test, y_pred)
    print("Mean Absolute Error:", mae)

In [92]:
# Time the whole run (loading both CSV files plus the pipeline itself).
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# durations; time.time() is wall-clock time and can jump if the system clock is adjusted.
start_time = time.perf_counter()

# File paths are relative to the notebook's working directory.
happiness_DF = pd.read_csv('world-happiness-2019.csv')
education_DF = pd.read_csv('education-statistics.csv')

execute_pipeline(happiness_DF, education_DF)

end_time = time.perf_counter()
execution_time = end_time - start_time
print("\nExecution time of the pipeline is:", execution_time, "seconds")

There are 20 missing values in your dataset
There are 0 missing left in the dataset

Top 3 Features using mutual information scores
Healthy life expectancy    0.609765
Social support             0.602643
GDP per capita             0.597983
dtype: float64

Top 3 Features using feature importance
Index(['Social support', 'Healthy life expectancy', 'GDP per capita'], dtype='object')

Top 3 Features using correlation matrix
Index(['GDP per capita', 'Healthy life expectancy', 'Social support'], dtype='object')

Evalution of the score/accuracy:
The R-Squared score is: 0.7859008720392056
Accuracy: 78.59%
Mean Squared Error:  0.19671829859073958
Mean Absolute Error: 0.3306771564431313

Execution time of the pipeline is: 0.24153804779052734 seconds


In [93]:
# Three hand-written observations to feed into the pipeline as additional rows.
# Built column by column; position i in every list belongs to observation i.
new_data = pd.DataFrame({
    'Score': [7.142, 6.322, 5.331],
    'GDP per capita': [1.125, 1.315, 1.032],
    'Social support': [1.421, 1.350, 1.231],
    'Healthy life expectancy': [0.898, 0.923, 1.002],
    'Freedom to make life choices': [0.463, 0.341, 0.521],
    'Generosity': [0.319, 0.521, 0.299],
    'Perceptions of corruption': [0.512, 0.456, 0.441],
    'Income_Group': ['High income: nonOECD', 'Upper middle income', 'Low income'],
})

In [94]:
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# durations; time.time() is wall-clock time and can jump if the system clock is adjusted.
start_time = time.perf_counter()

# Execute the pipeline with the addition of the new data.
execute_pipeline(happiness_DF, education_DF, new_data)

end_time = time.perf_counter()
execution_time = end_time - start_time
print("\nExecution time of the pipeline is:", execution_time, "seconds")

# Set the new data variable back to None so that it does not get re-used when the
# method is executed again.
# NOTE(review): this makes the cell non-idempotent - re-running it without first
# re-running the cell that builds `new_data` silently evaluates without the extra rows.
new_data = None

There are 20 missing values in your dataset
There are 0 missing left in the dataset

Top 3 Features using mutual information scores
Social support             0.596968
Healthy life expectancy    0.594566
GDP per capita             0.588017
dtype: float64

Top 3 Features using feature importance
Index(['Social support', 'GDP per capita', 'Healthy life expectancy'], dtype='object')

Top 3 Features using correlation matrix
Index(['GDP per capita', 'Healthy life expectancy', 'Social support'], dtype='object')

Evalution of the score/accuracy:
The R-Squared score is: 0.6064207536243638
Accuracy: 60.64%
Mean Squared Error:  0.35449261029003865
Mean Absolute Error: 0.47669080563396715

Execution time of the pipeline is: 0.23754668235778809 seconds
