In [3]:
# Import necessary libraries
import pandas as pd
from google.colab import files
import plotnine as p9
import plotly.express as px

# Upload the CSV file from local system
uploaded = files.upload()

# Read the uploaded CSV into a DataFrame
df = pd.read_csv('2019.csv')

# Display the first few rows of the dataset
print("Head of the dataset:")
print(df.head())

# Show dataset information including data types and missing values
print("\nDataset info:")
print(df.info())

# Show basic statistics for numerical columns
print("\nStatistical summary:")
print(df.describe())


Saving 2019.csv to 2019.csv
Head of the dataset:
   Overall rank Country or region  Score  GDP per capita  Social support  \
0             1           Finland  7.769           1.340           1.587   
1             2           Denmark  7.600           1.383           1.573   
2             3            Norway  7.554           1.488           1.582   
3             4           Iceland  7.494           1.380           1.624   
4             5       Netherlands  7.488           1.396           1.522   

   Healthy life expectancy  Freedom to make life choices  Generosity  \
0                    0.986                         0.596       0.153   
1                    0.996                         0.592       0.252   
2                    1.028                         0.603       0.271   
3                    1.026                         0.591       0.354   
4                    0.999                         0.557       0.322   

   Perceptions of corruption  
0                      0.393  

In [4]:
# Remove leading/trailing spaces and convert country names to title case for consistency
df['Country or region'] = df['Country or region'].str.strip().str.title()

# Rename columns to snake_case for easier access in code
df.rename(columns={
    'Overall rank': 'overall_rank',
    'Country or region': 'country',
    'Score': 'score',
    'GDP per capita': 'gdp_per_capita',
    'Social support': 'social_support',
    'Healthy life expectancy': 'healthy_life_expectancy',
    'Freedom to make life choices': 'freedom',
    'Generosity': 'generosity',
    'Perceptions of corruption': 'corruption'
}, inplace=True)

# Check and remove duplicate rows if any
df = df.drop_duplicates()

# Replace zero values in selected numeric columns with NaN (if zero is not a valid value)
cols_to_check = ['gdp_per_capita', 'social_support', 'healthy_life_expectancy', 'freedom', 'generosity', 'corruption']
df[cols_to_check] = df[cols_to_check].replace(0, pd.NA)

# Optional: convert 'country' column to categorical to save memory and improve performance
df['country'] = df['country'].astype('category')

# Display updated info
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 156 entries, 0 to 155
Data columns (total 9 columns):
 #   Column                   Non-Null Count  Dtype   
---  ------                   --------------  -----   
 0   overall_rank             156 non-null    int64   
 1   country                  156 non-null    category
 2   score                    156 non-null    float64 
 3   gdp_per_capita           155 non-null    object  
 4   social_support           155 non-null    object  
 5   healthy_life_expectancy  155 non-null    object  
 6   freedom                  155 non-null    object  
 7   generosity               155 non-null    object  
 8   corruption               155 non-null    object  
dtypes: category(1), float64(1), int64(1), object(6)
memory usage: 15.5+ KB


In [5]:
# Convert object columns back to numeric (float) after replacing 0 with NA
numeric_cols = [
    'gdp_per_capita', 'social_support', 'healthy_life_expectancy',
    'freedom', 'generosity', 'corruption'
]
df[numeric_cols] = df[numeric_cols].apply(pd.to_numeric, errors='coerce')

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 156 entries, 0 to 155
Data columns (total 9 columns):
 #   Column                   Non-Null Count  Dtype   
---  ------                   --------------  -----   
 0   overall_rank             156 non-null    int64   
 1   country                  156 non-null    category
 2   score                    156 non-null    float64 
 3   gdp_per_capita           155 non-null    float64 
 4   social_support           155 non-null    float64 
 5   healthy_life_expectancy  155 non-null    float64 
 6   freedom                  155 non-null    float64 
 7   generosity               155 non-null    float64 
 8   corruption               155 non-null    float64 
dtypes: category(1), float64(7), int64(1)
memory usage: 15.5 KB


In [6]:
# Display the shape of the dataset
print("Shape of the dataset:")
print(df.shape)

# Display column names
print("\nColumn names:")
print(df.columns.tolist())

# Summary statistics for all numeric columns
print("\nDescriptive statistics:")
print(df.describe())

# Check for missing values
print("\nMissing values per column:")
print(df.isna().sum())

# Unique values in 'country'
print("\nNumber of unique countries:")
print(df['country'].nunique())

# Top 10 happiest countries
print("\nTop 10 happiest countries:")
print(df.sort_values(by='score', ascending=False)[['country', 'score']].head(10))

# Bottom 10 countries
print("\nBottom 10 countries:")
print(df.sort_values(by='score', ascending=True)[['country', 'score']].head(10))

# Correlation matrix
print("\nCorrelation matrix:")
print(df.corr(numeric_only=True))

Shape of the dataset:
(156, 9)

Column names:
['overall_rank', 'country', 'score', 'gdp_per_capita', 'social_support', 'healthy_life_expectancy', 'freedom', 'generosity', 'corruption']

Descriptive statistics:
       overall_rank       score  gdp_per_capita  social_support  \
count    156.000000  156.000000      155.000000      155.000000   
mean      78.500000    5.407096        0.910987        1.216613   
std       45.177428    1.113120        0.392925        0.283808   
min        1.000000    2.853000        0.026000        0.378000   
25%       39.750000    4.544500        0.615000        1.057000   
50%       78.500000    5.379500        0.960000        1.274000   
75%      117.250000    6.184500        1.234000        1.453000   
max      156.000000    7.769000        1.684000        1.624000   

       healthy_life_expectancy     freedom  generosity  corruption  
count               155.000000  155.000000  155.000000  155.000000  
mean                  0.729923    0.395103    0.