# Life Expectancy Determinator by Countries Worldwide

In [None]:
# Initial imports
import pandas as pd
import hvplot.pandas
from path import Path
import plotly.express as px
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

# Preprocessing the Data for PCA
* Here we will need to load, clean up, and scale the dataset
* Current cleanup steps we can determine are:
    * Remove features that do not apply to all countries
    * Focus on a subset of more recent years (perhaps 2010 onward), as data is more prevalent in those years
    * Remove features that do not have robust data
    * Determine if we should rationalize certain features where data does not exist
    * Group data by years then take the mean so that data is one value per feature

In [None]:
# Read the worldbank_data.csv dataset into a DataFrame and preview its first rows.
# NOTE(review): the earlier note on this cell mentioned using a database to
# process and store the data; this cell reads the CSV file directly — confirm
# the intended data source.
file_path = "Resources/worldbank_data.csv"
worldbank_df = pd.read_csv(filepath_or_buffer=file_path)
worldbank_df.head(n=5)

In [None]:
# Build the feature matrix `x` and standardize it with StandardScaler().
# `x` is derived from the numeric columns of worldbank_df; columns with no
# values at all are dropped because they carry no information.
# NOTE(review): filling the remaining gaps with the column mean is a
# placeholder so that no rows are dropped (row order stays aligned with
# worldbank_df) and PCA does not receive NaN. Replace it with the cleanup
# decisions from the preprocessing checklist once they are made, and confirm
# that every numeric column (e.g. a year column, if present) belongs in `x`.
x = worldbank_df.select_dtypes(include="number").dropna(axis="columns", how="all")
x = x.fillna(x.mean())

# Fit the scaler on the features, then apply it to the same data.
scaler = StandardScaler().fit(x)
x_scaled = scaler.transform(x)
x_scaled

# Reducing Data Dimensions Using PCA

In [None]:
# Using PCA to reduce dimension to three principal components.
# random_state is pinned (matching the K-Means cells) so that the projection
# is reproducible between runs if PCA selects a randomized SVD solver.
pca = PCA(n_components=3, random_state=0)
x_pca = pca.fit_transform(x_scaled)

In [None]:
# Create a DataFrame with one column per principal component (PC1, PC2, ...).
# Column names are derived from the width of x_pca so this cell keeps working
# when the number of components is changed while testing which count fits best
# (it could be more than 3).
pc_columns = [f"PC{i}" for i in range(1, x_pca.shape[1] + 1)]
pcs_df = pd.DataFrame(data=x_pca, columns=pc_columns)
pcs_df.head()

### Clustering Using K-Means

#### Finding the Best Value for `k` Using the Elbow Curve

In [None]:
# Build an elbow curve to help choose the number of clusters.
# Candidate cluster counts to try
k = list(range(1, 11))

# Fit one K-Means model per candidate count and record its inertia
inertia = [
    KMeans(n_clusters=cluster_count, random_state=0).fit(pcs_df).inertia_
    for cluster_count in k
]

# Plot inertia against the number of clusters
elbow_data = {"k": k, "inertia": inertia}
df_elbow = pd.DataFrame(elbow_data)
df_elbow.hvplot.line(x="k", y="inertia", xticks=k, title="Elbow Curve")


#### Running K-Means with `k=?`

In [None]:
# Number of clusters for the final K-Means model.
# NOTE(review): 4 is a placeholder so that the notebook parses and runs;
# replace it with the value read off the elbow curve before relying on
# the resulting clusters.
n_clusters = 4

# Initialize the K-Means model with a fixed seed for reproducible clusters
model = KMeans(n_clusters=n_clusters, random_state=0)

# Fit the model and predict the cluster of every row in one step
predictions = model.fit_predict(pcs_df)
predictions

In [None]:
# Combine worldbank_df and pcs_df side by side, matching rows on their indexes.
merged_df = worldbank_df.merge(pcs_df, left_index=True, right_index=True)
merged_df

In [None]:
# Add a new column, "class", to the merged_df DataFrame that holds the
# cluster label assigned to each row by the fitted K-Means model.
merged_df["class"] = model.labels_

# Print the shape of merged_df and preview its first ten rows
print(merged_df.shape)
merged_df.head(10)

### Visualizing Results

#### 3D-Scatter with Clusters

In [None]:
# Plot the three principal components in 3D, coloring each point by its cluster
fig = px.scatter_3d(
    merged_df,
    x="PC1",
    y="PC2",
    z="PC3",
    color="class",
    hover_name="x",
    hover_data=["y"],
)
fig.show()

In [None]:
# Take a copy of the "x" and "y" columns and min-max scale them for the 2D scatter plot
X_cluster = merged_df.loc[:, ["x", "y"]].copy()
X_cluster_scaled = MinMaxScaler().fit_transform(X_cluster)
X_cluster_scaled

In [None]:
# Create a new DataFrame that holds the scaled data.
# It reuses the index of X_cluster (the frame the scaled array was computed
# from), so every scaled row keeps the label of the row it came from.
plot_df = pd.DataFrame(X_cluster_scaled, columns=["x", "y"], index=X_cluster.index)
plot_df.head()

In [None]:
# Create a hvplot.scatter plot using x="x" and y="y", grouped by cluster.
# plot_df only holds the scaled "x" and "y" columns, so the cluster labels are
# joined in from merged_df["class"] before plotting.
# NOTE(review): assumes plot_df and merged_df share the same index — confirm.
# TODO(review): once a label column to show on hover is chosen, join it in the
# same way and pass it via hover_cols=[...].
plot_df.join(merged_df["class"]).hvplot.scatter(
    x="x",
    y="y",
    by="class",
)
