Clustering attempts

In [None]:
#import packages
import os
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt

In [None]:
#load in data, and merge
#__file__ is not defined inside a notebook kernel, so fall back to the working
#directory there (assumes the kernel runs from the notebook's own folder -- TODO confirm).
#Either way base_dir is the parent of the folder holding this notebook/script.
try:
  source_dir = os.path.dirname(os.path.abspath(__file__))
except NameError:
  source_dir = os.getcwd()
base_dir = os.path.dirname(source_dir)
data_dir = os.path.join(base_dir, "data")

#every dataset is joined on the same (country, year) key
merge_keys = ["country_code", "year"]

emissions = pd.read_csv(os.path.join(data_dir, "co2_emissions.csv"))
print(emissions.head())

methane = pd.read_csv(os.path.join(data_dir, "methane_emissions.csv"))
print(methane.head())

forest = pd.read_csv(os.path.join(data_dir, "net_forest.csv"))
print(forest.head())

surface = pd.read_csv(os.path.join(data_dir, "surface_temperature.csv"))
print(surface.head())

#outer joins keep country/year rows that are missing from some datasets,
#so the merged frame can contain NaN values
merge = (
  emissions
  .merge(methane, on=merge_keys, how="outer")
  .merge(forest, on=merge_keys, how="outer")
  .merge(surface, on=merge_keys, how="outer")
)

Functions for clustering

In [None]:
#this function performs gonzalez (farthest-first) clustering, data points may be numpy vectors or plain sequences
def gonazalesCluster(data, numClusters):
  """Greedy k-center clustering (Gonzalez farthest-first traversal).

  The first point is the first center. Each further center is the point that
  is currently farthest from its assigned center (the first such point on
  ties); points strictly closer to the new center are then reassigned to it.

  Parameters:
    data: non-empty sequence of equal-length numeric vectors (list of numpy
      arrays, list of lists, or a 2-D array). Values must be finite.
    numClusters: number of centers to pick. At least one center is always
      produced, even if numClusters is smaller than 1.

  Returns:
    centroids: list of numpy float vectors, one per center (copies of data points).
    cluster: list of ints, cluster[i] is the center index assigned to data[i].
    maxCenter: k-center cost, the largest point-to-assigned-center distance.
    meanCost: k-means cost, the mean squared point-to-assigned-center distance.

  Raises:
    ValueError: if data is empty, is not a sequence of equal-length vectors,
      or contains NaN/infinite values.
  """
  points = np.asarray(data, dtype=float)
  if points.ndim != 2 or points.shape[0] == 0:
    raise ValueError("data must be a non-empty sequence of equal-length vectors")
  if not np.isfinite(points).all():
    raise ValueError("data must not contain NaN or infinite values")

  #the first point seeds the first center and every point starts in cluster 0
  centerIdx = [0]
  assignment = np.zeros(points.shape[0], dtype=int)

  #nearest[i] is kept equal to the distance from point i to its assigned center,
  #so each round needs only one pass of distance computations
  nearest = np.linalg.norm(points - points[0], axis=1)

  for k in range(1, numClusters):
    #argmax returns the first maximum, matching a strict '>' scan over the points
    farthest = int(np.argmax(nearest))
    centerIdx.append(farthest)

    #only points strictly closer to the new center move to it
    newDist = np.linalg.norm(points - points[farthest], axis=1)
    closer = newDist < nearest
    assignment[closer] = k
    nearest[closer] = newDist[closer]

  centroids = [points[i].copy() for i in centerIdx]
  maxCenter = float(nearest.max())
  meanCost = float(np.mean(nearest ** 2))

  return centroids, assignment.tolist(), maxCenter, meanCost

#this function performs k-means++ style seeding, data points may be numpy vectors or plain sequences
def kPlusPlus(data, numClusters, rng=None):
  """Pick cluster centers by k-means++ style sampling and assign the points.

  The first point is the first center. Each further center is drawn from the
  data with probability proportional to the squared distance between a point
  and its currently assigned center; points strictly closer to the new center
  are then reassigned to it.

  Parameters:
    data: non-empty sequence of equal-length numeric vectors (list of numpy
      arrays, list of lists, or a 2-D array). Values must be finite.
    numClusters: number of centers to pick. At least one center is always
      produced, even if numClusters is smaller than 1.
    rng: optional object with a uniform(low, high) method, for example
      numpy.random.default_rng(seed), used to make the draw reproducible.
      When None, the global numpy.random state is used.

  Returns:
    centroids: list of numpy float vectors, one per center (copies of data points).
    cluster: list of ints, cluster[i] is the center index assigned to data[i].
    maxCenter: k-center cost, the largest point-to-assigned-center distance.
    meanCost: k-means cost, the mean squared point-to-assigned-center distance.

  Raises:
    ValueError: if data is empty, is not a sequence of equal-length vectors,
      or contains NaN/infinite values.
  """
  points = np.asarray(data, dtype=float)
  if points.ndim != 2 or points.shape[0] == 0:
    raise ValueError("data must be a non-empty sequence of equal-length vectors")
  if not np.isfinite(points).all():
    raise ValueError("data must not contain NaN or infinite values")

  numPoints = points.shape[0]
  draw = np.random.uniform if rng is None else rng.uniform

  #the first point seeds the first center and every point starts in cluster 0
  centerIdx = [0]
  assignment = np.zeros(numPoints, dtype=int)

  #nearest[i] is kept equal to the distance from point i to its assigned center
  nearest = np.linalg.norm(points - points[0], axis=1)

  for k in range(1, numClusters):
    #running totals of the squared distances form an unnormalised cumulative distribution
    cumulative = np.cumsum(nearest ** 2)

    #draw a threshold in [0, total) and take the first point whose running total reaches it
    threshold = draw(0, cumulative[-1])
    chosen = int(np.searchsorted(cumulative, threshold, side="left"))
    chosen = min(chosen, numPoints - 1)  #guard against a threshold rounded past the total
    centerIdx.append(chosen)

    #only points strictly closer to the new center move to it
    newDist = np.linalg.norm(points - points[chosen], axis=1)
    closer = newDist < nearest
    assignment[closer] = k
    nearest[closer] = newDist[closer]

  centroids = [points[i].copy() for i in centerIdx]
  maxCenter = float(nearest.max())
  meanCost = float(np.mean(nearest ** 2))

  return centroids, assignment.tolist(), maxCenter, meanCost

#this function performs lloyd's (k-means) iteration, data points may be numpy vectors or plain sequences
def lloyds(data, numClusters, centroids = None):
  """Run Lloyd's algorithm until no point changes cluster.

  Each round assigns every point to its nearest center (the lowest index wins
  ties) and then moves every center to the mean of its assigned points. A
  center that ends up with no points keeps its previous position instead of
  becoming NaN.

  Parameters:
    data: non-empty sequence of equal-length numeric vectors (list of numpy
      arrays, list of lists, or a 2-D array). Values must be finite.
    numClusters: number of clusters; only used when centroids is None, in
      which case the first numClusters points are the starting centers.
    centroids: optional sequence of starting centers. It is copied, so the
      caller's object is never modified.

  Returns:
    centroids: list of numpy float vectors, the final centers.
    cluster: list of ints, cluster[i] is the center index assigned to data[i].
    meanCost: k-means cost, the mean squared point-to-assigned-center distance.

  Raises:
    ValueError: if data is empty, malformed or non-finite, if there are no
      starting centers or more default centers requested than points, or if
      the given centroids are malformed, non-finite or of the wrong dimension.
  """
  points = np.asarray(data, dtype=float)
  if points.ndim != 2 or points.shape[0] == 0:
    raise ValueError("data must be a non-empty sequence of equal-length vectors")
  if not np.isfinite(points).all():
    raise ValueError("data must not contain NaN or infinite values")
  numPoints = points.shape[0]

  #if beginning clusters weren't set, pick the first points in order
  if centroids is None:
    if numClusters < 1 or numClusters > numPoints:
      raise ValueError("numClusters must be between 1 and the number of data points")
    centers = points[:numClusters].copy()
  else:
    #np.array copies, so updating the centers below never touches the caller's list
    centers = np.array(centroids, dtype=float)
    if centers.ndim != 2 or centers.shape[0] == 0 or centers.shape[1] != points.shape[1]:
      raise ValueError("centroids must be a non-empty sequence of vectors matching the data dimension")
    if not np.isfinite(centers).all():
      raise ValueError("centroids must not contain NaN or infinite values")

  def distancesToCenters():
    #(numPoints, numCenters) matrix of point-to-center distances
    return np.stack([np.linalg.norm(points - center, axis=1) for center in centers], axis=1)

  #every point starts mapped to cluster 0
  assignment = np.zeros(numPoints, dtype=int)

  while True:
    #argmin returns the first minimum, so ties go to the lowest center index
    newAssignment = np.argmin(distancesToCenters(), axis=1)
    numChanged = int(np.count_nonzero(newAssignment != assignment))
    assignment = newAssignment

    #move each center to the mean of its points; empty clusters stay where they are
    for j in range(centers.shape[0]):
      members = points[assignment == j]
      if members.shape[0] > 0:
        centers[j] = members.mean(axis=0)

    #centers are updated once more after the last assignment, then we stop
    if numChanged == 0:
      break

  #k-means cost against the final centers
  assigned = distancesToCenters()[np.arange(numPoints), assignment]
  meanCost = float(np.mean(assigned ** 2))

  return [center.copy() for center in centers], assignment.tolist(), meanCost

#this function graphs the clusters, given the cluster mapping and an array of numpy vectors
def graphCluster(data, clusters, title='Scatter Plot of lloyds Clusters with gonazels starting clusters'):
  """Scatter-plot the first two coordinates of each point, coloured by cluster.

  Parameters:
    data: sequence of points; only point[0] (x) and point[1] (y) are plotted.
    clusters: sequence where clusters[i] is the integer cluster index of data[i].
    title: plot title. The default is the title this function always used, so
      existing calls are unchanged; pass another one when plotting clusters
      from a different algorithm.

  The palette has 11 colours. Cluster indices beyond that wrap around, so two
  clusters share a colour instead of raising IndexError.
  """
  clusterColors = ['Red', "Blue", "Green", "Yellow", "Purple", "Orange", "Teal", "Pink", "Black", "Grey", "Brown"]

  xs = [point[0] for point in data]
  ys = [point[1] for point in data]

  #index by position so a too-short clusters sequence still fails loudly
  pointColors = [clusterColors[clusters[i] % len(clusterColors)] for i in range(len(data))]

  plt.scatter(xs, ys, c= pointColors)

  plt.title(title)

  plt.show()

To identify anomalies, we attempt several forms of clustering on this data. We cluster on emissions and deforestation, and validate the resulting clusters against the surface temperature values.

In [None]:
#transform the datasets into arrays of numpy arrays

#run and graph clusters

#validate clusters with surface values. 