
# New Combined GLSS Data Set

We now have a full data set that combines Income, Lighting and Deforestation Data, linked by latitude and longitude information. 

In [1]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns 
import sklearn
%matplotlib inline

from sklearn.decomposition import PCA

In [2]:
# Combined GLSS data set (income, lighting, deforestation). The path is
# relative, so it is resolved against the notebook's working directory.
maindb = "glss4_5_6_alignedfclmal_nl.csv"

# Parse the comma-separated file in one pass (low_memory=False) into df2 and
# preview the first rows.
df2 = pd.read_csv(maindb, sep=',', low_memory=False)
df2.head()


FileNotFoundError: File b'C:\\dev\\mining\\maindataset\\glss4_5_6_alignedfclmal_nl.csv' does not exist

In [None]:
df2.shape

In [None]:
# Pairwise correlations between the numeric (and boolean) columns only. Any
# text columns are excluded explicitly: older pandas dropped them silently,
# while recent versions raise on non-numeric data instead.
corrdf = df2.select_dtypes(include=["number", "bool"]).corr()

### Create a Correlation Heatmap

In [None]:
# Heatmap of the full correlation matrix, labelled with the column names

corr = corrdf
ax = sns.heatmap(corr,
                 xticklabels=corr.columns.values,
                 yticklabels=corr.columns.values)
# seaborn has no module-level title(); set the title on the Axes that
# heatmap() returns instead.
ax.set_title('Heatmap of Correlation Matrix')
# Last expression of the cell: displays the correlation table itself
corr

In [None]:
plt.show()

## Split Data
Split the data column-wise to better visualize the uncorrelated features, taking approximately the first 30 % of the features.

In [None]:
# First 190 columns of df2, selected by position and stored as an independent copy
dfA = df2.iloc[:, :190].copy()

In [None]:
# Correlation matrix of the reduced column set. Only numeric and boolean
# columns are correlated; any text columns are excluded explicitly because
# recent pandas versions raise on them instead of dropping them silently.
corr2 = dfA.select_dtypes(include=["number", "bool"]).corr()
corr = corr2
ax = sns.heatmap(corr,
                 xticklabels=corr.columns.values,
                 yticklabels=corr.columns.values)
# seaborn has no module-level title(); set the title on the Axes that
# heatmap() returns instead.
ax.set_title('Heatmap of Correlation Matrix')
# Last expression of the cell: displays the correlation table itself
corr

In [None]:
dfA.columns

In [None]:
# Check to see if there are any missing values in our data set
dfA.isnull().any()

In [None]:
dfA.describe()

In [None]:
dfA.dtypes

In [None]:
dfA.to_csv('200FeatureRedMainData.csv')

In [None]:
dfA.shape

### Dropping Geodata

We are dropping all geographical data so that it is not considered in the unsupervised clustering that we are going to perform later on the data. The clustering should be based on features other than location. 

In [None]:
# Columns this notebook treats as geographical data; they are removed so the
# later clustering cannot pick up on location.
geo_columns = [
    'hhid', 'region', 'clust', 'nh', 'pid', 'x', 'y',
    'eanum', 'reg_code', 'dist_name', 'reg_name',
]
dfB = dfA.drop(geo_columns, axis='columns')

In [None]:
dfB.shape

### PCA

We want to further reduce these 179 columns with PCA so we have a smaller number of features to work with. 

For this, we will use a version of the dataset above in which the categorical columns have already been mapped to numerical ones, but which still has all 700 columns.

In [None]:
# Label-free variant of the data set, in which the categorical columns have
# already been mapped to numbers. Note that this rebinds `maindb`.
maindb = "glss4_5_6_alignedfclmal_nl_nolabels.csv"

# Parse the comma-separated file in one pass (low_memory=False) into dfnstr
# and preview the first rows.
dfnstr = pd.read_csv(maindb, sep=',', low_memory=False)
dfnstr.head()

In [None]:
# Remove the columns this notebook treats as geographical data so that PCA
# and clustering work on the remaining features only.
geo_columns = [
    'hhid', 'region', 'clust', 'nh', 'pid', 'x', 'y',
    'eanum', 'reg_code', 'dist_name', 'reg_name',
]
dfnstrB = dfnstr.drop(geo_columns, axis='columns')

In [None]:
# Replace missing values with the sentinel -1 so that the PCA fitted on
# dfnstrC receives a matrix without NaNs.
# fillna() returns a new frame: dfnstrC is a real copy rather than a second
# name for dfnstrB, so dfnstrB is left unmodified. Unlike np.isnan(), it also
# works when a column is not numeric.
dfnstrC = dfnstrB.fillna(-1)
dfnstrC.head()

In [None]:
pca = PCA()

In [None]:
pca.fit(dfnstrC)

In [None]:
print(pca.explained_variance_)

In [None]:
# Count the principal components whose explained variance exceeds a cutoff,
# as an estimate of how many components carry relevant information.
# The cutoff of 10 is a judgement call and is subject to variation.
(pca.explained_variance_ > 10).sum()

In [None]:
# Keep only the 107 leading principal components, refit on the same frame and
# project it onto those components.
pca.set_params(n_components=107)
X_reduced = pca.fit_transform(dfnstrC)
X_reduced.shape

In [None]:
# Map the reduced representation back into the original feature space
# (689 columns according to the notebook's own count).
X_redinverse = pca.inverse_transform(X_reduced)
# A bare expression in the middle of a cell is not displayed, so the shape has
# to be printed explicitly to be visible.
print(X_redinverse.shape)
print(X_redinverse)

## K-Means Clustering

### The following code is only pasted as a placeholder and has not been tested yet.

In [None]:
#from time import time

from sklearn import metrics
from sklearn.cluster import KMeans
from sklearn.datasets import load_digits
from sklearn.preprocessing import scale

# Seed NumPy's global random generator before any clustering is run
np.random.seed(42)

# Number of clusters. The digits example derived it from ground-truth labels;
# here it is set by hand (the name is a leftover from that example).
n_digits = 2

# Subsample size used by the silhouette score in the disabled benchmark report
sample_size = 300

# Report the dimensions of the PCA-reduced matrix
n_samples, n_features = np.shape(X_reduced)
print("n_samples %d, \t n_features %d" % (n_samples, n_features))









def bench_k_means(estimator, name, data):
    """Fit *estimator* on *data* and return the time the fit took.

    Parameters
    ----------
    estimator : object
        Anything exposing a ``fit(data)`` method, e.g. a ``KMeans`` instance.
        It is fitted in place.
    name : str
        Label for the run. It is only needed by the reporting code, which is
        currently disabled, so it is accepted but not used here.
    data : array-like
        Samples passed unchanged to ``estimator.fit``.

    Returns
    -------
    float
        Wall-clock seconds spent inside ``estimator.fit``.
    """
    # Imported locally because the cell-level `from time import time` is
    # commented out; without it the time() call below raises NameError.
    from time import time

    t0 = time()
    estimator.fit(data)
    return time() - t0
'''    
    print('%-9s\t%.2fs\t%i\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f'
          % (name, (time() - t0), estimator.inertia_,
             metrics.homogeneity_score(labels, estimator.labels_),
             metrics.completeness_score(labels, estimator.labels_),
             metrics.v_measure_score(labels, estimator.labels_),
             metrics.adjusted_rand_score(labels, estimator.labels_),
             metrics.adjusted_mutual_info_score(labels,  estimator.labels_),
             metrics.silhouette_score(data, estimator.labels_,
                                      metric='euclidean',
                                      sample_size=sample_size)))



bench_k_means(KMeans(init='k-means++', n_clusters=n_digits, n_init=10),
              name="k-means++", data=data)

bench_k_means(KMeans(init='random', n_clusters=n_digits, n_init=10),
              name="random", data=data)

# in this case the seeding of the centers is deterministic, hence we run the
# kmeans algorithm only once with n_init=1
pca = PCA(n_components=n_digits).fit(data)
bench_k_means(KMeans(init=pca.components_, n_clusters=n_digits, n_init=1),
              name="PCA-based",
              data=data)
print(82 * '_')
'''
# #############################################################################
# Visualize the results on PCA-reduced data

# Project the PCA-reduced features (X_reduced) onto two components so the
# clusters can be drawn in a plane. The pasted example used the name `data`
# here, which is never defined in this notebook.
reduced_data = PCA(n_components=2).fit_transform(X_reduced)
kmeans = KMeans(init='k-means++', n_clusters=n_digits, n_init=10)
kmeans.fit(reduced_data)

# Plot the decision boundary. For that, we will assign a color to each
# point in the mesh [x_min, x_max]x[y_min, y_max].
x_min, x_max = reduced_data[:, 0].min() - 1, reduced_data[:, 0].max() + 1
y_min, y_max = reduced_data[:, 1].min() - 1, reduced_data[:, 1].max() + 1

# Step size of the mesh. Decrease to increase the quality of the VQ.
# The features are not standardized before PCA, so the plotted range can be
# large; the step is widened when needed to keep the mesh at roughly 1000
# points per axis. The original 0.02 is still used for small ranges.
h = max(.02, (x_max - x_min) / 1000, (y_max - y_min) / 1000)
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))

# Obtain labels for each point in mesh. Use last trained model.
Z = kmeans.predict(np.c_[xx.ravel(), yy.ravel()])

# Put the result into a color plot
Z = Z.reshape(xx.shape)
plt.figure(1)
plt.clf()
plt.imshow(Z, interpolation='nearest',
           extent=(xx.min(), xx.max(), yy.min(), yy.max()),
           cmap=plt.cm.Paired,
           aspect='auto', origin='lower')

# Draw the samples themselves as small black dots on top of the cluster regions
plt.plot(reduced_data[:, 0], reduced_data[:, 1], 'k.', markersize=2)
# Plot the centroids as a white X
centroids = kmeans.cluster_centers_
plt.scatter(centroids[:, 0], centroids[:, 1],
            marker='x', s=169, linewidths=3,
            color='w', zorder=10)
plt.title('K-means clustering on the GLSS dataset (PCA-reduced data)\n'
          'Centroids are marked with white cross')
plt.xlim(x_min, x_max)
plt.ylim(y_min, y_max)
plt.xticks(())
plt.yticks(())
plt.show()