In [124]:
import pandas as pd
import numpy as np
from sklearn.cluster import DBSCAN

In [187]:
#For now, I am dropping rows that have missing values, since DBSCAN only works when there are no NANs
people = people.dropna()
people

Unnamed: 0,name,roles,profession,processed date,Archive
21493,...-gal,{'source'},uš[die],76.1208,dead animal
1997,AN,{'source'},gu[cord],69.0100,dead animal
14402,AN,{'source'},guza[chair],68.1119,domesticated animal
13148,ARAD₂-mu,{'recipient'},ensik[ruler],79.0100,domesticated animal
13151,ARAD₂-mu,{'recipient'},ensik[ruler],79.0100,domesticated animal
2351,ARAD₂-mu,{'source'},maškim[administrator],63.0830,dead animal
21516,ARAD₂-mu,{'recipient'},sukkalmah[official],70.0800,domesticated animal
21527,ARAD₂-mu,{'recipient'},sukkalmah[official],74.1229,domesticated animal
17669,ARAD₂-mu,{'source'},maškim[administrator],74.1104,domesticated animal
18381,ARAD₂-mu,{'source'},maškim[administrator],70.1226,wild animal


In [181]:
people = pd.read_csv("people_Drehem.csv")
archives = pd.read_csv("archive_map.csv")
archives['PID'] = [int(archives['PID'][i][1:]) for i in np.arange(len(archives['PID']))]
people = pd.merge(people, archives, how='inner', left_on='p index', right_on='PID')
people = people.drop(['PID', 'normalized name', 'family', 'date name', 'p index'], axis=1)
people = people.sort_values(['name'])

Our people dataframe contains columns for the variables we want to use on our clustering algorithm. Our next step is to convert our categorical data into numerical data. We can do this by creating a binary matrix. 

In [188]:
people_cat = people.select_dtypes(include=[object])
people_cat['profession'] = people_cat['profession'].astype(str)
people_cat['roles'] = people_cat['roles'].astype(str)
people_cat.head(3)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,name,roles,profession,Archive
21493,...-gal,{'source'},uš[die],dead animal
1997,AN,{'source'},gu[cord],dead animal
14402,AN,{'source'},guza[chair],domesticated animal


In [189]:
# import preprocessing from sklearn
from sklearn import preprocessing

In [190]:
people_cat.columns

Index(['name', 'roles', 'profession', 'Archive'], dtype='object')

In [191]:
# TODO: create a LabelEncoder object and fit it to each feature in people_cat


# 1. INSTANTIATE
# encode labels with value between 0 and n_classes-1.
le = preprocessing.LabelEncoder()


# 2/3. FIT AND TRANSFORM
# use df.apply() to apply le.fit_transform to all columns
people_cat_2 = people_cat.apply(le.fit_transform)
people_cat_2

Unnamed: 0,name,roles,profession,Archive
21493,0,9,85,0
1997,1,9,28,0
14402,1,9,31,1
13148,2,5,23,1
13151,2,5,23,1
2351,2,9,51,0
21516,2,5,72,1
21527,2,5,72,1
17669,2,9,51,1
18381,2,9,51,6


In [192]:
# TODO: create a OneHotEncoder object, and fit it to all of people_cat

# 1. INSTANTIATE
enc = preprocessing.OneHotEncoder()

# 2. FIT
enc.fit(people_cat_2)

# 3. Transform
onehotlabels = enc.transform(people_cat_2).toarray()
onehotlabels.shape

# as you can see, you've the same number of rows 891
# but now you've so many more columns due to how we changed all the categorical data into numerical data

(2292, 659)

In [193]:
onehotlabels

array([[1., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

We just converted our categorical data into a sparse matrix. We should add this matrix to our quantitative data. 

In [194]:
dates = [[people['processed date'].values[i]] for i in np.arange(len(people['processed date']))]
people_cat_quant = np.hstack((onehotlabels, dates))
people_cat_quant

array([[ 1.    ,  0.    ,  0.    , ...,  0.    ,  0.    , 76.1208],
       [ 0.    ,  1.    ,  0.    , ...,  0.    ,  0.    , 69.01  ],
       [ 0.    ,  1.    ,  0.    , ...,  0.    ,  0.    , 68.1119],
       ...,
       [ 0.    ,  0.    ,  0.    , ...,  0.    ,  0.    , 73.0816],
       [ 0.    ,  0.    ,  0.    , ...,  0.    ,  0.    , 83.0217],
       [ 0.    ,  0.    ,  0.    , ...,  0.    ,  0.    , 72.0409]])

Our matrix is now ready to be passed through the DBSCAN algorithm. 

Let's try the algorithm with the name "šu-{d}šul-gi"

In [208]:
people.loc[people['name'] == "šu-{d}šul-gi"]
name_example = people_cat_quant[people_cat_quant[:,536] == 1,:]

In [201]:
ni = {'name': people.groupby(['name']).size().keys()}
names_and_indices = pd.DataFrame(data=ni)
names_and_indices.head()

Unnamed: 0,name
0,...-gal
1,AN
2,ARAD₂-mu
3,ARAD₂-{d}nanna
4,AŠ


In [211]:
clustering = DBSCAN(eps=2).fit(name_example)
clustering.labels_

array([ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, -1,  0, -1,  0,  0, -1,
        0])