In [62]:
#Check to see if gpu is available, should print 1
import tensorflow as tf

print("Num GPUs Available:", len(tf.config.experimental.list_physical_devices('GPU')))

Num GPUs Available: 1


In [63]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Dense, Flatten, Dropout, LeakyReLU

In [64]:
df = pd.read_csv("train.csv")
df.sample(10).head(10)

Unnamed: 0,cuneiform,lang
25368,𒉈𒂗𒉌,NEA
88875,𒄯𒊑,SUX
84216,𒂊𒁍,STB
80007,𒌑𒀸𒅖𒋛𒀀,STB
81117,𒁁𒊩𒋚𒊹𒅇𒌅𒀉𒁮𒈗𒁹𒊮𒌷𒄿𒉺𒄷𒊒,STB
133032,𒃻𒋩,SUX
33502,𒀜𒋾𒄿𒀀𒈾𒈨𒉌𒌅𒍝𒈾𒇷𒉌,NEA
14375,𒊹,LTB
58030,𒀭𒀝𒌋𒀭𒀫𒌓,NEA
132205,𒃵,SUX


In [65]:
df.tail(10)

Unnamed: 0,cuneiform,lang
139411,𒄑𒀳𒋗𒌌,SUX
139412,𒄑𒀳𒋗𒉡𒌌,SUX
139413,𒄑𒀳𒄭𒆥,SUX
139414,𒄑𒀳𒀠𒍣𒊏,SUX
139415,𒄑𒀳𒋗,SUX
139416,𒄑𒀳𒋗𒌌,SUX
139417,𒄑𒀳𒋗𒉡𒌌,SUX
139418,𒄑𒀳𒄭𒆥,SUX
139419,𒄑𒀳𒀠𒍣𒊏,SUX
139420,𒄑𒀳𒋗,SUX


In [66]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 139421 entries, 0 to 139420
Data columns (total 2 columns):
 #   Column     Non-Null Count   Dtype 
---  ------     --------------   ----- 
 0   cuneiform  139421 non-null  object
 1   lang       139421 non-null  object
dtypes: object(2)
memory usage: 2.1+ MB


In [67]:
df.lang.value_counts()

lang
SUX    53673
NEA    32966
STB    17817
LTB    15947
NEB     9707
MPB     5508
OLB     3803
Name: count, dtype: int64

In [68]:
#Describe, 7 Different languages, 3223 samples are just the black circle, which is cuneiform for 3,600 or many or totality or world
#Cuneiform uses a 6 based numbering system, so 3,600 is 60*60 and not an arbitrary number or a dataset mistake

#No one hot encoding is needed, however, label is needed to be converted to integer value
#custom map to easily convert back at end
label_map = {'SUX': 0, 'NEA': 1, 'STB': 2, 'LTB': 3, 'NEB': 4, 'MPB': 5, 'OLB': 6}
df['enc_lang'] = df['lang'].map(label_map)
#ensure correct
df.sample(10).head(10)

Unnamed: 0,cuneiform,lang,enc_lang
12795,𒁹𒆷𒁀𒅆,LTB,3
110613,𒇻𒅗𒀀,SUX,0
101347,𒂍𒈨,SUX,0
32219,𒀲𒀴𒆷𒂊𒈬𒋡𒋙𒌋𒉌,NEA,1
40991,𒋃𒆥𒌅𒀀𒊒𒁲𒂊𒉡,NEA,1
21771,𒀭𒌓𒅀𒀭𒈨𒌍𒅀,MPB,5
35851,𒌦𒈨𒌍𒋧𒅟𒉡𒉌,NEA,1
20405,𒀀𒉡𒌝𒈠𒀸𒋼𒈨𒀀𒉿𒌓𒊭𒋫𒀸𒁍𒊏𒌋𒅗𒊺𒀀𒈾𒅀𒅆,MPB,5
2958,𒀸𒄑𒈠𒅆𒄷,LTB,3
131692,𒀵𒍪,SUX,0


In [69]:
#drop lang label
df.drop(columns=['lang'], inplace=True)
df.sample(10).head(10)

Unnamed: 0,cuneiform,enc_lang
32189,𒀀𒉺𒅁𒊏𒄿𒊑𒅟,1
51259,𒊭𒊹𒁀𒆷𒋛𒄿,1
815,𒄿𒌑𒅖𒌋𒌋𒉡𒅎,3
9935,𒊹,3
4551,𒊹,3
70265,𒌈𒈲𒌈𒊭𒀉𒋾𒀭𒀀𒉏,2
41939,𒇽𒊺𒆗𒉺𒀀𒀀,1
26841,𒅆𒁹𒆷,1
72709,𒁶𒅎𒋛𒀀𒈨𒌍𒌑𒆤𒄭𒉿𒄢𒄢𒆷,2
112793,𒆕𒊩𒌨𒀳,0


In [70]:
#need to seperate up cuneiform into vectors, then one hot encode those vectors for each symbol
#function to decode on unicode
def split_on_unicode(cuneiform):
    return [*cuneiform] #splits cuneiform text into individual characters

#for each sample, split on each character and create a new feature
df['sep_cuneiform'] = df['cuneiform'].apply(split_on_unicode)

In [71]:
df.sample(2).head(2)

Unnamed: 0,cuneiform,enc_lang,sep_cuneiform
87415,𒀊𒍪𒀊𒄠𒃲𒅇𒈾𒁺𒁀𒈪𒉭𒂵𒁉𒁀𒆕,0,"[𒀊, 𒍪, 𒀊, 𒄠, 𒃲, 𒅇, 𒈾, 𒁺, 𒁀, 𒈪, 𒉭, 𒂵, 𒁉, 𒁀, 𒆕]"
65709,𒌗𒀊𒆳𒉏𒈠𒆠,4,"[𒌗, 𒀊, 𒆳, 𒉏, 𒈠, 𒆠]"


In [72]:
#get unique cuneiform symbols, one hot encode
unique_cuneiform = set()
for sublist in df['sep_cuneiform']:
    for char in sublist:
        unique_cuneiform.add(char)
        
unique_cuneiform_list = list(unique_cuneiform)
uniq_cuneiform_df = pd.DataFrame(0, index=df.index, columns=unique_cuneiform_list)

for index, row in df.iterrows(): #iterate through each row in dataframe
    for char in row['sep_cuneiform']: #iterate through sep_cuneiform array
        uniq_cuneiform_df.at[index, char] += 1 #update intersection, add one to the uniq cuneiform df at the right spot
        #change above to one_hot if running into issues

df = pd.concat([df, uniq_cuneiform_df], axis=1)

In [131]:
df.sample(5).head(5)

Unnamed: 0,enc_lang,𒂉,𒄌,𒊩,𒄄,𒓎,𒁽,𒅫,𒂏,𒉬,...,𒋥,𒅅,𒂟,𒀮,𒁑,𒅎,𒇇,𒂙,𒀛,𒌺
91503,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
18410,6,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
31583,1,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
131832,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
107930,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [94]:
df.drop(['cuneiform', 'sep_cuneiform'], axis=1, inplace=True)
df.sample(5).head(5)

Unnamed: 0,enc_lang,𒂉,𒄌,𒊩,𒄄,𒓎,𒁽,𒅫,𒂏,𒉬,...,𒋥,𒅅,𒂟,𒀮,𒁑,𒅎,𒇇,𒂙,𒀛,𒌺
46146,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
101879,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
13191,3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
21649,5,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
108761,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [107]:
#Stratify and split, 80% train, 20% test
X_train, X_test = train_test_split(df,test_size=.2,stratify=df['enc_lang'])

In [108]:
#Print sizes, only two labels, the cuneiform text and the language
print("X train shape -", X_train.shape)
print("X test shape -", X_test.shape)
print(X_train.head())
print(X_test.head())

X train shape - (111536, 2)
X test shape - (27885, 2)
                 cuneiform  enc_lang
130937                  𒍜𒋓         0
63008                    𒋙         4
1191                     𒋫         3
111747                 𒈬𒈬𒀀         0
5838    𒁕𒋻𒋝𒈬𒋥𒀀𒇉𒀭𒈹𒊹𒌑𒍑𒆠𒌑𒅎𒍇𒇻𒁕         3
                                             cuneiform  enc_lang
110996                                               𒄑         0
117984                                              𒄭𒂵         0
97126                                            𒋆𒋛𒀀𒅆𒂟         0
60322   𒀭𒊩𒌆𒆠𒃲𒀸𒊭𒀜𒈬𒅆𒀸𒈧𒈪𒄿𒍣𒍝𒈠𒅅𒁉𒄿𒋙𒊹𒂗𒇻𒀬𒅟𒀀𒈾𒆪𒈠𒋢𒌒𒉿𒄿𒅟𒇻𒍑𒈨𒄭𒋚𒋫𒅟𒇻𒆗𒅆𒅟         4
55995                                   𒂅𒌒𒍜𒅇𒄷𒌓𒊮𒁉𒀭𒈨𒌍𒃲𒈨𒌍         1


In [109]:
# Create your labels for each dataset, which would be the language
y_train = X_train['enc_lang'].copy()
X_train.drop('enc_lang', axis=1, inplace=True)

y_test = X_test['enc_lang'].copy()
X_test.drop('enc_lang', axis=1, inplace=True)