In [None]:
# Colab-only helper for attaching Google Drive to the runtime's filesystem.
from google.colab import drive
# Mount Drive at /content/drive so the CSV read in the next cell can find it.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import ast

import sklearn
import pandas as pd

# NOTE: relative path -- it resolves against the kernel's current working
# directory, while the Drive mount above uses the absolute /content/drive.
data = pd.read_csv('drive/MyDrive/data/RUG.csv')


def _tokens_to_sentence(cell):
    """Turn one cell of a token column into a single space-separated sentence.

    A column that held Python lists when the CSV was written is read back as
    the *string* form of each list (e.g. "['foo', 'bar']").  Calling
    ' '.join() directly on such a string puts a space between every
    character, so the string is parsed back into a list first.
    NOTE(review): assumes the three token columns were saved as list
    literals -- confirm against the CSV.

    Args:
        cell: a list/tuple of tokens, a string holding a list literal, or a
            plain text string.

    Returns:
        The tokens joined by single spaces; a plain (non-list) string is
        returned unchanged.
    """
    if isinstance(cell, str):
        text = cell.strip()
        if not (text.startswith('[') and text.endswith(']')):
            # Already plain text: joining would only space out its characters.
            return cell
        try:
            cell = ast.literal_eval(text)
        except (ValueError, SyntaxError):
            # Bracketed but not a valid literal: keep the text as it is.
            return cell
    return ' '.join(map(str, cell))


lemSentences = data['lemmatized_outcome'].apply(_tokens_to_sentence)
stemSentences = data['stemmed_outcome'].apply(_tokens_to_sentence)
stemLemSentences = data['lemmatized_outcome_s'].apply(_tokens_to_sentence)

In [None]:
!pip install keybert
!pip install keybert[flair]
!pip install keybert[gensim]
!pip install keybert[spacy]
!pip install keybert[use]
from keybert import KeyBERT

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting keybert
  Downloading keybert-0.5.1.tar.gz (19 kB)
Collecting sentence-transformers>=0.3.8
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[K     |████████████████████████████████| 85 kB 2.9 MB/s 
Collecting rich>=10.4.0
  Downloading rich-12.4.4-py3-none-any.whl (232 kB)
[K     |████████████████████████████████| 232 kB 24.6 MB/s 
Collecting commonmark<0.10.0,>=0.9.0
  Downloading commonmark-0.9.1-py2.py3-none-any.whl (51 kB)
[K     |████████████████████████████████| 51 kB 8.8 MB/s 
Collecting transformers<5.0.0,>=4.6.0
  Downloading transformers-4.20.1-py3-none-any.whl (4.4 MB)
[K     |████████████████████████████████| 4.4 MB 54.6 MB/s 
Collecting sentencepiece
  Downloading sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[K     |████████████████████████████████| 1.2 MB 55.5 MB/s 
[?25hCollecting huggingface-hub>=0.4.0
  D

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


# **Embedding**

In [None]:
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans
from sklearn.cluster import DBSCAN
from sklearn.cluster import AgglomerativeClustering

# Sentence encoders. `model` produces every embedding in the cells below;
# `model2` is loaded here but not referenced by any cell shown in this notebook.
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
model2 = SentenceTransformer('sentence-transformers/paraphrase-MiniLM-L6-v2')
# encode() is documented for a str or a list of str. Passing a plain list
# avoids the pandas Series' label-based integer lookup, which only matches
# positional order while the Series keeps its default RangeIndex.
lemembeddings = model.encode(list(lemSentences))

In [None]:
# Sanity check: show the embeddings computed from the lemmatized sentences.
print(lemembeddings)

In [None]:
# Embed the stemmed sentences. A plain list is passed because encode() is
# documented for str / list of str, and indexing a pandas Series by integer is
# label-based (correct only while the Series has its default RangeIndex).
stemembeddings = model.encode(list(stemSentences))

In [None]:
# Embed the stemmed+lemmatized sentences. A plain list is passed because
# encode() is documented for str / list of str, and indexing a pandas Series by
# integer is label-based (correct only while it has its default RangeIndex).
stemLemEmbeddings = model.encode(list(stemLemSentences))

# **Clustering**

In [None]:
# Cluster the lemmatized-sentence embeddings with five different clusterers.
# n_init=10 is spelled out because it was the implicit default in older
# scikit-learn releases; newer ones change the default, which would alter results.
lemKmeans = KMeans(n_clusters=7, n_init=10, random_state=0).fit(lemembeddings)
lemDbscan = DBSCAN(eps=0.5, min_samples=3, metric="euclidean").fit(lemembeddings)
# The deprecated `affinity="euclidean"` argument is dropped: euclidean is the
# default distance, so this runs unchanged on releases that removed `affinity`.
lemClusterWard = AgglomerativeClustering(n_clusters=7, linkage="ward").fit(lemembeddings)
lemClusterSingle = AgglomerativeClustering(n_clusters=7, linkage="single").fit(lemembeddings)
lemClusterComplete = AgglomerativeClustering(n_clusters=7, linkage="complete").fit(lemembeddings)

In [None]:
# Cluster the stemmed-sentence embeddings with five different clusterers.
# n_init=10 is spelled out because it was the implicit default in older
# scikit-learn releases; newer ones change the default, which would alter results.
stemKmeans = KMeans(n_clusters=7, n_init=10, random_state=0).fit(stemembeddings)
stemDbscan = DBSCAN(eps=0.5, min_samples=3, metric="euclidean").fit(stemembeddings)
# The deprecated `affinity="euclidean"` argument is dropped: euclidean is the
# default distance, so this runs unchanged on releases that removed `affinity`.
stemClusterWard = AgglomerativeClustering(n_clusters=7, linkage="ward").fit(stemembeddings)
stemClusterSingle = AgglomerativeClustering(n_clusters=7, linkage="single").fit(stemembeddings)
stemClusterComplete = AgglomerativeClustering(n_clusters=7, linkage="complete").fit(stemembeddings)

In [None]:
# Cluster the stemmed+lemmatized embeddings with five different clusterers.
# n_init=10 is spelled out because it was the implicit default in older
# scikit-learn releases; newer ones change the default, which would alter results.
stemlemKmeans = KMeans(n_clusters=7, n_init=10, random_state=0).fit(stemLemEmbeddings)
stemlemDbscan = DBSCAN(eps=0.5, min_samples=3, metric="euclidean").fit(stemLemEmbeddings)
# The deprecated `affinity="euclidean"` argument is dropped: euclidean is the
# default distance, so this runs unchanged on releases that removed `affinity`.
stemlemClusterWard = AgglomerativeClustering(n_clusters=7, linkage="ward").fit(stemLemEmbeddings)
stemlemClusterSingle = AgglomerativeClustering(n_clusters=7, linkage="single").fit(stemLemEmbeddings)
stemlemClusterComplete = AgglomerativeClustering(n_clusters=7, linkage="complete").fit(stemLemEmbeddings)

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import rand_score
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

# Integer-encode the Department column; `labels` is the reference labelling
# that the clustering scores and the classifiers below are compared against.
departmentEncoder = LabelEncoder()
labels = departmentEncoder.fit_transform(data['Department'])

In [None]:
# Report, per clusterer, the Rand index between the Department labels and the
# cluster assignments found on the lemmatized embeddings (the header text
# calls this "Accuracy").
print('Lem Accuracy of Data Set: ')

for prefix, fitted in (
    ('Kmeans :', lemKmeans),
    ('Dbscan : ', lemDbscan),
    ('AGG Ward : ', lemClusterWard),
    ('AGG Single :', lemClusterSingle),
    ('AGG Complete : ', lemClusterComplete),
):
    print(prefix + str(rand_score(labels, fitted.labels_)))

In [None]:
# Report, per clusterer, the Rand index between the Department labels and the
# cluster assignments found on the stemmed embeddings (the header text calls
# this "Accuracy").
print('Stem Accuracy of Data Set: ')

for prefix, fitted in (
    ('Kmeans :', stemKmeans),
    ('Dbscan : ', stemDbscan),
    ('AGG Ward : ', stemClusterWard),
    ('AGG Single :', stemClusterSingle),
    ('AGG Complete : ', stemClusterComplete),
):
    print(prefix + str(rand_score(labels, fitted.labels_)))

In [None]:
# Report, per clusterer, the Rand index between the Department labels and the
# cluster assignments found on the stemmed+lemmatized embeddings (the header
# text calls this "Accuracy").
print('StemLem Accuracy of Data Set: ')

for prefix, fitted in (
    ('Kmeans :', stemlemKmeans),
    ('Dbscan : ', stemlemDbscan),
    ('AGG Ward : ', stemlemClusterWard),
    ('AGG Single :', stemlemClusterSingle),
    ('AGG Complete : ', stemlemClusterComplete),
):
    print(prefix + str(rand_score(labels, fitted.labels_)))

# **Classification**

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split

# The classifiers are trained on the sentence embeddings computed in the
# Embedding section, with the encoded Department ids as targets.
clf = DecisionTreeClassifier(criterion="gini", max_depth=20, max_features="sqrt", random_state=0)

# One train/test split per embedding variant, all using the same test_size and
# random_state.
X_train, X_test, y_train, y_test = train_test_split(
    lemembeddings, labels, test_size=0.33, random_state=42)
X_trainStem, X_testStem, y_trainStem, y_testStem = train_test_split(
    stemembeddings, labels, test_size=0.33, random_state=42)
X_trainStemLem, X_testStemLem, y_trainStemLem, y_testStemLem = train_test_split(
    stemLemEmbeddings, labels, test_size=0.33, random_state=42)

clf = clf.fit(X_train, y_train)
# The printed figure is labelled "accuracy", so use accuracy_score (imported in
# the metrics cell). rand_score is a pair-counting clustering index that ignores
# which class was predicted and is not the fraction of correct predictions.
re = accuracy_score(y_test, clf.predict(X_test))
print('lem accuracy gini tree is : ' + str(re))

In [None]:
# Re-fit the same decision tree on the stemmed-embedding split.
clf = clf.fit(X_trainStem, y_trainStem)
# The printed figure is labelled "accuracy", so use accuracy_score (imported in
# the metrics cell) rather than rand_score, which is a clustering index.
re = accuracy_score(y_testStem, clf.predict(X_testStem))
print('stem accuracy gini tree is : ' + str(re))

In [None]:
# Re-fit the same decision tree on the stemmed+lemmatized-embedding split.
clf = clf.fit(X_trainStemLem, y_trainStemLem)
# The printed figure is labelled "accuracy", so use accuracy_score (imported in
# the metrics cell) rather than rand_score, which is a clustering index.
re = accuracy_score(y_testStemLem, clf.predict(X_testStemLem))
print('stemLem accuracy gini tree is : ' + str(re))

In [None]:
from sklearn.naive_bayes import GaussianNB

# Gaussian naive Bayes on the lemmatized-embedding split.
clfBa = GaussianNB()
clfBa = clfBa.fit(X_train, y_train)

# The printed figure is labelled "accuracy", so use accuracy_score (imported in
# the metrics cell) rather than rand_score, which is a clustering index.
re = accuracy_score(y_test, clfBa.predict(X_test))
print('lem accuracy Bayes is : ' + str(re))

In [None]:
# Re-fit the same GaussianNB estimator on the stemmed-embedding split.
clfBa = clfBa.fit(X_trainStem, y_trainStem)

# The printed figure is labelled "accuracy", so use accuracy_score (imported in
# the metrics cell) rather than rand_score, which is a clustering index.
re = accuracy_score(y_testStem, clfBa.predict(X_testStem))
print('stem accuracy Bayes is : ' + str(re))

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# k-nearest-neighbours with the library's default settings, fitted on the
# lemmatized-embedding split.
clfKNN = KNeighborsClassifier()
clfKNN = clfKNN.fit(X_train, y_train)

# The printed figure is labelled "accuracy", so use accuracy_score (imported in
# the metrics cell) rather than rand_score, which is a clustering index.
re = accuracy_score(y_test, clfKNN.predict(X_test))
print('lem accuracy KNN is : ' + str(re))

In [None]:
# Re-fit the same KNN estimator on the stemmed-embedding split.
clfKNN = clfKNN.fit(X_trainStem, y_trainStem)

# The printed figure is labelled "accuracy", so use accuracy_score (imported in
# the metrics cell) rather than rand_score, which is a clustering index.
re = accuracy_score(y_testStem, clfKNN.predict(X_testStem))
print('stem accuracy KNN is : ' + str(re))