In [None]:
import pickle as pkl
import re
import string

import nltk
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
# Fetch the NLTK resources the notebook relies on:
#   punkt     -> nltk.word_tokenize
#   stopwords -> nltk.corpus.stopwords
#   wordnet   -> nltk.stem.WordNetLemmatizer (used by the word-count cells)
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

In [None]:
# remove stop words
# remove links
# remove punctuation
# remove hashtags

# Raw string so \w, \d and \+ are regex escapes rather than invalid string
# escapes; compiled once instead of on every call.
_LINK_REGEX = re.compile(r'((https?):((//)|(\\))+([\w\d:#@%/;$()~_?\+-=\\.&](#!)?)*)', re.DOTALL)

def strip_links(text):
    """Replace every http(s) link in ``text`` with ``', '``.

    Each regex match is substituted in place, so a short link that happens to
    be the prefix of a longer one cannot leave a fragment of the longer link
    behind.

    Args:
        text: input string.

    Returns:
        The string with every matched link replaced by ``', '``.
    """
    return _LINK_REGEX.sub(', ', text)

def strip_all_entities(text):
    """Blank out punctuation and drop @mentions and #hashtags.

    Every punctuation character except '@' and '#' becomes a space; the text
    is then split on whitespace and any token starting with '@' or '#' is
    discarded. The surviving tokens are joined with single spaces.
    """
    markers = ('@', '#')
    # One translation table instead of one replace() per punctuation character.
    to_space = {ord(ch): ' ' for ch in string.punctuation if ch not in markers}
    tokens = text.translate(to_space).split()
    return ' '.join(tok for tok in tokens if tok[0] not in markers)

def preprocess(text):
    """Normalise a raw post before it is tokenised for BERT.

    Lower-cases the text, strips links, punctuation, @mentions and #hashtags,
    tokenises with NLTK and drops English stop words.

    Args:
        text: raw post text.

    Returns:
        The remaining tokens joined by single spaces.
    """
    # Build the stop-word set once; looking it up inside the comprehension
    # would re-read the corpus list for every single token.
    stop_words = set(nltk.corpus.stopwords.words('english'))

    text = text.lower()
    text = strip_all_entities(strip_links(text))
    tokens = nltk.word_tokenize(text)
    return " ".join(word for word in tokens if word not in stop_words)

In [None]:
# Load the corpus; later cells read its 'English' and 'Username' columns.
train_df = pd.read_csv("Data_English.csv")
train_df

In [None]:
# Clean every post in the 'English' column.
train_texts = train_df['English'].apply(preprocess)
#train_labels = train_df["target"]

# train_df is deliberately kept alive: later cells attach the embeddings to it
# and group its rows by 'Username'.

In [None]:
seq_len = 40
num_samples = len(train_texts)

# Integer buffers for token ids and attention masks; int32 matches the dtype
# of the Keras inputs that consume them (np.zeros defaults to float64).
Xids = np.zeros((num_samples, seq_len), dtype=np.int32)
Xmask = np.zeros((num_samples, seq_len), dtype=np.int32)

#labels = train_labels.to_numpy()
#labels = np.expand_dims(labels, axis=0).T

Xids.shape

In [None]:
from transformers import BertTokenizer
from tqdm import tqdm

# The tokenizer vocabulary has to match the checkpoint loaded for the encoder
# ('bert-base-uncased'); ids produced with the cased vocabulary would index
# the wrong embedding rows. The texts are lower-cased by preprocess() anyway.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Encode each text to exactly seq_len ids (padded / truncated) and store the
# ids and attention mask in row i of the preallocated buffers.
for i, phrase in enumerate(tqdm(train_texts)):
    token = tokenizer.encode_plus(
        phrase, max_length=seq_len, add_special_tokens=True,
        padding="max_length", truncation=True, return_tensors='tf')

    Xids[i, :] = token['input_ids']
    Xmask[i, :] = token['attention_mask']

<hr>

In [None]:
def map_func(inputs_ids, masks):
    """Bundle ids and masks into a dict keyed 'input_ids' / 'attention_mask'."""
    features = dict(input_ids=inputs_ids, attention_mask=masks)
    return features

In [None]:

batch_size = 8

# Slice the id/mask arrays row by row, wrap each pair in the feature dict
# built by map_func, then group the rows into batches.
dataset = (
    tf.data.Dataset.from_tensor_slices((Xids, Xmask))
    .map(map_func)
    .batch(batch_size)
)
dataset.take(1)

In [None]:
from transformers import TFAutoModel

# Load the pretrained 'bert-base-uncased' checkpoint.
bert = TFAutoModel.from_pretrained('bert-base-uncased')

# Mark the encoder non-trainable; this notebook only calls model.predict().
bert.trainable = False

In [None]:
# Two int32 inputs of length seq_len, named after the keys produced by map_func.
input_ids = keras.layers.Input(shape=(seq_len,), name="input_ids", dtype="int32")
attention_mask = keras.layers.Input(shape=(seq_len,), name="attention_mask", dtype="int32")

# Run the wrapped encoder (bert.bert) and keep element [1] of its output as the embedding.
# NOTE(review): [1] is presumably the pooled output -- confirm against the transformers docs.
embeddings = bert.bert(input_ids, attention_mask=attention_mask)[1]

# Classification head kept for reference only; not part of the model built below.
#x = layers.Dense(1024, activation="relu")(embeddings)
#x = layers.Dropout(0.5)(x)
#x = layers.Dense(1, activation="sigmoid")(x)

In [None]:
# Feature extractor: maps (input_ids, attention_mask) to the embeddings tensor.
model = keras.Model(inputs=[input_ids, attention_mask], outputs=embeddings)

In [None]:
# Print the layer-by-layer summary of the model.
model.summary()

In [None]:
# NOTE(review): the notebook only calls model.predict(); the optimizer and loss
# configured here are presumably never exercised -- confirm before relying on them.
model.compile(
    optimizer= 'adam',
    loss='binary_crossentropy',
    
)

In [None]:
# Run the model over the batched dataset and collect its outputs.
outputs = model.predict(dataset)

In [None]:

# Persist the document embeddings; the context manager guarantees the file is
# flushed and closed even if pickling fails.
with open("embeddings.pkl", "wb") as fh:
    pkl.dump(outputs, fh)

In [None]:
# Store each row's embedding (as a plain Python list) in a new 'embeddings' column.
# Requires train_df to be in scope.
train_df["embeddings"] = outputs.tolist()

In [None]:
def mode_embedding(embeddings):
    """Collapse a stack of embedding vectors into a single vector.

    Despite the name, no mode is computed: the result is the element-wise
    sum of the vectors along the first axis.
    """
    stacked = np.array(embeddings)
    return stacked.sum(axis=0)

In [None]:
# Sanity check: shape of the summed embedding for one example user.
mode_embedding(train_df[train_df["Username"]=="000kiran_"]["embeddings"].to_list()).shape

In [None]:
# One 768-wide row per distinct user, in order of first appearance.
usernames = train_df["Username"].unique()
embeddings = np.zeros((len(usernames), 768))

for i, username in enumerate(tqdm(usernames)):
    # Sum the embeddings of every post written by this user.
    user_rows = train_df.loc[train_df["Username"] == username, "embeddings"]
    embeddings[i, :] = mode_embedding(user_rows.to_list())

In [None]:
# Persist the per-user embeddings; the context manager guarantees the file is
# flushed and closed even if pickling fails.
with open("user embeddings.pkl", "wb") as fh:
    pkl.dump(embeddings, fh)

In [None]:
import pickle as pkl

# Reload the cached embeddings, closing each file as soon as it has been read.
# NOTE: only unpickle files produced by this notebook -- pickle can execute
# arbitrary code from untrusted input.
with open("embeddings.pkl", "rb") as fh:
    doc_embeddings = pkl.load(fh)
with open("user embeddings.pkl", "rb") as fh:
    user_embeddings = pkl.load(fh)

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from kneed import KneeLocator
import seaborn as sns

In [None]:
# Cluster the per-user embeddings into 4 groups with a fixed seed.
kmeans = KMeans(n_clusters=4, random_state=0, n_init="auto").fit(user_embeddings)

In [None]:
# Write one (username, cluster) row per distinct user.
# NOTE(review): presumably the rows of user_embeddings follow the order of
# train_df["Username"].unique() -- confirm, otherwise labels are misaligned.
pd.DataFrame({"username":train_df["Username"].unique(), "cluster":kmeans.labels_}).to_csv("Clustered users.csv", index=False)

In [None]:
from sklearn.preprocessing import StandardScaler  # to standardize the features
from sklearn.decomposition import PCA  # to apply PCA

In [None]:
# Reload the saved user/cluster assignments.
user_clusters = pd.read_csv("Clustered users.csv")

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score
from sklearn.decomposition import PCA
import pickle as pkl
import pandas as pd

# doc_embeddings = pkl.load(open("embeddings.pkl", "rb"))
# Read the cached per-user embeddings, closing the file once loaded.
with open("user embeddings.pkl", "rb") as fh:
    user_embeddings = pkl.load(fh)
user_clusters = pd.read_csv("Clustered users.csv")

# Standardise the user embeddings before clustering.
scalar = StandardScaler()
scaled_data = pd.DataFrame(scalar.fit_transform(user_embeddings)) # standardised features
# scaled_data = user_embeddings

# Cluster the standardised embeddings into 4 groups with a fixed seed.
kmeans = KMeans(n_clusters=4, random_state=0, n_init="auto").fit(scaled_data)
clusters = kmeans.labels_

# Project onto the first three principal components for a 3D view.
pca = PCA(n_components=3)
reduced = pca.fit_transform(scaled_data)

# Transpose so each row of t holds one principal component (all x values in
# t[0], all y values in t[1], all z values in t[2]).
t = reduced.transpose()

fig = plt.figure()
ax = plt.axes(projection='3d')
ax.scatter3D(t[0, :], t[1, :], t[2, :], c=clusters, cmap="viridis")
plt.show()


# Same data projected onto two principal components for a flat scatter plot.
pca = PCA(n_components=2)
reduced = pca.fit_transform(scaled_data)

# Transpose so t[0] holds all x values and t[1] all y values.
t = reduced.transpose()

plt.scatter(t[0], t[1], c=clusters)
plt.show()

# Silhouette score of the fitted clustering, computed in the full standardised space.
print("silhouette score:", silhouette_score(scaled_data, kmeans.predict(scaled_data)))

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from kneed import KneeLocator
import seaborn as sns
import matplotlib.pyplot as plt
from tqdm import tqdm
import pickle as pkl
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

def preprocess(df):
    """Standardise the features in ``df`` before KMeans clustering.

    Fits a fresh StandardScaler on ``df`` and returns the transformed data.
    NOTE: this reuses the name of the text-cleaning ``preprocess`` defined
    elsewhere in the notebook and replaces it once this cell has run.
    """
    # df_log = np.log1p(df)
    return StandardScaler().fit_transform(df)

def find_k(df, increment=0, decrement=0, k_values=range(1, 10)):
    """Find the optimum number of KMeans clusters with the elbow method.

    Fits one KMeans model per candidate k, records its inertia (SSE) and asks
    KneeLocator for the knee of the resulting convex, decreasing curve.

    Args:
        df: feature matrix to cluster; it is used as-is (no scaling applied).
        increment: value added to the detected knee.
        decrement: value subtracted from the detected knee.
        k_values: candidate cluster counts to try. Defaults to 1..9.

    Returns:
        The detected knee adjusted by ``increment`` and ``decrement``.

    Raises:
        ValueError: if KneeLocator finds no knee in the inertia curve.
    """
    # df_norm = preprocess(df)
    sse = {}

    for k in tqdm(list(k_values)):
        kmeans = KMeans(n_clusters=k, random_state=1)
        kmeans.fit(df)
        sse[k] = kmeans.inertia_

    kn = KneeLocator(x=list(sse.keys()),
                     y=list(sse.values()),
                     curve='convex',
                     direction='decreasing')
    # kn.knee is None when no knee is detected; fail with a clear message
    # instead of a TypeError from ``None + int``.
    if kn.knee is None:
        raise ValueError("no knee found in the KMeans inertia curve; try a wider range of k")
    return kn.knee + increment - decrement

# Read the cached document embeddings, closing the file once loaded.
with open("embeddings.pkl", "rb") as fh:
    doc_embeddings = pkl.load(fh)
user_clusters = pd.read_csv("Clustered users.csv")
doc_clusters = pd.read_csv("Data_English.csv")

# Elbow-method estimate of the number of document clusters.
num_clusters = find_k(doc_embeddings)
num_clusters

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler

import pickle as pkl
import pandas as pd

# Read the cached document embeddings, closing the file once loaded.
with open("embeddings.pkl", "rb") as fh:
    doc_embeddings = pkl.load(fh)
user_clusters = pd.read_csv("Clustered users.csv")
doc_clusters = pd.read_csv("Data_English.csv")

scalar = StandardScaler()
# scaled_data = pd.DataFrame(scalar.fit_transform(doc_embeddings)) #scaling the data
# Scaling is currently disabled: the raw document embeddings are clustered.
scaled_data = doc_embeddings

# Fixed seed so the labels (which later cells attach to the posts and use for
# the per-cluster word counts) are reproducible across runs, matching the
# other KMeans calls in this notebook.
kmeans = KMeans(n_clusters=3, random_state=0, n_init="auto").fit(scaled_data)
clusters = kmeans.labels_

# Project onto the first three principal components for a 3D view.
pca = PCA(n_components=3)
reduced = pca.fit_transform(scaled_data)

# Transpose so each row of t holds one principal component (all x values in
# t[0], all y values in t[1], all z values in t[2]).
t = reduced.transpose()

fig = plt.figure()
ax = plt.axes(projection='3d')
ax.scatter3D(t[0, :], t[1, :], t[2, :], c=clusters, cmap="viridis")
plt.show()


# Same data projected onto two principal components for a flat scatter plot.
pca = PCA(n_components=2)
reduced = pca.fit_transform(scaled_data)

# Transpose so t[0] holds all x values and t[1] all y values.
t = reduced.transpose()

plt.scatter(t[0], t[1], c=clusters)
plt.show()

# Silhouette score of the fitted clustering, computed in the full embedding space.
print("silhouette score:", silhouette_score(scaled_data, kmeans.predict(scaled_data)))

In [None]:
# Reload the raw posts so cluster labels can be attached to them.
doc_clusters = pd.read_csv("Data_English.csv")


In [None]:
# Attach the labels currently held in `clusters` to the posts.
# NOTE(review): `clusters` is whatever the most recently run clustering cell
# produced -- confirm it is the document-level result with one label per row.
doc_clusters["clusters"] = clusters
doc_clusters

In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from collections import Counter
from tqdm import tqdm
import string
import re

# Raw string so \w, \d and \+ are regex escapes rather than invalid string
# escapes; compiled once instead of on every call.
_LINK_REGEX = re.compile(r'((https?):((//)|(\\))+([\w\d:#@%/;$()~_?\+-=\\.&](#!)?)*)', re.DOTALL)

def strip_links(text):
    """Replace every http(s) link in ``text`` with ``', '``.

    Each regex match is substituted in place, so a short link that happens to
    be the prefix of a longer one cannot leave a fragment of the longer link
    behind.

    Args:
        text: input string.

    Returns:
        The string with every matched link replaced by ``', '``.
    """
    return _LINK_REGEX.sub(', ', text)

def strip_all_entities(text):
    """Blank out punctuation and drop @mentions and #hashtags.

    Every punctuation character except '@' and '#' becomes a space; the text
    is then split on whitespace and any token starting with '@' or '#' is
    discarded. The surviving tokens are joined with single spaces.
    """
    markers = ('@', '#')
    # One translation table instead of one replace() per punctuation character.
    to_space = {ord(ch): ' ' for ch in string.punctuation if ch not in markers}
    tokens = text.translate(to_space).split()
    return ' '.join(tok for tok in tokens if tok[0] not in markers)

def preprocess(text):
    """Lower-case ``text``, then strip links, punctuation, @mentions and #hashtags."""
    return strip_all_entities(strip_links(text.lower()))

def preprocess_text(text, remove_stopwords=False):
    """Clean ``text`` and return its lemmatised tokens.

    Args:
        text: raw post text.
        remove_stopwords: when True, English stop words are dropped before
            lemmatisation. Defaults to False, which keeps every token; the
            stop-word corpus is only loaded when this flag is set.

    Returns:
        List of lemmatised tokens.
    """
    text = preprocess(text)

    # Remove any punctuation still present after preprocess()
    text = text.translate(str.maketrans('', '', string.punctuation))

    # Tokenize the text
    tokens = nltk.word_tokenize(text.lower())

    # Optionally remove stopwords
    if remove_stopwords:
        stop_words = set(stopwords.words('english'))
        tokens = [word for word in tokens if word not in stop_words]

    # Lemmatize the words
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(token) for token in tokens]

def get_word_count(text_list):
    """Tally how many times each item occurs in ``text_list``."""
    return Counter(text_list)

# Write one word-frequency CSV per cluster label 0, 1 and 2.
for i in range(3):
    # Select the posts assigned to cluster i
    df = doc_clusters[doc_clusters["clusters"] == i]

    # The post text lives in the 'English' column
    texts = df['English']

    # Preprocess the texts
    preprocessed_texts = [preprocess_text(item) for item in tqdm(texts, f"preprocess cluster {i}")]

    # Flatten the list of preprocessed tokens
    all_tokens = [token for sublist in preprocessed_texts for token in sublist]

    # Get the word count
    word_count = get_word_count(all_tokens)

    # Convert word_count dictionary to a DataFrame
    word_count_df = pd.DataFrame(list(word_count.items()), columns=['Word', 'Count'])

    # Save word count DataFrame to a CSV file
    word_count_df.to_csv(f'word_count_cluster_{i}.csv', index=False)

In [None]:
# Select the posts of a single cluster.
# NOTE(review): `i` is not set in this cell -- presumably it is the value left
# over from an earlier loop; confirm which cluster is intended.
df = doc_clusters[doc_clusters["clusters"] == i]

# The post text lives in the 'English' column
texts = df['English']

# Preprocess the texts
preprocessed_texts = [preprocess_text(item) for item in tqdm(texts, f"preprocess cluster {i}")]

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans, DBSCAN
from kneed import KneeLocator
import seaborn as sns
import matplotlib.pyplot as plt
from tqdm import tqdm
import pickle as pkl
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

def preprocess(df):
    """Standardise the features in ``df`` before KMeans clustering.

    Fits a fresh StandardScaler on ``df`` and returns the transformed data.
    NOTE: this reuses the name of the text-cleaning ``preprocess`` defined
    elsewhere in the notebook and replaces it once this cell has run.
    """
    # df_log = np.log1p(df)
    return StandardScaler().fit_transform(df)

def find_k(df, increment=0, decrement=0, k_values=range(1, 10)):
    """Find the optimum number of KMeans clusters with the elbow method.

    Fits one KMeans model per candidate k, records its inertia (SSE) and asks
    KneeLocator for the knee of the resulting convex, decreasing curve.

    Args:
        df: feature matrix to cluster; it is used as-is (no scaling applied).
        increment: value added to the detected knee.
        decrement: value subtracted from the detected knee.
        k_values: candidate cluster counts to try. Defaults to 1..9.

    Returns:
        The detected knee adjusted by ``increment`` and ``decrement``.

    Raises:
        ValueError: if KneeLocator finds no knee in the inertia curve.
    """
    # df_norm = preprocess(df)
    sse = {}

    for k in tqdm(list(k_values)):
        kmeans = KMeans(n_clusters=k, random_state=1)
        kmeans.fit(df)
        sse[k] = kmeans.inertia_

    kn = KneeLocator(x=list(sse.keys()),
                     y=list(sse.values()),
                     curve='convex',
                     direction='decreasing')
    # kn.knee is None when no knee is detected; fail with a clear message
    # instead of a TypeError from ``None + int``.
    if kn.knee is None:
        raise ValueError("no knee found in the KMeans inertia curve; try a wider range of k")
    return kn.knee + increment - decrement

# Read the cached document embeddings, closing the file once loaded.
with open("embeddings.pkl", "rb") as fh:
    doc_embeddings = pkl.load(fh)
user_clusters = pd.read_csv("Clustered users.csv")
doc_clusters = pd.read_csv("Data_English.csv")

# Elbow-method estimate of the number of document clusters.
num_clusters = find_k(doc_embeddings)
num_clusters

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score
from sklearn.cluster import KMeans, DBSCAN
from sklearn.metrics.pairwise import cosine_similarity
import pickle as pkl
# Read the cached document embeddings, closing the file once loaded.
with open("embeddings.pkl", "rb") as fh:
    doc_embeddings = pkl.load(fh)
# user_clusters = pd.read_csv("Clustered users.csv")
# doc_clusters = pd.read_csv("Data_English.csv")

In [None]:
# Pairwise cosine similarity between all document embeddings.
# NOTE(review): the result is presumably n_docs x n_docs -- check memory for large corpora.
cos_sim = cosine_similarity(doc_embeddings)

In [None]:
# Drop the notebook's reference to the embeddings now that cos_sim has been computed.
del [doc_embeddings]

In [None]:
# KMeans with 3 clusters over the rows of the similarity matrix.
# No random_state is set here, so the labels may differ between runs.
kmeans = KMeans(n_clusters= 3).fit(cos_sim)

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score
from sklearn.cluster import KMeans, DBSCAN
from sklearn.metrics.pairwise import cosine_similarity
import pickle as pkl
# Read the cached document embeddings, closing the file once loaded.
with open("embeddings.pkl", "rb") as fh:
    doc_embeddings = pkl.load(fh)
# user_clusters = pd.read_csv("Clustered users.csv")
# doc_clusters = pd.read_csv("Data_English.csv")
# Density-based clustering of the document embeddings using cosine distance.
# NOTE: this rebinds the notebook-level names `model` and `clusters`.
model = DBSCAN(eps=0.01, min_samples=1000, metric="cosine").fit(doc_embeddings)
clusters = model.labels_

# Project onto the first three principal components for a 3D view.
pca = PCA(n_components=3)
reduced = pca.fit_transform(doc_embeddings)

# Transpose so each row of t holds one principal component (all x values in
# t[0], all y values in t[1], all z values in t[2]).
t = reduced.transpose()

fig = plt.figure()
ax = plt.axes(projection='3d')
ax.scatter3D(t[0, :], t[1, :], t[2, :], c=clusters, cmap="viridis")
plt.show()

In [None]:
# Scale every embedding row to unit Euclidean length before clustering.
X = doc_embeddings
length = np.sqrt((X**2).sum(axis=1))[:,None]  # per-row L2 norm as a column vector
X = X / length

# Cluster the normalised embeddings into 3 groups with a fixed seed.
kmeans = KMeans(n_clusters=3, random_state=0).fit(X)
clusters = kmeans.labels_

# Project onto the first three principal components for a 3D view.
pca = PCA(n_components=3)
reduced = pca.fit_transform(X)

# Transpose so each row of t holds one principal component (all x values in
# t[0], all y values in t[1], all z values in t[2]).
t = reduced.transpose()

fig = plt.figure()
ax = plt.axes(projection='3d')
ax.scatter3D(t[0, :], t[1, :], t[2, :], c=clusters, cmap="viridis")
plt.show()


# Same data projected onto two principal components for a flat scatter plot.
pca = PCA(n_components=2)
reduced = pca.fit_transform(X)

# Transpose so t[0] holds all x values and t[1] all y values.
t = reduced.transpose()

plt.scatter(t[0], t[1], c=clusters)
plt.show()

# Silhouette score of the fitted clustering on the normalised embeddings.
print("silhouette score:", silhouette_score(X, kmeans.predict(X)))