In [32]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf, col, lit, explode, posexplode, row_number, expr, collect_list
from pyspark.sql.types import ArrayType, DoubleType
from FlagEmbedding import BGEM3FlagModel

spark = SparkSession.builder.master('local[2]').config('spark.executor.memory','8g').config('spark.driver.memory','8g').getOrCreate()


In [33]:
import pandas as pd
import numpy as np

df = pd.read_csv('laicite_gd.csv', delimiter=";")
df['source'] = df['screen name']
dx = df[['text','source']]
dx2 = dx.dropna()
print(dx2.count())

def split_string(string, max_length=4096):
    return [string[i:i + max_length] for i in range(0, len(string), max_length)]

dx2['text'] = dx2['text'].apply(split_string)
dx2 = dx2.explode('text').reset_index(drop=True)

sent = dx2['text'].tolist()
s = [str(x) for x in sent]
len(s)


text      9999
source    9999
dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dx2['text'] = dx2['text'].apply(split_string)


10000

In [7]:
model = BGEM3FlagModel('BAAI/bge-m3', use_fp16=False)
embeddings = model.encode(s, return_dense=True,max_length=4096)["dense_vecs"] # upto 8192 tokens
embeddings     

Fetching 30 files:   0%|          | 0/30 [00:00<?, ?it/s]

array([[-0.05966125,  0.01844968, -0.03975787, ...,  0.0197802 ,
        -0.04212459,  0.03838256],
       [-0.03519594,  0.0537153 , -0.02790581, ..., -0.0071175 ,
        -0.0135484 ,  0.02308535],
       [-0.03519594,  0.0537153 , -0.02790581, ..., -0.0071175 ,
        -0.0135484 ,  0.02308535],
       ...,
       [-0.00806239,  0.03040993, -0.048115  , ..., -0.0016513 ,
        -0.01623113,  0.00687234],
       [-0.02088596,  0.01465792, -0.0291551 , ...,  0.03885469,
        -0.05025212, -0.00095249],
       [-0.03519592,  0.05371533, -0.02790579, ..., -0.0071175 ,
        -0.01354843,  0.02308529]], dtype=float32)

In [20]:
model = BGEM3FlagModel('BAAI/bge-m3', use_fp16=False)
sentence_1 = ["Before discussing this approach, let us first understand in brief what yield is."]
sentence_2 = ["The former is used inside a function. Before discussing this approach, looking at differences, let us first understand in brief what yield is."]
sentence_3 = ["The former is used inside a function. Before discussing this approach, let us first recognize the differences."]

embeddings_1 = model.encode(sentence_1)
embeddings_2 = model.encode(sentence_2)
embeddings_3 = model.encode(sentence_3)
similarity = embeddings_1["dense_vecs"] @ embeddings_2["dense_vecs"].T
print(similarity) # 0.884371
similarity = embeddings_1["dense_vecs"] @ embeddings_3["dense_vecs"].T
print(similarity) # 0.9100454


Fetching 30 files:   0%|          | 0/30 [00:00<?, ?it/s]

[[0.83695066]]
[[0.61087024]]


In [27]:
df1 = pd.DataFrame(embeddings)
df1

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1014,1015,1016,1017,1018,1019,1020,1021,1022,1023
0,-0.059661,0.018450,-0.039758,-0.042060,-0.044216,-0.023241,-0.027259,0.020452,0.009933,-0.040778,...,-0.046408,-0.009997,0.039415,0.026511,0.015673,0.006259,0.071344,0.019780,-0.042125,0.038383
1,-0.035196,0.053715,-0.027906,-0.034745,-0.027877,0.007694,0.030240,0.019474,-0.003791,-0.023077,...,0.019346,-0.007953,-0.012474,-0.017535,0.005117,-0.003466,-0.002747,-0.007117,-0.013548,0.023085
2,-0.035196,0.053715,-0.027906,-0.034745,-0.027877,0.007694,0.030240,0.019474,-0.003791,-0.023077,...,0.019346,-0.007953,-0.012474,-0.017535,0.005117,-0.003466,-0.002747,-0.007117,-0.013548,0.023085
3,-0.017894,0.003590,-0.025652,-0.017793,-0.038787,-0.001943,-0.005159,-0.006457,-0.000102,-0.027378,...,0.028466,-0.019472,0.076911,-0.016450,0.009858,-0.015103,0.005643,-0.005126,0.005123,0.017724
4,-0.035196,0.053715,-0.027906,-0.034745,-0.027877,0.007694,0.030240,0.019474,-0.003791,-0.023077,...,0.019346,-0.007953,-0.012474,-0.017535,0.005117,-0.003466,-0.002747,-0.007117,-0.013548,0.023085
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,-0.035196,0.053715,-0.027906,-0.034745,-0.027877,0.007694,0.030240,0.019474,-0.003791,-0.023077,...,0.019346,-0.007953,-0.012474,-0.017535,0.005117,-0.003466,-0.002747,-0.007117,-0.013548,0.023085
9996,0.009110,0.017185,-0.054455,-0.019663,-0.018070,-0.056120,0.045075,-0.065330,-0.018213,-0.047228,...,0.009930,-0.037306,0.032301,0.038416,-0.035205,0.022948,0.050371,-0.011738,-0.034052,0.026258
9997,-0.035196,0.053715,-0.027906,-0.034745,-0.027877,0.007694,0.030240,0.019474,-0.003791,-0.023077,...,0.019346,-0.007952,-0.012474,-0.017535,0.005117,-0.003466,-0.002747,-0.007117,-0.013548,0.023085
9998,-0.002061,0.022459,-0.043841,0.016776,-0.048276,-0.044551,0.012426,0.045399,0.004209,-0.018274,...,-0.002792,0.001135,-0.072303,-0.001034,-0.036395,0.012274,0.027492,0.010574,-0.050224,-0.019702


In [35]:
from pyspark.sql.functions import udf, monotonically_increasing_id
from pyspark.sql.functions import col, lit, explode, posexplode, row_number, expr, collect_list
from pyspark.sql.types import ArrayType, DoubleType, StringType
from pyspark.sql.window import Window
from datetime import datetime

#model = BGEM3FlagModel('BAAI/bge-m3', use_fp16=False)
from sentence_transformers import SentenceTransformer
model = SentenceTransformer("vonjack/bge-m3-gguf", device="cpu")

model_broadcast = spark.sparkContext.broadcast(model)

def cosine_similarity(vec1, vec2):
    vec1 = np.array(vec1)
    vec2 = np.array(vec2)
    cos_sim = np.dot(vec1, vec2) / (np.linalg.norm(vec1) * np.linalg.norm(vec2))
    return float(cos_sim)

def embedding(text):
    model = model_broadcast.value
    dense_vecs = model.encode(text, return_dense=True, max_length=4096)["dense_vecs"]
    return [float(x) for x in dense_vecs]

embedding_udf = udf(embedding, ArrayType(DoubleType()))
cosinesim_udf = udf(cosine_similarity, DoubleType())

start=datetime.today()

dfx = spark.createDataFrame([(string,) for string in s], ["text"])
dfx = dfx.withColumn("rid", monotonically_increasing_id())
df = dfx.withColumn("vec", embedding_udf("text")).drop('text')

df.cache()

df.show()

print(f'\n\n{datetime.today()} - elapsed {datetime.today()-start}\n')


No sentence-transformers model found with name vonjack/bge-m3-gguf. Creating a new one with mean pooling.


OSError: vonjack/bge-m3-gguf does not appear to have a file named config.json. Checkout 'https://huggingface.co/vonjack/bge-m3-gguf/tree/main' for available files.

In [6]:
start=datetime.today()

df2 = df.withColumnRenamed("rid", "rid2").withColumnRenamed("vec", "vec2")

df3 = df.join(df2, df.rid < df2.rid2).withColumn("distance", 1.0 - cosinesim_udf(df.vec, df2.vec2)) \
        .select(col("rid").alias('rid1'), "rid2", "distance")

df3.show(truncate=False)

print(f'\n\n{datetime.today()} - elapsed {datetime.today()-start}\n')


+----+--------------------+
|rid2|                vec2|
+----+--------------------+
|   0|[-0.0596612505614...|
|   1|[-0.0351958908140...|
|   2|[-0.0351958908140...|
|   3|[-0.0178942587226...|
|   4|[-0.0351958908140...|
|   5|[0.00349273602478...|
|   6|[0.00826073810458...|
|   7|[-0.0064715268090...|
|   8|[-0.0351958908140...|
|   9|[-0.0351958908140...|
|  10|[-0.0351958908140...|
|  11|[-0.0351958908140...|
|  12|[-0.0351958908140...|
|  13|[-0.0351958908140...|
|  14|[-0.0109594613313...|
|  15|[0.00754652032628...|
|  16|[0.00774556165561...|
|  17|[-0.0456585250794...|
|  18|[-0.0351958908140...|
|  19|[0.03130273148417...|
+----+--------------------+
only showing top 20 rows

+----+----+-------------------+
|rid1|rid2|distance           |
+----+----+-------------------+
|0   |1   |0.6610146266968993 |
|0   |2   |0.6610146266968993 |
|0   |3   |0.5549171952369145 |
|0   |4   |0.6610146266968993 |
|0   |5   |0.4842509179948584 |
|0   |6   |0.45500587648684154|
|0   |7   |0.4

In [25]:
#df3.filter(df5.rid+2 == df5.rid2).show(truncate=False)
#df3.filter(df3.distance<0.3).show(truncate=False)
dfx.count()

10000

In [31]:
dfx.join(df3, (df3.rid1==0) & (df3.rid2==dfx.rid) & (df3.distance<0.3)).select('rid','text').show(10,truncate=False)

import genieclust
vecs = df.select('vec').rdd.map(lambda row: row['vec']).collect()
vecs_array = np.array(vecs)

k = genieclust.Genie(n_clusters=500, cast_float32=True, gini_threshold=0.2, affinity="cosinesimil", \
                       exact=False, compute_all_cuts=True, compute_full_tree=True) \
          .fit_predict(vecs_array)
print(k.shape)
k

24/07/20 19:50:54 WARN TaskSetManager: Stage 89 contains a task of very large size (1358 KiB). The maximum recommended task size is 1000 KiB.


+----+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|rid |text                                                                                                                                                                                                                                                                                                                                                                                 |
+----+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

                                                                                

(501, 10000)


array([[  0,   0,   0, ...,   0,   0,   0],
       [  0,   0,   0, ...,   0,   0,   0],
       [  0,   1,   1, ...,   1,   0,   1],
       ...,
       [  0,   1,   1, ...,   1, 353,   1],
       [  0,   1,   1, ...,   1, 353,   1],
       [  0,   1,   1, ...,   1, 353,   1]])

In [None]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.clustering import KMeans
from pyspark.sql.functions import udf
from pyspark.sql.types import FloatType, ArrayType

# Convert distance matrix to DataFrame format suitable for Spark MLlib
def create_distance_matrix_df(distance_matrix):
    num_rows = len(distance_matrix)
    rows = []
    for i in range(num_rows):
        row = Vectors.dense(distance_matrix[i])
        rows.append((i, row))
    return spark.createDataFrame(rows, ["id", "features"])

# Convert the distance matrix into a DataFrame
distance_df = create_distance_matrix_df(distance_matrix)

# Apply KMeans clustering
kmeans = KMeans(k=5, seed=1)  # Adjust `k` as needed
model = kmeans.fit(distance_df)
predictions = model.transform(distance_df)

# Show cluster centers and assignments
print("Cluster Centers:")
centers = model.clusterCenters()
for center in centers:
    print(center)

print("Cluster Assignments:")
predictions.show()


In [53]:
import genieclust

# n_clusters=500
embeddings1, embeddings2 = split(enbeddings) # how to split embeddings in 2
k1 = genieclust.Genie(n_clusters=500, cast_float32=True, gini_threshold=0.2, affinity="cosinesimil", \
                       exact=False, compute_all_cuts=True, compute_full_tree=True) \
          .fit_predict(embeddings1)
k2 = genieclust.Genie(n_clusters=500, cast_float32=True, gini_threshold=0.2, affinity="cosinesimil", \
                       exact=False, compute_all_cuts=True, compute_full_tree=True) \
          .fit_predict(embeddings2)
k = merge(k1, k2)




(501, 10000)
(10000, 1024)


array([[  0,   0,   0, ...,   0,   0,   0],
       [  0,   0,   0, ...,   0,   0,   0],
       [  0,   1,   1, ...,   1,   0,   1],
       ...,
       [  0,   1,   1, ...,   1, 164,   1],
       [  0,   1,   1, ...,   1, 164,   1],
       [  0,   1,   1, ...,   1, 164,   1]])

In [9]:
import genieclust
k = genieclust.Genie(n_clusters=10, cast_float32=True, gini_threshold=0.2, affinity="cosinesimil", \
                       exact=False, compute_all_cuts=True, compute_full_tree=True) \
          .fit_predict(embeddings)
print(k.shape)
print(embeddings.shape)
k

(11, 40)
(40, 1024)


array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0,
        0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1],
       [0, 1, 1, 2, 1, 2, 2, 2, 1, 1, 1, 1, 1, 1, 2, 0, 2, 0, 1, 0, 1, 0,
        0, 2, 1, 2, 1, 2, 1, 1, 1, 1, 1, 1, 2, 2, 1, 2, 0, 1],
       [0, 1, 1, 2, 1, 3, 3, 3, 1, 1, 1, 1, 1, 1, 2, 0, 3, 0, 1, 0, 1, 0,
        0, 3, 1, 2, 1, 3, 1, 1, 1, 1, 1, 1, 3, 3, 1, 3, 0, 1],
       [0, 1, 1, 2, 1, 3, 3, 3, 1, 1, 1, 1, 1, 1, 2, 4, 3, 0, 1, 0, 1, 0,
        0, 3, 1, 2, 1, 3, 1, 1, 1, 1, 1, 1, 3, 3, 1, 3, 4, 1],
       [0, 1, 1, 2, 1, 3, 3, 3, 1, 1, 1, 1, 1, 1, 2, 4, 3, 0, 1, 0, 1, 0,
        0, 3, 1, 2, 5, 3, 1, 1, 1, 1, 1, 1, 3, 3, 1, 3, 4, 1],
       [0, 1, 1, 2, 1, 3, 3, 3, 1, 1, 1, 

In [52]:
h = k[[20, 50, 100, 200, 500], :]
print(h.shape)
h = np.transpose(h)
print(h.shape)
h = pd.DataFrame(h, columns=['k20', 'k50', 'k100', 'k200', 'k500'])
#h = h.drop_duplicates() # why ?
h

(5, 10000)
(10000, 5)


Unnamed: 0,k20,k50,k100,k200,k500
0,0,0,0,0,0
1,1,1,1,1,1
2,1,1,1,1,1
3,2,2,2,2,2
4,1,1,1,1,1
...,...,...,...,...,...
9995,1,1,1,1,1
9996,14,21,48,57,68
9997,1,1,1,1,1
9998,3,5,79,117,360


In [34]:
#dx2['k20'] = k20[50, :]
dx2['k2'] = k20[5, :]

s20 = dx2.groupby('k2').apply(lambda x: x.sample(n=7, replace=False) if len(x) >= 7 else x).reset_index(drop=True)
grouped_df = s20.groupby('k2')['text'].apply(' . '.join).reset_index()
grouped_df = grouped_df.rename(columns={'text': 'content'})
grouped_df

Unnamed: 0,k2,content
0,0,Pour utiliser des listes dans un dictionnaire ...
1,1,NMSLIB started as a personal project of Bilegs...
2,2,pated in earlier evaluations. The most success...
3,3,lude a modification of the VP-tree due to Boyt...
4,4,"z et al. (2013) and improved by David Novak, a..."


In [None]:
def summarize(df):
    from llama_cpp import Llama
    llmc = Llama.from_pretrained(
        repo_id="failspy/Phi-3-mini-128k-instruct-abliterated-v3-GGUF",
        filename="Phi-3-mini-128k-instruct-abliterated-v3_q4.gguf",
        verbose=False, 
        n_ctx=4096,
        #n_gpu_layers=-1, # move all to GPU
        n_gpu_layers=0, # use CPU
    )
    content = df.iloc[0]['text']
    prompt = f"""<|system|>You are a helpful assistant.<|end|>
               <|user|>Summarize in french the following list of sentences in a short paragraph. Here is the list of sentence to summarize :{content}<|end|>
               <|assistant|>"""
    output = llmc(prompt, max_tokens=2048, stop=["<|endoftext|>"])
                # max_tokens=-1, echo=False, temperature=0.2, top_p=0.1)
    return pd.DataFrame({'summary': [output['choices'][0]['text']]})

# create summaries via Spark
summaries = (df
                .limit(1)
                .groupby('content')
                .applyInPandas(summarize, schema='summary string')
                .show(vertical=True, truncate=False)
            )

In [39]:
from llama_cpp import Llama
from tqdm import tqdm

llm = Llama.from_pretrained(
    repo_id="failspy/Phi-3-mini-128k-instruct-abliterated-v3-GGUF",
    filename="Phi-3-mini-128k-instruct-abliterated-v3_q4.gguf",
    verbose=False, 
    n_ctx=4096,
    n_gpu_layers=-1,
)

def process_content(content):
    prompt = f"""<|system|>You are a helpful assistant.<|end|>
               <|user|>Summarize in french the following list of sentences in a short paragraph. Here is the list of sentence to summarize :{content}<|end|>
               <|assistant|>"""

    output = llm(prompt, max_tokens=2048, stop=["<|endoftext|>"])
    res = output["choices"][0]["text"].strip()
    return res

results = []
for content in tqdm(grouped_df['content'], desc="Summarizing"):  
    summary = process_content(content)
    results.append(summary)

dz = pd.DataFrame(results, columns=['Description'])

dz['k5'] = dz.index

m1 = pd.merge(h, dz, on='k5')
m1 = m1[['k2','k5','Description']]
m1 = m1.drop_duplicates()

gdf1 = m1.groupby('k2')['Description'].apply(' . '.join).reset_index()
gdf1 = gdf1.rename(columns={'text': 'content'})
gdf1


Summarizing: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:26<00:00,  5.34s/it]


Unnamed: 0,k2,Description
0,0,Pour créer un DataFrame pandas à partir d'un d...
1,1,Il semble que nous ayons un texte en anglais q...


In [44]:

spark.createDataFrame(gdf1).show(truncate=False)


+---+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [None]:

results = []
for content in tqdm(gdf1['Description'], desc="Summarizing"):  
    summary = process_content(content)
    results.append(summary)

dz2 = pd.DataFrame(results, columns=['Description'])
dz.to_csv('dz1.csv', index=False)
dz2.to_csv('dz2.csv', index=False)
h.to_csv('h.csv', index=False)

In [None]:
import pandas as pd
import numpy as np

df = pd.read_csv('/home/me/Téléchargements/laicite_gd.csv', delimiter=";")
df['source'] = df['screen name']
dx = df[['text','source']]
dx2 = dx.dropna()

def split_string(string, max_length=4096):
    return [string[i:i + max_length] for i in range(0, len(string), max_length)]

dx2['text'] = dx2['text'].apply(split_string)
dx2 = dx2.explode('text').reset_index(drop=True)

sent = dx2['text'].tolist()
s = [str(x) for x in sent]


In [1]:
from FlagEmbedding import BGEM3FlagModel

model = BGEM3FlagModel('BAAI/bge-m3', use_fp16=True)
df['source'] = df['screen name']
dx = df[['text','source']]
dx2 = dx.dropna()

def split_string(string, max_length=4096):
    return [string[i:i + max_length] for i in range(0, len(string), max_length)]

dx2['text'] = dx2['text'].apply(split_string)
dx2 = dx2.explode('text').reset_index(drop=True)
sent = dx2['text'].tolist()
s = [str(x) for x in sent]

embeddings = model.encode(s, return_dense=True,max_length=4096)
embeddings = embeddings["dense_vecs"]
df1 = pd.DataFrame(embeddings)



ModuleNotFoundError: No module named 'FlagEmbedding'

In [2]:
import genieclust

k20 = genieclust.Genie(n_clusters=500, cast_float32=True, gini_threshold=0.2, affinity="cosinesimil", exact=False, compute_all_cuts=True, compute_full_tree=True).fit_predict(embeddings)


h = k20[[20, 50, 100, 200, 500], :]
h = np.transpose(h)
h = pd.DataFrame(h, columns=['k20', 'k50', 'k100', 'k200', 'k500'])
h = h.drop_duplicates()


dx2['k20'] = k20[50, :]

s20 = dx2.groupby('k20').apply(lambda x: x.sample(n=7, replace=False) if len(x) >= 7 else x).reset_index(drop=True)
grouped_df = s20.groupby('k20')['text'].apply(' . '.join).reset_index()
grouped_df = grouped_df.rename(columns={'text': 'content'})



ModuleNotFoundError: No module named 'genieclust'

In [None]:
from llama_cpp import Llama
from tqdm import tqdm

llm = Llama.from_pretrained(
    repo_id="failspy/Phi-3-mini-128k-instruct-abliterated-v3-GGUF",
    filename="Phi-3-mini-128k-instruct-abliterated-v3_q4.gguf",
    verbose=False, 
    n_ctx=4096,
    n_gpu_layers=-1,
)

def process_content(content):
    prompt = f"""<|system|>You are a helpful assistant.<|end|>
               <|user|>Summarize in french the following list of sentences in a short paragraph. Here is the list of sentence to summarize :{content}<|end|>
               <|assistant|>"""

    output = llm(prompt, max_tokens=2048, stop=["<|endoftext|>"])
    res = output["choices"][0]["text"].strip()
    return res

results = []
for content in tqdm(grouped_df['content'], desc="Summarizing"):  
    summary = process_content(content)
    results.append(summary)

dz = pd.DataFrame(results, columns=['Description'])

dz['k50'] = dz.index

m1 = pd.merge(h, dz, on='k50')
m1 = m1[['k20','k50','Description']]
m1 = m1.drop_duplicates()

gdf1 = m1.groupby('k20')['Description'].apply(' . '.join).reset_index()
gdf1 = gdf1.rename(columns={'text': 'content'})


results = []
for content in tqdm(gdf1['Description'], desc="Summarizing"):  
    summary = process_content(content)
    results.append(summary)

dz2 = pd.DataFrame(results, columns=['Description'])



dz.to_csv('dz1.csv', index=False)
dz2.to_csv('dz2.csv', index=False)
h.to_csv('h.csv', index=False)