In [1]:
import pandas as pd

In [2]:
import pandas as pd

df = pd.read_csv("whastsapp_chat.csv", header=None, names=["raw"])

In [3]:
df

Unnamed: 0,raw
0,"30/11/20, 6:57?pm - Messages and calls are end..."
1,"30/11/20, 6:57?pm - Mohanraj: <Media omitted>"
2,"04/06/21, 11:00?am - Mohanraj: Hi"
3,"04/06/21, 11:00?am - Mohanraj: Bro sathish na ..."
4,"04/06/21, 12:10?pm - Sathish Lap: Ama pa"
...,...
108,Indha brands lam konjam Nala irukum
109,"14/10/25, 6:24?pm - Sathish Lap: Send pq"
110,"14/10/25, 6:27?pm - Sathish Lap: Yes"
111,"14/10/25, 6:27?pm - Sathish Lap: This one branded"


In [5]:
import re
pattern = r"^(\d{2}\/\d{2}\/\d{2}),\s*(\d{1,2}:\d{2}.*?m)\s*-\s*([^:]+):\s*(.*)"

rows = []

for line in df["raw"]:
    m = re.match(pattern, line, flags=re.IGNORECASE)
    if m:
        date, time, sender, msg = m.groups()
        rows.append([date, time, sender.strip(), msg.strip()])

clean_df = pd.DataFrame(rows, columns=["Date", "Time", "Sender", "Message"])

print(clean_df.head(10))
print("Total messages:", len(clean_df))

       Date      Time       Sender                                Message
0  30/11/20   6:57?pm     Mohanraj                        <Media omitted>
1  04/06/21  11:00?am     Mohanraj                                     Hi
2  04/06/21  11:00?am     Mohanraj                  Bro sathish na nengaa
3  04/06/21  12:10?pm  Sathish Lap                                 Ama pa
4  04/06/21  12:10?pm  Sathish Lap                                  Nenga
5  04/06/21  12:29?pm     Mohanraj               Morning call pannan broo
6  04/06/21  12:56?pm  Sathish Lap                                  Ok ok
7  04/06/21  12:57?pm  Sathish Lap  Coming thuesday ...mrng call pannunga
8  04/06/21  12:57?pm     Mohanraj                                     ??
9  07/06/21  12:58?pm     Mohanraj       Nallaiki varuvingalaa kumbakonam
Total messages: 89


In [7]:
clean_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 89 entries, 0 to 88
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   Date     89 non-null     object
 1   Time     89 non-null     object
 2   Sender   89 non-null     object
 3   Message  89 non-null     object
dtypes: object(4)
memory usage: 2.9+ KB


In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer


In [10]:
tfidf=TfidfVectorizer(max_df=0.95,min_df=2,stop_words='english')
dtm=tfidf.fit_transform(clean_df["Message"])


In [11]:
from sklearn.decomposition import NMF
nmf_model=NMF(n_components=5,random_state=42)
nmf_model.fit(dtm)

In [12]:
for index,topic in enumerate(nmf_model.components_):
    results=([tfidf.get_feature_names_out()[i] for i in topic.argsort()[-10:]])
    print(results)

['amd', 'la', 'pa', 'varuven', 'naa', 'kadai', 'open', 'brands', 'media', 'omitted']
['ku', 'solra', 'tha', 'la', 'lap', 'open', 'send', 'kadai', 'evolo', 'bro']
['la', 'lap', 'tha', 'ku', 'solra', 'keyboard', 'model', 'mrng', 'broo', 'ok']
['kadai', 'mrng', 'broo', 'model', 'keyboard', 'open', 'intel', 'amd', 'motherboard', 'rs']
['naa', 'kadai', 'broo', 'mrng', 'model', 'keyboard', 'open', 'sata', '5inch', 'ssd']


In [13]:
topic_results=nmf_model.transform(dtm)

In [16]:
clean_df["Topic"]=topic_results.argmax(axis=1)

In [17]:
clean_df

Unnamed: 0,Date,Time,Sender,Message,Topic
0,30/11/20,6:57?pm,Mohanraj,<Media omitted>,0
1,04/06/21,11:00?am,Mohanraj,Hi,0
2,04/06/21,11:00?am,Mohanraj,Bro sathish na nengaa,1
3,04/06/21,12:10?pm,Sathish Lap,Ama pa,4
4,04/06/21,12:10?pm,Sathish Lap,Nenga,0
...,...,...,...,...,...
84,14/10/25,6:25?pm,Mohanraj,Brands:,1
85,14/10/25,6:24?pm,Sathish Lap,Send pq,1
86,14/10/25,6:27?pm,Sathish Lap,Yes,0
87,14/10/25,6:27?pm,Sathish Lap,This one branded,0
