In [6]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/ayon/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [35]:
import os
from bertopic import BERTopic
from sklearn.datasets import fetch_20newsgroups

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess

# NLTK
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
stop_words.extend(['from', 'subject', 're', 'edu', 'use'])

# Spacy
import spacy

def make_doc_list(root='teext'):
    data_folder = os.path.join(os.getcwd(), root)
    filepaths = []
    docs = []

    for file in os.listdir(root):
        filepaths.append(os.path.join(data_folder, file))
    
    for filepath in filepaths:
        with open(filepath, 'r', encoding='utf-8') as f:
            sentence = f.read()
            docs.append(gensim.utils.simple_preprocess(sentence, deacc=True))
    
    return docs

def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """https://spacy.io/api/annotation"""
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

In [36]:
docs = make_doc_list()
data_words_nostops = remove_stopwords(docs)
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])
data_lemmatized = lemmatization(data_words_nostops, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])
docs = []

for sentence in data_lemmatized:
    docs.append(" ".join(word for word in sentence))

print(type(docs))

<class 'list'>


In [37]:
print(docs[0])

cooperation shah launch world liquid nano fertiliser contain nitrogen phosphorus reduce fertiliser import dependence also take country sustainable farming increase farmer income bring input cost second variant fertiliser first series launch bottle replace bag traditional ammonium phosphate manufacturing unit establish kalol gujarat paradeep variant liquid fertiliser launch fertiliser commercial sale rs bottle less current price conventional shah say liquid dap spray plant help increase quality quantity production also help conserve soil farmer use liquid liquid increase number earthworm land thus move natural farming reduce production income add contribute lot restore fertility land reduce threat health crore cause chemical fertiliser say minister exhort farmer maximise liquid crore estimate production crore bottle dap replace lakh tonne conventional import lakh tonne lakh tonne dap lakh tonne mop muriate potash lakh tonne npk fertiliser shah note application variant help reduce usage 

In [38]:
topic_model = BERTopic()
# docs = make_doc_list()

topic, probs = topic_model.fit_transform(docs)

print(topic_model.get_topic_info())

   Topic  Count                              Name
0     -1     52            -1_say_joint_also_year
1      0    113      0_say_court_party_government
2      1     37     1_inflation_price_growth_year
3      2     34            2_get_battery_also_new
4      3     25  3_record_temperature_case_degree
5      4     22         4_say_minister_visit_shah
6      5     21         5_marriage_say_rule_right
7      6     16              6_run_ball_team_game
8      7     15     7_indian_sudan_evacuation_say
9      8     15    8_company_business_work_future


In [41]:
topic_model.get_topic(0)

[('say', 0.04650322257491916),
 ('court', 0.03523897343823865),
 ('party', 0.02867961785451503),
 ('government', 0.025850441663979908),
 ('police', 0.02440162370560566),
 ('leader', 0.023958886605963375),
 ('case', 0.023340471672549688),
 ('state', 0.023236106258236702),
 ('people', 0.022007886877554495),
 ('judge', 0.020565828751426762)]

In [40]:
topic_model.get_document_info(docs)

Unnamed: 0,Document,Topic,Name,Top_n_words,Probability,Representative_document
0,cooperation shah launch world liquid nano fert...,-1,-1_say_joint_also_year,say - joint - also - year - wrestler - collage...,0.000000,False
1,convict godhra train burning case grant bail s...,0,0_say_court_party_government,say - court - party - government - police - le...,0.874143,False
2,unit assert grand old party form government po...,0,0_say_court_party_government,say - court - party - government - police - le...,1.000000,False
3,disagreement republican raise government tn de...,1,1_inflation_price_growth_year,inflation - price - growth - year - month - ma...,0.808603,False
4,garage set unveil second fully electric offeri...,2,2_get_battery_also_new,get - battery - also - new - launch - electric...,1.000000,False
...,...,...,...,...,...,...
345,traffic jam common problem face busy urban are...,3,3_record_temperature_case_degree,record - temperature - case - degree - covid -...,0.431339,False
346,follow leader pawar join rule bjp coalition ma...,0,0_say_court_party_government,say - court - party - government - police - le...,1.000000,False
347,boost ongoing repatriation effort operation in...,7,7_indian_sudan_evacuation_say,indian - sudan - evacuation - say - citizen - ...,1.000000,False
348,leader central constituency say take criticism...,0,0_say_court_party_government,say - court - party - government - police - le...,1.000000,False


In [45]:
import json
with open('data.json', 'w', encoding='utf-8') as f:
    json.dump(topic_model.get_topics(), f, ensure_ascii=False, indent=2)