In [1]:
!pip install -q bertopic

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
dask-cudf 21.10.1 requires cupy-cuda114, which is not installed.
dask-cudf 21.10.1 requires dask==2021.09.1, but you have dask 2022.2.0 which is incompatible.
dask-cudf 21.10.1 requires distributed==2021.09.1, but you have distributed 2022.2.0 which is incompatible.[0m


In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import operator
from tqdm import tqdm
tqdm.pandas()

import re
import nltk
from nltk import tokenize
from nltk.stem import WordNetLemmatizer

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.decomposition import NMF
from sklearn.preprocessing import normalize

from bertopic import BERTopic

stopwords = nltk.corpus.stopwords.words('english') + [chr(i) for i in range(ord('a'), ord('z') + 1)]

In [3]:
LEMMATIZER = WordNetLemmatizer()

def get_cleaned_text(text):
    """basic cleaning
        - lower case
        - keep alpha only
        - remove stop words
        - lemmatize?
    """
    text = text.lower()
    text = re.sub("[^a-z]", " ", text)
    text = ' '.join([word for word in text.split() if word not in stopwords])
    text = ' '.join([LEMMATIZER.lemmatize(token) for token in tokenize.word_tokenize(text)])
    
    return text

In [4]:
df = pd.read_csv("../input/nips-papers/papers.csv") # .sample(1000).reset_index(drop=True)

df.head()

Unnamed: 0,id,year,title,event_type,pdf_name,abstract,paper_text
0,1,1987,Self-Organization of Associative Database and ...,,1-self-organization-of-associative-database-an...,Abstract Missing,767\n\nSELF-ORGANIZATION OF ASSOCIATIVE DATABA...
1,10,1987,A Mean Field Theory of Layer IV of Visual Cort...,,10-a-mean-field-theory-of-layer-iv-of-visual-c...,Abstract Missing,683\n\nA MEAN FIELD THEORY OF LAYER IV OF VISU...
2,100,1988,Storing Covariance by the Associative Long-Ter...,,100-storing-covariance-by-the-associative-long...,Abstract Missing,394\n\nSTORING COVARIANCE BY THE ASSOCIATIVE\n...
3,1000,1994,Bayesian Query Construction for Neural Network...,,1000-bayesian-query-construction-for-neural-ne...,Abstract Missing,Bayesian Query Construction for Neural\nNetwor...
4,1001,1994,"Neural Network Ensembles, Cross Validation, an...",,1001-neural-network-ensembles-cross-validation...,Abstract Missing,"Neural Network Ensembles, Cross\nValidation, a..."


In [5]:
# clean text col
df['paper_text'] = df['paper_text'].progress_apply(get_cleaned_text)

100%|██████████| 7241/7241 [05:33<00:00, 21.72it/s]


In [6]:
model = BERTopic(
    nr_topics=10, # 'auto'
    calculate_probabilities=True,
    n_gram_range=(1, 3),
    # top_n_words=15,
    verbose=False,
);

topics, probs = model.fit_transform(df['paper_text'].tolist())

Downloading:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/10.2k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/612 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/349 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/350 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [7]:
topic_freq = model.get_topic_freq().query('Topic != -1').reset_index(drop=True)
topic_freq

Unnamed: 0,Topic,Count
0,0,424
1,1,390
2,2,365
3,3,198
4,4,151
5,5,139
6,6,130
7,7,122
8,8,102
9,9,100


In [8]:
for idx, row in topic_freq.iterrows():
    print(
        f'Topic {idx}:', 
        '-'.join([x[0] for x in model.get_topic(row.Topic)])
    )

Topic 0: image-object-model-feature-using-set-network-training-visual-learning
Topic 1: policy-state-action-learning-function-reward-agent-value-algorithm-game
Topic 2: network-input-weight-neural-function-unit-output-learning-neural network-time
Topic 3: regret-algorithm-bandit-arm-bound-problem-online-log-learning-loss
Topic 4: speech-speaker-signal-recognition-model-network-system-source-training-using
Topic 5: cell-model-neuron-response-visual-stimulus-motion-input-orientation-eye
Topic 6: convex-gradient-algorithm-method-optimization-convergence-function-stochastic-problem-xk
Topic 7: clustering-cluster-algorithm-data-mean-point-matrix-set-problem-number
Topic 8: kernel-learning-method-function-data-space-problem-set-algorithm-xi
Topic 9: matrix-tensor-rank-norm-algorithm-completion-problem-matrix completion-entry-low


In [9]:
# model.visualize_distribution(probs[0])

In [10]:
def infer(model, text):
    
    text = get_cleaned_text(text)
    
    topics, scores = model.find_topics(text)
    ctr = 1
    
    for (topic, score) in zip(topics, scores):
        if topic != -1:
            print(
                f'{ctr}:', 
                ' - '.join(map(lambda x: x[0], model.get_topic(topic))), 
                f'Score: {score}'
            )
            ctr += 1

In [11]:
test_text = """Toxic content is one of the most critical issues for social media platforms today. 
India alone had 518 million social media users in 2020. In order to provide a good experience to content creators and 
their audience, it is crucial to flag toxic comments and the users who post that. But the big challenge is identifying t
oxicity in low resource Indic languages because of the presence of multiple representations of the same text. 
Moreover, the posts/comments on social media do not adhere to a particular format, grammar or sentence structure; 
this makes the task of abuse detection even more challenging for multilingual social media platforms. 
This paper describes the system proposed by team 'Moj Masti' using the data provided by ShareChat/Moj in 
\emph{IIIT-D Multilingual Abusive Comment Identification} challenge. We focus on how we can leverage multilingual 
transformer based pre-trained and fine-tuned models to approach code-mixed/code-switched classification tasks. 
Our best performing system was an ensemble of XLM-RoBERTa and MuRIL which achieved a Mean F-1 score of 0.9 on the 
test data/leaderboard. We also observed an increase in the performance by adding transliterated data. 
Furthermore, using weak metadata, ensembling and some post-processing techniques boosted the performance of our system, 
thereby placing us 1st on the leaderboard."""

print(test_text)

Toxic content is one of the most critical issues for social media platforms today. 
India alone had 518 million social media users in 2020. In order to provide a good experience to content creators and 
their audience, it is crucial to flag toxic comments and the users who post that. But the big challenge is identifying t
oxicity in low resource Indic languages because of the presence of multiple representations of the same text. 
Moreover, the posts/comments on social media do not adhere to a particular format, grammar or sentence structure; 
this makes the task of abuse detection even more challenging for multilingual social media platforms. 
This paper describes the system proposed by team 'Moj Masti' using the data provided by ShareChat/Moj in 
\emph{IIIT-D Multilingual Abusive Comment Identification} challenge. We focus on how we can leverage multilingual 
transformer based pre-trained and fine-tuned models to approach code-mixed/code-switched classification tasks. 
Our best perfo

In [12]:
infer(model, test_text)

1: clustering - cluster - algorithm - data - mean - point - matrix - set - problem - number Score: 0.2621508972226899
2: kernel - learning - method - function - data - space - problem - set - algorithm - xi Score: 0.2459459449369162
3: speech - speaker - signal - recognition - model - network - system - source - training - using Score: 0.2405352668993583
4: network - input - weight - neural - function - unit - output - learning - neural network - time Score: 0.2135627531809199
