In [1]:
import nltk

In [2]:
import pandas as pd
import numpy as np

data = pd.read_csv('../input/av-hacks/train.csv')
data.head()

Unnamed: 0,unique_hash,drug,sentiment,text
0,2e180be4c9214c1f5ab51fd8cc32bc80c9f612e0,gilenya,2,autoimmune diseases tend to come in clusters. ...
1,9eba8f80e7e20f3a2f48685530748fbfa95943e4,gilenya,2,i can completely understand why you would want...
2,fe809672251f6bd0d986e00380f48d047c7e7b76,fingolimod,2,interesting that it only targets s1p - 1 / 5 r...
3,bd22104dfa9ec80db4099523e03fae7a52735eb6,ocrevus,2,"very interesting , grand merci. now i wonder ..."
4,b227688381f9b25e5b65109dd00f7f895e838249,gilenya,1,"hi everybody , my latest mri results for brai..."


In [3]:
!pip install cucco

Collecting cucco
[?25l  Downloading https://files.pythonhosted.org/packages/76/a4/885b87757c0dabb87b6fa7f7ea214f33c03d6159aa4bcd7d7024f07839e5/cucco-2.2.1.tar.gz (51kB)
[K     |████████████████████████████████| 51kB 3.9MB/s 
[?25hBuilding wheels for collected packages: cucco
  Building wheel for cucco (setup.py) ... [?25l- \ done
[?25h  Stored in directory: /tmp/.cache/pip/wheels/1b/9f/88/4601c19912235677fd8fa8a2958cae82dcc63a1d8672633e29
Successfully built cucco
Installing collected packages: cucco
Successfully installed cucco-2.2.1


In [4]:
import cucco
from cucco import Cucco

norm_en = Cucco()

def normalise(row):
    ''' Performs text normalisation for multiple languages. Removes stopwords,punctuation etc.'''
    
    text = row['text']
    sents = nltk.sent_tokenize(text)
    rules = ['remove_stop_words', 'replace_punctuation', 'remove_extra_whitespaces']
    norm_text = ' '.join([norm_en.normalize(sent,rules) for sent in sents])
    
    return norm_text

In [5]:

data['text'] = data.apply(normalise,axis=1)

In [6]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import spacy

In [7]:
nlp = spacy.load('en_core_web_sm')
cv = CountVectorizer(max_df=0.95,min_df=2,stop_words='english')
term_matrix = cv.fit_transform(data['text'])
print(term_matrix)

  (0, 21903)	1
  (0, 15631)	1
  (0, 17455)	1
  (0, 7596)	1
  (0, 6165)	1
  (0, 11201)	1
  (0, 6156)	1
  (0, 22099)	1
  (0, 13421)	1
  (0, 21407)	1
  (0, 5100)	1
  (0, 6890)	1
  (0, 19708)	1
  (0, 2285)	1
  (0, 20755)	1
  (0, 19795)	1
  (0, 7568)	1
  (0, 20226)	1
  (0, 21599)	1
  (0, 4530)	1
  (0, 9258)	1
  (0, 8273)	2
  (0, 9122)	1
  (0, 4962)	1
  (0, 19969)	1
  :	:
  (5278, 6130)	1
  (5278, 13154)	1
  (5278, 15994)	1
  (5278, 20112)	1
  (5278, 13985)	1
  (5278, 9581)	1
  (5278, 17519)	1
  (5278, 14516)	1
  (5278, 13153)	1
  (5278, 5583)	1
  (5278, 14247)	1
  (5278, 6039)	1
  (5278, 16149)	1
  (5278, 13510)	2
  (5278, 19219)	1
  (5278, 22103)	1
  (5278, 9678)	1
  (5278, 17005)	1
  (5278, 14365)	1
  (5278, 15186)	1
  (5278, 4827)	1
  (5278, 7088)	2
  (5278, 19795)	1
  (5278, 9258)	1
  (5278, 19969)	1


In [8]:
lda = LatentDirichletAllocation(n_components=5)
lda.fit(term_matrix)

len(lda.components_)
print(lda.components_.shape)

(5, 22233)


In [9]:
print(lda.components_)

[[ 38.46065252 167.64559662   1.36158083 ...   0.20000001   2.85466912
    0.20041186]
 [ 10.31586945  60.71878136   0.2000006  ...   0.20000002   0.20055915
    0.20000015]
 [ 33.33358364  82.69113299   0.20000055 ...   0.20000002   0.20000033
    0.20000014]
 [  0.20295818   6.77537267   0.20000054 ...   0.20101518   0.20000032
    0.20000014]
 [ 19.68693621  59.16911636   1.03841749 ...   2.19898478   3.54477108
    3.19958771]]


In [10]:

print(len(lda.components_[0]))

22233


In [11]:
topic = lda.components_[0]
top_words_indices = topic.argsort()[-10:]
for index in top_words_indices:
    print(cv.get_feature_names()[index])

health
sclerosis
people
drug
multiple
study
disease
treatment
patients
ms


In [12]:
topic_word_dict = {}
for index,topic in enumerate(lda.components_):
    words = [cv.get_feature_names()[i] for i in topic.argsort()[-10:]]
    topic_word_dict[index] = words
    print('Top words for topic {}'.format(index))
    print(words)
    print('-'*120)

Top words for topic 0
['health', 'sclerosis', 'people', 'drug', 'multiple', 'study', 'disease', 'treatment', 'patients', 'ms']
------------------------------------------------------------------------------------------------------------------------
Top words for topic 1
['months', 'brain', 'egfr', 'tarceva', 'nsclc', 'chemo', 'stage', 'treatment', 'lung', 'cancer']
------------------------------------------------------------------------------------------------------------------------
Top words for topic 2
['ocrevus', 'people', 'things', 'work', 'life', 'feel', 'years', 'day', 'good', 'time']
------------------------------------------------------------------------------------------------------------------------
Top words for topic 3
['dose', 'disease', 'symptoms', 'crohn', 'blood', 'remicade', 'humira', 'pain', 'effects', 'doctor']
------------------------------------------------------------------------------------------------------------------------
Top words for topic 4
['tumor', 'stud

In [13]:
topics = lda.transform(term_matrix)
data['topic'] = topics.argmax(axis=1)

In [14]:
def assign_topics(row):
    topic = row['topic']
    words = topic_word_dict[topic]

    return words

In [15]:
data['topic words'] = data.apply(assign_topics,axis=1)
print(data.head())

                                unique_hash                        ...                                                                topic words
0  2e180be4c9214c1f5ab51fd8cc32bc80c9f612e0                        ...                          [ocrevus, people, things, work, life, feel, ye...
1  9eba8f80e7e20f3a2f48685530748fbfa95943e4                        ...                          [health, sclerosis, people, drug, multiple, st...
2  fe809672251f6bd0d986e00380f48d047c7e7b76                        ...                          [tumor, study, cell, clinical, disease, cells,...
3  bd22104dfa9ec80db4099523e03fae7a52735eb6                        ...                          [health, sclerosis, people, drug, multiple, st...
4  b227688381f9b25e5b65109dd00f7f895e838249                        ...                          [health, sclerosis, people, drug, multiple, st...

[5 rows x 6 columns]


In [16]:
data.head()

Unnamed: 0,unique_hash,drug,sentiment,text,topic,topic words
0,2e180be4c9214c1f5ab51fd8cc32bc80c9f612e0,gilenya,2,autoimmune diseases tend clusters gilenya f...,2,"[ocrevus, people, things, work, life, feel, ye..."
1,9eba8f80e7e20f3a2f48685530748fbfa95943e4,gilenya,2,completely understand it results reported le...,0,"[health, sclerosis, people, drug, multiple, st..."
2,fe809672251f6bd0d986e00380f48d047c7e7b76,fingolimod,2,interesting targets s1p 1 5 receptors 1 5 f...,4,"[tumor, study, cell, clinical, disease, cells,..."
3,bd22104dfa9ec80db4099523e03fae7a52735eb6,ocrevus,2,interesting grand merci lemtrada ocrevus sal...,0,"[health, sclerosis, people, drug, multiple, st..."
4,b227688381f9b25e5b65109dd00f7f895e838249,gilenya,1,latest mri results brain cervical cord neuro...,0,"[health, sclerosis, people, drug, multiple, st..."


In [17]:
print(data['text'][4])
print('-'*120)
print(data['topic'][4])
print('-'*120)
print(topic_word_dict[data['topic'][4]])
print('-'*120)

  latest mri results brain cervical cord neurologist appointment couple weeks  lesions brain  cord relapses gilenya good sign line cervical cord review concerned me      lesions c2  3 t2 show hypointensity post gadolinium t1 images only represent artifact early axonal loss  bothersome read kind symptoms c2  c3 lesion aware   result change dmt   thanks
------------------------------------------------------------------------------------------------------------------------
0
------------------------------------------------------------------------------------------------------------------------
['health', 'sclerosis', 'people', 'drug', 'multiple', 'study', 'disease', 'treatment', 'patients', 'ms']
------------------------------------------------------------------------------------------------------------------------


In [18]:
!pip install bert-serving-client

Collecting bert-serving-client
  Downloading https://files.pythonhosted.org/packages/b5/2f/dd50af5b8dbde79e69f4bd2edec222eaa23d1015b03e9613411b78f9a639/bert_serving_client-1.9.6-py2.py3-none-any.whl
Installing collected packages: bert-serving-client
Successfully installed bert-serving-client-1.9.6


In [19]:
import pickle
import numpy as np
import pandas as pd
from langdetect import detect,detect_langs
from nltk import sent_tokenize
from bert_serving.client import BertClient
from sklearn.cluster import KMeans

In [20]:
!pip install bert-serving-server

Collecting bert-serving-server
[?25l  Downloading https://files.pythonhosted.org/packages/af/ef/0df9a6ce54a02d0a891d25af60e49b3a3a64d425e80c28acfee97f5ab0f2/bert_serving_server-1.9.6-py3-none-any.whl (61kB)
[K     |████████████████████████████████| 61kB 5.1MB/s 
Collecting GPUtil>=1.3.0 (from bert-serving-server)
  Downloading https://files.pythonhosted.org/packages/ed/0e/5c61eedde9f6c87713e89d794f01e378cfd9565847d4576fa627d758c554/GPUtil-1.4.0.tar.gz
Building wheels for collected packages: GPUtil
  Building wheel for GPUtil (setup.py) ... [?25l- \ done
[?25h  Stored in directory: /tmp/.cache/pip/wheels/3d/77/07/80562de4bb0786e5ea186911a2c831fdd0018bda69beab71fd
Successfully built GPUtil
Installing collected packages: GPUtil, bert-serving-server
Successfully installed GPUtil-1.4.0 bert-serving-server-1.9.6


In [21]:
import socket
socket.gethostbyname(socket.gethostname())

'172.19.1.2'

In [22]:
import subprocess

In [23]:
bert_command = 'bert-serving-start -model_dir ../input/pretrained-bert-including-scripts/uncased_l-12_h-768_a-12/uncased_L-12_H-768_A-12 -num_worker=4'
process = subprocess.Popen(bert_command.split(), stdout=subprocess.PIPE)

In [24]:
bert = BertClient(check_length=False)
def get_embeddings(row):
    '''Generates bert sentence embeddings.Bert server is open in the terminal.'''
    
    text = row['text']
    sents = sent_tokenize(text)
    embeddings = bert.encode(sents)
    return embeddings

data['Embeddings'] = data.apply(get_embeddings,axis=1)

In [25]:
def get_cluster_centers(row):
    ''' 
    Performs clustering of sentences in the text. Number of clusters or the number of required sentences in summary
    is the square root of total sentences in the text.Returns cluster centers.
    '''
    
    text = row['text']
    sents = sent_tokenize(text)
    clusters = int(np.ceil(len(sents)**0.5))
    embeddings = row['Embeddings']
    kmeans = KMeans(n_clusters=clusters).fit(embeddings)
    
    return kmeans.cluster_centers_

data['Cluster Centers'] = data.apply(get_cluster_centers,axis=1)

In [26]:
def get_summary(row):
    '''
    Generates summary by choosing the sentences in the text that are closest to the centroid.
    '''
    text = row['text']
    sents = sent_tokenize(text)
    centroids = row['Cluster Centers']
    embeddings = row['Embeddings']
    clusters = centroids.shape[0]
    sents_len = len(sents)
    summary = []
    for i in range(clusters):
        select = -1
        m = -np.inf
        for j in range(sents_len):
            similarity = np.dot(centroids[i],embeddings[j])
            if similarity > m:
                m = similarity
                select = j
        summary.append(select)
    summary.sort()
    summary = ''.join([sents[i] for i in summary])
    return summary
    
data['Summary'] = data.apply(get_summary,axis=1)

In [27]:
print(data['text'][1])
print('-'*120)
print(data['Summary'][1])

completely understand it   results reported lectures stand scrutiny peer  review publication convincing hope work   do aware happy risks   great important present balanced   understand move straight show promise animal study drugs humans lot animal data gather   human data gather safe effective times animal studies follow humans   major attrition points drug development unpredictability issues cladribine  gilenya   interaction predicted people   doctors patterns work on clemastine  metformin exciting   current condition personal risk tolerance makes sense it everyone
------------------------------------------------------------------------------------------------------------------------
completely understand it   results reported lectures stand scrutiny peer  review publication convincing hope work   do aware happy risks   great important present balanced   understand move straight show promise animal study drugs humans lot animal data gather   human data gather safe effective times ani

In [28]:
data.to_csv("train_summary.csv")