In [11]:
import pandas as pd
import numpy as np
import re
pd.set_option("display.max_rows", None, "display.max_columns", None, 'display.max_colwidth', None)

In [67]:
df = pd.read_csv("202001_preprocessed.csv")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 55185 entries, 0 to 55184
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   PartitionKey     55185 non-null  int64  
 1   UserHostAddress  55185 non-null  object 
 2   WhatText         55185 non-null  object 
 3   CategoryID       39621 non-null  float64
 4   Postcode         55185 non-null  int64  
dtypes: float64(1), int64(2), object(2)
memory usage: 2.1+ MB


In [68]:
# remove punctuation
import string
def remove_punctuation(text):
    punctuationfree = "".join([i for i in text if i not in string.punctuation])
    return punctuationfree
#storing the puntuation free text

df['WhatText'] = df['WhatText'].str.replace("/", " ")
df['WhatText'] = df['WhatText'].str.replace("_", " ")
df['no_punc']= df['WhatText'].apply(lambda x:remove_punctuation(x))

In [69]:
# word tokenization
import nltk
from nltk.tokenize import word_tokenize
df['tokenized_sents'] = df.apply(lambda row: nltk.word_tokenize(row['no_punc']), axis=1)

In [71]:
#Stop words present in the library
stopwords = nltk.corpus.stopwords.words('english')
#defining the function to remove stopwords from tokenized text
def remove_stopwords(text):
    output = [i for i in text if i not in stopwords]
    return output
#applying the function
df['no_stopwords']= df['tokenized_sents'].apply(lambda x:remove_stopwords(x))

In [76]:
import gensim
# create N-grams
def make_n_grams(texts):
    bigram = gensim.models.Phrases(texts, min_count = 1, threshold=1)  # higher threshold fewer phrases.
    bigram_mod = gensim.models.phrases.Phraser(bigram)
    
    trigram = gensim.models.Phrases(bigram[texts], threshold=1)
    trigram_mod = gensim.models.phrases.Phraser(trigram)
    
    bigrams_text = [bigram_mod[doc] for doc in texts]
    trigrams_text =  [trigram_mod[bigram_mod[doc]] for doc in bigrams_text]
    return trigrams_text

tokens_reviews = make_n_grams(df["no_stopwords"])
df['n_gram'] = tokens_reviews

In [80]:
from nltk.stem import WordNetLemmatizer

wordnet_lemmatizer = WordNetLemmatizer()

#defining the function for lemmatization
def lemmatizer(text):
    lemm_text = [wordnet_lemmatizer.lemmatize(word) for word in text]
    return lemm_text
df['text_ready'] = df['n_gram'].apply(lambda x:lemmatizer(x))

In [100]:
from gsdmm import MovieGroupProcess
#pip install git+https://github.com/rwalk/gsdmm.git

# Gibbs Samping Dirichlet Multinomial Mixture Model (GSDMM) in Short-Text Clustering
mgp = MovieGroupProcess(K = 122, alpha=0.01, beta=0.01, n_iters=40)
vocab = set(x for text in df["text_ready"] for x in text)
n_terms = len(vocab)
model = mgp.fit(df["text_ready"], n_terms)

In stage 0: transferred 53516 clusters with 122 clusters populated
In stage 1: transferred 46922 clusters with 122 clusters populated
In stage 2: transferred 41425 clusters with 122 clusters populated
In stage 3: transferred 38643 clusters with 122 clusters populated
In stage 4: transferred 36264 clusters with 122 clusters populated
In stage 5: transferred 33352 clusters with 122 clusters populated
In stage 6: transferred 30850 clusters with 122 clusters populated
In stage 7: transferred 29162 clusters with 122 clusters populated
In stage 8: transferred 27750 clusters with 122 clusters populated
In stage 9: transferred 26492 clusters with 122 clusters populated
In stage 10: transferred 25547 clusters with 122 clusters populated
In stage 11: transferred 24823 clusters with 122 clusters populated
In stage 12: transferred 23902 clusters with 122 clusters populated
In stage 13: transferred 23265 clusters with 122 clusters populated
In stage 14: transferred 22712 clusters with 122 clusters 

In [101]:
def top_words(cluster_word_distribution, top_cluster, values):
    for cluster in top_cluster:
        sort_dicts =sorted(mgp.cluster_word_distribution[cluster].items(), key=lambda k: k[1], reverse=True)[:values]
        print("\nCluster %s : %s"%(cluster,sort_dicts))
        
doc_count = np.array(mgp.cluster_doc_count)
print('Number of documents per topic :', doc_count)

# topics sorted by the number of document they are allocated to
top_index = doc_count.argsort()[-10:][::-1]
print('\nMost important clusters (by number of docs inside):', top_index)
# show the top 5 words in term frequency for each cluster 
top_words(mgp.cluster_word_distribution, top_index, 10)

Number of documents per topic : [  89   89   57  220   77  132  108   81   50   55   50   52  194  109
  752   43   55 1872  300   86   57   80  197   27   96   75  491  800
  998   49 9806  241   32  117  195  245 1319   85   86 2376 1961   85
   88   71  126  132  128  234   88   68   23 1911 1576  123   68  487
  114  103  113   65   39   75  805  222   15   87   78  116   65   77
   91   64   39  196  106  109   48   56   73 3307  147   68   34   93
  206 7274 2039  591   54   84  159  105   79   75   53   89   77  107
   46  122 4058  112   47   57   96  161  217   63   92  367  149   74
   47  111 1315   53   85   71  293  765  138  237]

Most important clusters (by number of docs inside): [ 30  85 100  79  39  86  40  51  17  52]

Cluster 30 : [('others', 1276), ('seniors_clubs_social_groups', 739), ('state_primary_high_schools', 701), ('event', 696), ('crisis_emergency_accommodation', 690), ('community_clubs_interest_groups', 667), ('sports_clubs', 640), ('drug_alcohol_services

In [83]:
df.head()

Unnamed: 0,PartitionKey,UserHostAddress,WhatText,CategoryID,Postcode,no_punc,tokenized_sents,no_stopwords,n_gram,text_ready
0,20200101,2001:8004:13c1:ba59:2917:4cd9:5ca3:a5f1,general health services,80.0,4215,general health services,"[general, health, services]","[general, health, services]","[general, health, services]","[general, health, service]"
1,20200101,106.71.190.191,christmas fireworks,,4020,christmas fireworks,"[christmas, fireworks]","[christmas, fireworks]",[christmas_fireworks],[christmas_fireworks]
2,20200101,122.148.129.117,general welfare & support services,119.0,4215,general welfare support services,"[general, welfare, support, services]","[general, welfare, support, services]","[general, welfare_support_services]","[general, welfare_support_services]"
3,20200101,120.22.167.94,others,,4000,others,[others],[others],[others],[others]
4,20200101,2001:8004:1080:133e:386e:9184:a6ec:f67c,bunbury acute psychiatric unit,,4215,bunbury acute psychiatric unit,"[bunbury, acute, psychiatric, unit]","[bunbury, acute, psychiatric, unit]",[bunbury_acute_psychiatric_unit],[bunbury_acute_psychiatric_unit]


In [4]:
# Drop google bot IP
df = df.drop(df[df.UserHostAddress.str.contains("66.249")].index)
print(df.shape)

(34590, 5)


In [24]:
df['Postcode'] = df['Postcode'].astype("str")
df = df[~df.Postcode.str.startswith('8')]
df = df[~df.Postcode.str.startswith('7')]
df = df[~df.Postcode.str.startswith('6')]
df = df[~df.Postcode.str.startswith('5')]
df = df[~df.Postcode.str.startswith('3')]
df = df[~df.Postcode.str.startswith('2')]
print(df["Postcode"].unique())

['4670' '4165' '4218' '4650' '4020' '4505' '4306' '4158' '4215' '4213'
 '4509' '4510' '4350' '4551' '4000' '4075' '4160' '4285' '4680' '4109'
 '4110' '4217' '4209' '4068' '4740' '4655' '4305' '4568' '4114' '4700'
 '4310' '4810' '4119' '4019' '4035' '4671' '4507' '4226' '4301' '4615'
 '4825' '4500' '4118' '4878' '4220' '4011' '4491' '4101' '4455' '4157'
 '4344' '4054' '4059' '4078' '4005' '4501' '4122' '4106' '4132' '4205'
 '4216' '4129' '4212' '4006' '4032' '4870' '4133' '4735' '4502' '4405'
 '4822' '4610' '4030' '4221' '4127' '4823' '4566' '4022' '4730' '4214'
 '4061' '4124' '4300' '4128' '4102' '4077' '4108' '4814' '4559' '4860'
 '4480' '4207' '4570' '4211' '4311' '4164' '4413' '4031' '4850' '4508'
 '4370' '4341' '4053' '4506' '4660' '4504' '4021' '4178' '4169' '4390'
 '4503' '4380' '4829' '4115' '4131' '4558' '4737' '4519' '4573' '4820'
 '4069' '4352' '4343' '4854' '4575' '4474' '4064' '4560' '4309' '4304'
 '4725' '4223' '4174' '4580' '4736' '4163' '4487' '4880' '4340' '4817'
 '4017