In [1]:
!pip install --upgrade pyLDAvis
!pip install pandas==1.5.3

Collecting pyLDAvis
  Downloading pyLDAvis-3.4.1-py3-none-any.whl.metadata (4.2 kB)
Collecting funcy (from pyLDAvis)
  Downloading funcy-2.0-py2.py3-none-any.whl.metadata (5.9 kB)
Downloading pyLDAvis-3.4.1-py3-none-any.whl (2.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.6/2.6 MB[0m [31m15.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading funcy-2.0-py2.py3-none-any.whl (30 kB)
Installing collected packages: funcy, pyLDAvis
Successfully installed funcy-2.0 pyLDAvis-3.4.1
Collecting pandas==1.5.3
  Downloading pandas-1.5.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Downloading pandas-1.5.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (12.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.1/12.1 MB[0m [31m25.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pandas
  Attempting uninstall: pandas
    Found existing installation: pandas 2.2.2
    Uninstalling pandas-2.2.2:
     

## <font color="blue">Imports</font>

In [2]:
#data transformations
import os
import re
import numpy as np
import pandas as pd
from pprint import pprint

# nlp
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel, LdaModel
from gensim.test.utils import datapath
import nltk

# spacy for lemmatization
import spacy

# Plotting tools
import pyLDAvis
import pyLDAvis.gensim
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go

%matplotlib inline

# Enable logging for gensim - optional
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

In [3]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

## <font color="blue">Functions</font>

In [4]:
def sent_to_words(sentences):
    """Lazily tokenize each item of ``sentences`` into a list of lowercase tokens.

    Every item is coerced to ``str`` first, then handed to gensim's
    ``simple_preprocess``. This is a generator: nothing is tokenized until the
    result is iterated.
    """
    for raw_sentence in sentences:
        as_text = str(raw_sentence)
        # deacc=True strips accent marks; punctuation is already dropped by the
        # tokenizer itself.
        tokens = gensim.utils.simple_preprocess(as_text, deacc=True)
        yield tokens

def remove_stopwords(texts):
    """Return ``texts`` with stop words filtered out of every document.

    The module-level ``stop_words`` list supplies the words to drop, except
    for a small keep-list ("as", "is", "no", "only") that is always retained.
    The global list is only read, never modified, so calling this function has
    no effect on other users of ``stop_words``.

    Each document is converted with ``str()`` and re-tokenized with
    ``simple_preprocess`` before filtering, exactly as before.
    """
    keep_list = ("as", "is", "no", "only")
    # Build the effective stop-word set once: O(1) membership tests, and the
    # shared list is left untouched.
    active_stop_words = set(stop_words).difference(keep_list)
    return [
        [word for word in simple_preprocess(str(doc)) if word not in active_stop_words]
        for doc in texts
    ]

def make_bigrams(texts):
    """Apply the module-level ``bigram_mod`` phrase model to every document."""
    merged_docs = []
    for doc in texts:
        merged_docs.append(bigram_mod[doc])
    return merged_docs

def make_trigrams(texts):
    """Apply ``bigram_mod`` and then ``trigram_mod`` to every document."""
    merged_docs = []
    for doc in texts:
        with_bigrams = bigram_mod[doc]
        merged_docs.append(trigram_mod[with_bigrams])
    return merged_docs

def lemmatization(texts, allowed_postags=('NOUN', 'ADJ', 'VERB', 'ADV'), allowed_words=()):
    """Lemmatize tokenized documents, keeping only selected tokens.

    Args:
        texts: iterable of token lists; each list is joined with spaces and
            parsed by the module-level spaCy pipeline ``nlp``.
        allowed_postags: coarse POS tags (``token.pos_``) to keep.
        allowed_words: token texts to keep regardless of their POS tag.

    Returns:
        A list with one list of lemmas per input document. A token is kept
        when its POS tag is allowed, its text is in ``allowed_words``, or its
        text contains an underscore (i.e. it is a joined phrase).
    """
    # Defaults are immutable tuples so they cannot be altered between calls.
    joined_docs = (" ".join(sent) for sent in texts)
    texts_out = []
    # nlp.pipe processes the documents as a stream in batches, which is much
    # cheaper than one nlp(...) call per document.
    for doc in nlp.pipe(joined_docs):
        # keep words in allowed list and if they are bigrams
        texts_out.append([
            token.lemma_
            for token in doc
            if (token.pos_ in allowed_postags)
            or (str(token) in allowed_words)
            or ("_" in str(token))
        ])
    return texts_out

In [5]:
# Compute c_v coherence for various number of topics

def compute_coherence_values(dictionary, corpus, texts, limit, start=2, step=3):
    """Train one LDA model per topic count and score each with c_v coherence.

    Args:
        dictionary: gensim id-to-word mapping used both to train every model
            and to compute its coherence.
        corpus: bag-of-words corpus to train on.
        texts: tokenized documents used by the coherence measure.
        limit: exclusive upper bound of the topic-count range.
        start: first topic count to try.
        step: increment between topic counts.

    Returns:
        ``(model_list, coherence_values)``: two parallel lists ordered like
        ``range(start, limit, step)``.
    """
    coherence_values = []
    model_list = []
    for num_topics in range(start, limit, step):
        # Train with the `dictionary` argument. Previously the global
        # `id2word` was used here, so the parameter was silently ignored.
        model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=dictionary,
                                           num_topics=num_topics,
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)
        model_list.append(model)
        coherencemodel = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v')
        coherence_values.append(coherencemodel.get_coherence())

    return model_list, coherence_values

In [6]:
def format_topics_sentences(ldamodel, corpus, texts):
  """Build a per-document table of dominant topic, its weight and keywords.

  Args:
      ldamodel: trained LDA model; indexing it with ``corpus`` must yield one
          result per document and ``show_topic(n)`` must return
          ``(word, weight)`` pairs.
      corpus: bag-of-words corpus, one entry per document.
      texts: original documents, appended as the last column.

  Returns:
      DataFrame with columns ``Dominant_Topic``, ``Perc_Contribution``,
      ``Topic_Keywords`` plus the text column (named ``0`` because the text
      Series is unnamed). Row ``i`` describes document ``i``.
  """
  keywords_by_topic = {}  # topic id -> "word1, word2, ..." (computed once per topic)
  records = []

  # Get main topic in each document
  for row in ldamodel[corpus]:
      # With per_word_topics=True the model returns a tuple whose first item
      # is the topic distribution; otherwise the distribution itself.
      topic_dist = row[0] if isinstance(row, tuple) else row
      if not topic_dist:
          # Keep a placeholder so rows stay aligned with `texts`.
          records.append((float('nan'), float('nan'), ""))
          continue

      # Dominant topic = highest probability (first one wins on ties).
      topic_num, prop_topic = max(topic_dist, key=lambda pair: pair[1])
      topic_num = int(topic_num)
      if topic_num not in keywords_by_topic:
          wp = ldamodel.show_topic(topic_num)
          keywords_by_topic[topic_num] = ", ".join([word for word, prop in wp])
      records.append((topic_num, round(float(prop_topic), 4), keywords_by_topic[topic_num]))

  # Build the frame once instead of appending row by row (DataFrame.append is
  # quadratic and no longer exists in pandas 2.x).
  sent_topics_df = pd.DataFrame(records, columns=['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords'])

  # Add original text to the end of the output
  contents = pd.Series(texts)
  sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)
  return(sent_topics_df)

## <font color="blue">Local & Constants</font>

In [7]:
# Load the listings CSV through Colab's upload dialog.

# (An earlier note here named Py2_Input.csv as the expected upload.)


from google.colab import files
import pandas as pd

# Silence every warning category for the rest of the session
import warnings
warnings.filterwarnings('ignore')

# Prompt user to upload a file
uploaded = files.upload()

# Assuming the uploaded file is a CSV, you can read it into a DataFrame
# Use the first uploaded file's name (iterating `uploaded` yields the names)
file_name = next(iter(uploaded))

# Load the dataset into a DataFrame
df = pd.read_csv(file_name)
#df = df.sample(n=500, random_state=42)
# Preview the first rows as plain text
print(df.head())


Saving output (7).csv to output (7).csv
               line        city state_code  postal_code  property_id  \
0      3705 7th Ave  Birmingham         AL        35224  M7358062309   
1     1148 1st St N  Birmingham         AL        35204  M7067072160   
2  1229 15th Way SW  Birmingham         AL        35211  M7396479920   
3     426 80th St S  Birmingham         AL        35206  M8264613223   
4  914 Knoxville Pl  Birmingham         AL        35224  M7604129328   

                                         rdc_web_url  \
0  https://www.realtor.com/realestateandhomes-det...   
1  https://www.realtor.com/realestateandhomes-det...   
2  https://www.realtor.com/realestateandhomes-det...   
3  https://www.realtor.com/realestateandhomes-det...   
4  https://www.realtor.com/realestateandhomes-det...   

                                         Description  \
0  Calling All Investors!! Home sold AS IS This i...   
1  Solid investment property with great bones and...   
2  Investment property

In [8]:
# Keep an independent snapshot of the raw upload for the classification
# section; a plain `df1 = df` would only alias the same object.
df1 = df.copy()


In [9]:
# prompt: Make the df as dataframe  - This is needed...

import pandas as pd
df = pd.DataFrame(df)
df

Unnamed: 0,line,city,state_code,postal_code,property_id,rdc_web_url,Description,norm_desc,human_label
0,3705 7th Ave,Birmingham,AL,35224,M7358062309,https://www.realtor.com/realestateandhomes-det...,Calling All Investors!! Home sold AS IS This i...,call investors home sell nice split level home...,distressed
1,1148 1st St N,Birmingham,AL,35204,M7067072160,https://www.realtor.com/realestateandhomes-det...,Solid investment property with great bones and...,solid investment property great bone fantastic...,distressed
2,1229 15th Way SW,Birmingham,AL,35211,M7396479920,https://www.realtor.com/realestateandhomes-det...,Investment property currently rented at $795 p...,investment property currently rent 795 per mon...,distressed
3,426 80th St S,Birmingham,AL,35206,M8264613223,https://www.realtor.com/realestateandhomes-det...,This 4 sides brick home is the ideal investmen...,4 side brick home ideal investment property wh...,distressed
4,914 Knoxville Pl,Birmingham,AL,35224,M7604129328,https://www.realtor.com/realestateandhomes-det...,Don't miss out on this four sided brick home! ...,dont miss four side brick home home would grea...,distressed
...,...,...,...,...,...,...,...,...,...
12632,141 Lansing St,Madison,WI,53714,M7936597020,https://www.realtor.com/realestateandhomes-det...,back on market. Classic 1940's cape cod waitin...,back market classic 1940s cape cod wait bring ...,distressed
12633,305 S Walbridge Ave,Madison,WI,53714,M7730824116,https://www.realtor.com/realestateandhomes-det...,Don't miss out on this opportunity! Solid home...,dont miss opportunity solid home great neighbo...,distressed
12634,706 Meadowlark Dr,Madison,WI,53714,M7590974999,https://www.realtor.com/realestateandhomes-det...,"Run, don't walk to this one owner, custom buil...",run dont walk one owner custom build ranch des...,distressed
12635,721 Redland Dr,Madison,WI,53714,M8668798258,https://www.realtor.com/realestateandhomes-det...,Opportunity awaits! With some TLC this 2 bedro...,opportunity await tlc 2 bedroom 2 bathroom eas...,distressed


In [10]:
# Give the text columns descriptive names in one pass. Keys that are not
# present in the frame are skipped silently by DataFrame.rename.
df = df.rename(columns={
    'Description': 'original_description',
    'norm_desc': 'normalized_description',
    'label': 'keyword_label',
})
df

Unnamed: 0,line,city,state_code,postal_code,property_id,rdc_web_url,original_description,normalized_description,human_label
0,3705 7th Ave,Birmingham,AL,35224,M7358062309,https://www.realtor.com/realestateandhomes-det...,Calling All Investors!! Home sold AS IS This i...,call investors home sell nice split level home...,distressed
1,1148 1st St N,Birmingham,AL,35204,M7067072160,https://www.realtor.com/realestateandhomes-det...,Solid investment property with great bones and...,solid investment property great bone fantastic...,distressed
2,1229 15th Way SW,Birmingham,AL,35211,M7396479920,https://www.realtor.com/realestateandhomes-det...,Investment property currently rented at $795 p...,investment property currently rent 795 per mon...,distressed
3,426 80th St S,Birmingham,AL,35206,M8264613223,https://www.realtor.com/realestateandhomes-det...,This 4 sides brick home is the ideal investmen...,4 side brick home ideal investment property wh...,distressed
4,914 Knoxville Pl,Birmingham,AL,35224,M7604129328,https://www.realtor.com/realestateandhomes-det...,Don't miss out on this four sided brick home! ...,dont miss four side brick home home would grea...,distressed
...,...,...,...,...,...,...,...,...,...
12632,141 Lansing St,Madison,WI,53714,M7936597020,https://www.realtor.com/realestateandhomes-det...,back on market. Classic 1940's cape cod waitin...,back market classic 1940s cape cod wait bring ...,distressed
12633,305 S Walbridge Ave,Madison,WI,53714,M7730824116,https://www.realtor.com/realestateandhomes-det...,Don't miss out on this opportunity! Solid home...,dont miss opportunity solid home great neighbo...,distressed
12634,706 Meadowlark Dr,Madison,WI,53714,M7590974999,https://www.realtor.com/realestateandhomes-det...,"Run, don't walk to this one owner, custom buil...",run dont walk one owner custom build ranch des...,distressed
12635,721 Redland Dr,Madison,WI,53714,M8668798258,https://www.realtor.com/realestateandhomes-det...,Opportunity awaits! With some TLC this 2 bedro...,opportunity await tlc 2 bedroom 2 bathroom eas...,distressed


## <font color="blue">Transformations</font>

In [11]:
# set list of text
text = df.original_description.values.tolist()
text[:10]

['Calling All Investors!! Home sold AS IS This is a nice split level home corned lot home. Great investment property! Just need some TLC.',
 'Solid investment property with great bones and fantastic location. All brick home with tons of space. Fix and flip or add to your rental properties. You will not want to let this one get away! Bedrooms are large, living room and separate den/living area. Spacious kitchen and separate dining area.',
 'Investment property currently rented at $795 per month. Recently renovated and professionally managed. Terrific street. Step right in to cashflow! Lease runs until January. 48hr notice required for showings.',
 'This 4 sides brick home is the ideal investment property. Whether you are new to the investment world or you are a seasoned veteran, this property is cash flowing with a tenant in place. No immediate expenditures necessary. The market in this area is hot and ripe! So run your numbers and take advantage of this deal today!',
 "Don't miss out o

In [12]:
# prepare stop words
stop_words = nltk.corpus.stopwords.words('english')
stop_words.extend([]) # add stop words here

In [13]:
# tokenize and clean up text
data_words = list(sent_to_words(text))
print(data_words[:1])

[['calling', 'all', 'investors', 'home', 'sold', 'as', 'is', 'this', 'is', 'nice', 'split', 'level', 'home', 'corned', 'lot', 'home', 'great', 'investment', 'property', 'just', 'need', 'some', 'tlc']]


In [14]:
# Build the bigram and trigram models
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=30) # higher threshold fewer phrases.
trigram = gensim.models.Phrases(bigram[data_words], threshold=10)

# Faster way to get a sentence clubbed as a trigram/bigram
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

# See bigram example (apply the bigram model once; applying it to its own
# output, as before, could merge already-joined pairs again)
print(bigram_mod[data_words[0]])

# See trigram example
print(trigram_mod[bigram_mod[data_words[0]]])

['calling', 'all', 'investors', 'home', 'sold_as', 'is', 'this', 'is', 'nice', 'split', 'level', 'home', 'corned', 'lot', 'home', 'great', 'investment', 'property', 'just', 'need', 'some_tlc']
['calling_all', 'investors', 'home', 'sold_as_is', 'this', 'is', 'nice', 'split_level', 'home', 'corned', 'lot', 'home', 'great_investment', 'property', 'just', 'need_some_tlc']


In [15]:
# Remove Stop Words
data_words_nostops = remove_stopwords(data_words)

# Form Bigrams
data_words_bigrams = make_bigrams(data_words_nostops)

# Load the small English spaCy model without the dependency parser and the
# named-entity recognizer: lemmatization only reads POS tags and lemmas, so
# skipping those two components saves time on every document.
# (install the model with: python3 -m spacy download en_core_web_sm)
nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])

# Do lemmatization keeping only noun, adj, vb, adv
data_lemmatized = lemmatization(data_words_bigrams, allowed_words=['tlc', 'rental', 'cashflow', 'rehab'])

print(data_lemmatized[:1])

[['call', 'investor', 'home', 'sold_as', 'nice', 'split', 'level', 'home', 'corn', 'lot', 'home', 'great', 'investment', 'property', 'need', 'tlc']]


In [16]:
data_lemmatized

[['call',
  'investor',
  'home',
  'sold_as',
  'nice',
  'split',
  'level',
  'home',
  'corn',
  'lot',
  'home',
  'great',
  'investment',
  'property',
  'need',
  'tlc'],
 ['solid',
  'investment',
  'property',
  'great',
  'bone',
  'fantastic',
  'location',
  'brick',
  'home',
  'ton',
  'space',
  'fix_flip',
  'add',
  'rental_propertie',
  'want',
  'let',
  'get',
  'bedroom',
  'large',
  'living',
  'room',
  'separate',
  'den',
  'living',
  'area',
  'spacious',
  'kitchen',
  'separate',
  'dining',
  'area'],
 ['investment',
  'property',
  'currently_rente',
  'per_month',
  'recently',
  'renovate',
  'professionally_manage',
  'terrific',
  'street',
  'step',
  'right',
  'cashflow',
  'lease',
  'run',
  'hr_notice',
  'require',
  'showing'],
 ['side',
  'brick',
  'home',
  'ideal',
  'investment',
  'property',
  'new',
  'investment',
  'world',
  'season',
  'veteran',
  'property',
  'cash_flowe',
  'tenant',
  'place',
  'immediate',
  'expenditure',

In [17]:
# view_example
idx = 20
print("ORIGINAL:")
print(df['original_description'].iloc[idx])
print("NO STOP WORDS:")
print(' '.join(data_words_nostops[idx]))
print("BIGRAM:")
print(' '.join(data_words_bigrams[idx]))
print("LEMMATIZE:")
print(' '.join(data_lemmatized[idx]))

ORIGINAL:
Great one level home at end of dead end street. Perfect for family home, empty nester, or investor. Hardwood flooring in Living Room & Dining Room. Nice sized bedrooms, plus a bonus room and covered porch. Property sold as-is. No sight unseen offer will be considered and all inspections should be done prior to submitting an offer.
NO STOP WORDS:
great one level home end dead end street perfect family home empty nester investor hardwood flooring living room dining room nice sized bedrooms plus bonus room covered porch property sold as is no sight unseen offer considered inspections done prior submitting offer
BIGRAM:
great one level home end dead_end street perfect family home empty nester investor hardwood flooring living room dining room nice sized bedrooms plus bonus room covered porch property sold_as is no sight_unseen offer considered inspections done prior submitting offer
LEMMATIZE:
great level home end dead_end street perfect family home empty nester investor hardwood

In [18]:
# Map every lemma to an integer id
id2word = corpora.Dictionary(data_lemmatized)

# The lemmatized documents are the training texts
texts = data_lemmatized

# Bag-of-words corpus: one list of (term_id, frequency) pairs per document
corpus = [id2word.doc2bow(doc) for doc in texts]

# Human readable view of the first document: (term, frequency) pairs
[[(id2word[term_id], freq) for term_id, freq in bow] for bow in corpus[:1]]

[[('call', 1),
  ('corn', 1),
  ('great', 1),
  ('home', 3),
  ('investment', 1),
  ('investor', 1),
  ('level', 1),
  ('lot', 1),
  ('need', 1),
  ('nice', 1),
  ('property', 1),
  ('sold_as', 1),
  ('split', 1),
  ('tlc', 1)]]

#### <font color="purple">Topic Modeling: LDA</font>

In [19]:
# Build LDA model
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=4,
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)

In [20]:
# Print the Keyword in the topics
pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]

[(0,
  '0.039*"buyer" + 0.028*"seller" + 0.027*"offer" + 0.023*"pm" + '
  '0.019*"verify" + 0.017*"due" + 0.017*"obtain" + 0.016*"property" + '
  '0.016*"multiple_offer" + 0.012*"agent"'),
 (1,
  '0.036*"room" + 0.034*"bedroom" + 0.029*"home" + 0.024*"large" + '
  '0.018*"kitchen" + 0.018*"bath" + 0.016*"space" + 0.014*"living" + '
  '0.012*"area" + 0.012*"yard"'),
 (2,
  '0.058*"home" + 0.032*"great" + 0.016*"lot" + 0.015*"property" + '
  '0.014*"close" + 0.014*"location" + 0.012*"make" + 0.011*"potential" + '
  '0.010*"downtown" + 0.010*"bath"'),
 (3,
  '0.123*"new" + 0.038*"update" + 0.029*"floor" + 0.023*"roof" + '
  '0.021*"window" + 0.020*"kitchen" + 0.019*"appliance" + 0.017*"paint" + '
  '0.016*"home" + 0.015*"bathroom"')]


In [21]:
# Compute Perplexity
# NOTE: per the gensim docs, log_perplexity returns the per-word likelihood
# bound (a log-scale value, negative here), not the perplexity itself;
# perplexity is 2**(-bound). For the printed bound, higher (closer to zero)
# is better.
print('\nPerplexity: ', lda_model.log_perplexity(corpus))  # prints the per-word likelihood bound

# Compute Coherence Score (c_v) of the same model on the lemmatized texts
coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Perplexity:  -6.835746133224248

Coherence Score:  0.5415260122688559


In [22]:
# Can take a long time to run.
limit=5
step=1
%time model_list, coherence_values = compute_coherence_values(dictionary=id2word, corpus=corpus, texts=data_lemmatized, start=2, limit=limit, step=step)

CPU times: user 1min 58s, sys: 1.18 s, total: 1min 59s
Wall time: 1min 59s


In [23]:
x = list(range(2, limit, step))
fig = go.Figure()
fig.add_trace(go.Scatter(x=x, y=coherence_values))
fig.update_layout(title='Choose Optimal Model with Coherence Scores',
                   xaxis_title='Num Topics',
                   yaxis_title='Coherence Score')
fig.show()

In [24]:
# Print the coherence scores
for m, cv in zip(x, coherence_values):
    print("Num Topics =", m, " has Coherence Value of", round(cv, 4))

Num Topics = 2  has Coherence Value of 0.5285
Num Topics = 3  has Coherence Value of 0.4328
Num Topics = 4  has Coherence Value of 0.5415


In [25]:
import os
from google.colab import drive

# Check if the drive is already mounted
if not os.path.ismount('/content/drive'):
    drive.mount('/content/drive')
else:
    print("Google Drive is already mounted.")

# Define project path
project_path = '/content/drive/My Drive/my_project_Final'

# Check if the directory exists and create if not
if not os.path.exists(project_path):
  os.makedirs(project_path)
  print(f"Created directory: {project_path}")
else:
  print(f"Directory already exists: {project_path}")

# Change to the project directory
os.chdir(project_path)

# Verify the current working directory
print(os.getcwd())


ValueError: mount failed

In [None]:
# Select the model and print the topics.
# model_list is ordered like range(2, limit, step) from the coherence sweep,
# so look up the position of the wanted topic count instead of hard-coding an
# index that silently changes meaning when limit/step change.
N_TOPICS = 4
optimal_model = model_list[list(range(2, limit, step)).index(N_TOPICS)]
# Save model to disk, then reload it to confirm the saved file is usable.
temp_file = '/content/drive/My Drive/my_project_Final/lda_model.model'
optimal_model.save(temp_file)
optimal_model = LdaModel.load(temp_file)
# show topics
model_topics = optimal_model.show_topics(formatted=False)
pprint(optimal_model.print_topics(num_words=10))

In [None]:
import os
from google.colab import drive
import pyLDAvis
# pyLDAvis 3.3+ ships its gensim adapter as `pyLDAvis.gensim_models`; older
# releases expose it as `pyLDAvis.gensim`. Use whichever one is installed.
try:
    import pyLDAvis.gensim_models as gensimvis
except ImportError:
    import pyLDAvis.gensim as gensimvis

# Visualize the topics
pyLDAvis.enable_notebook()
vis = gensimvis.prepare(optimal_model, corpus, id2word)

# Ensure the directory exists before saving the file
file_path = '/content/drive/My Drive/my_project_Final/lda_vis_topics.html'
file_dir = os.path.dirname(file_path)
os.makedirs(file_dir, exist_ok=True)

# Save the visualization as an HTML file
pyLDAvis.save_html(vis, file_path)

vis


In [None]:
df_topic_sents_keywords = format_topics_sentences(ldamodel=optimal_model, corpus=corpus, texts=text)

# Format
df_dominant_topic = df_topic_sents_keywords.reset_index()
df_dominant_topic.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text']

# Show
df_dominant_topic.head(10)

In [None]:
# Check the structure of df_topic_sents_keywords
print(df_topic_sents_keywords.head())
print(df_topic_sents_keywords.columns)
print(df_topic_sents_keywords.shape)


In [None]:
# Recompute the per-document dominant-topic table.
df_topic_sents_keywords = format_topics_sentences(ldamodel=optimal_model, corpus=corpus, texts=text)

# Check the structure of the DataFrame before renaming
print(df_topic_sents_keywords.head())
print(df_topic_sents_keywords.columns)
print(df_topic_sents_keywords.shape)

# Reset the index WITHOUT adding it as a column (drop=True).
# NOTE(review): format_topics_sentences returns 4 columns (three named columns
# plus the text), and drop=True adds none, so the 5-name list below never
# matches: the `else` branch runs and the original column names are kept. A
# later cell groups by 'Topic_Keywords', which only exists while this rename
# is skipped. Confirm the intended behaviour before changing either side.
df_dominant_topic = df_topic_sents_keywords.reset_index(drop=True)

# Get the existing column names
existing_columns = df_dominant_topic.columns.tolist()

# Define the desired new column names
new_columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text']

# Rename only when the number of names matches the number of columns
if len(existing_columns) == len(new_columns):
    # Assign the new column names
    df_dominant_topic.columns = new_columns
else:
    # Report the mismatch and leave the column names unchanged
    print(f"Error: Number of new columns ({len(new_columns)}) does not match the number of existing columns ({len(existing_columns)})")
    print(f"Existing columns: {existing_columns}")

# Show the top 10 rows
df_dominant_topic.head(10)

In [None]:
# One row per dominant topic with its keyword string, exported to Excel.

import pandas as pd

# Read from df_topic_sents_keywords, which always carries the
# 'Dominant_Topic' / 'Topic_Keywords' columns; df_dominant_topic's column
# names depend on which formatting cell ran last. drop_duplicates keeps the
# keywords as plain strings (groupby(...).unique() produced arrays, which
# Excel would receive as array reprs).
dominant_topic_df = (
    df_topic_sents_keywords[['Dominant_Topic', 'Topic_Keywords']]
    .drop_duplicates()
    .sort_values('Dominant_Topic')
    .reset_index(drop=True)
)


# Export the DataFrame to an Excel file.
dominant_topic_df.to_excel('dominant_topics.xlsx', index=False)

In [None]:
dominant_topic_df

In [None]:
# Most representative document per topic: the single row with the highest
# Perc_Contribution in each Dominant_Topic group.
# Sort once and take the first row of every group instead of growing a
# DataFrame with concat inside a loop.
sent_topics_sorteddf = (
    df_topic_sents_keywords
    .sort_values('Perc_Contribution', ascending=False)
    .groupby('Dominant_Topic')
    .head(1)
    .sort_values('Dominant_Topic')
    .reset_index(drop=True)
)

# Format
sent_topics_sorteddf.columns = ['Topic_Num', "Topic_Perc_Contrib", "Keywords", "Text"]

# Show
sent_topics_sorteddf

In [None]:
sent_topics_sorteddf.loc[sent_topics_sorteddf['Topic_Num'] == 2]['Text'].iloc[0]

In [None]:
# Number of Documents for Each Topic (indexed by topic id)
topic_counts = df_topic_sents_keywords['Dominant_Topic'].value_counts()

# Percentage of Documents for Each Topic
topic_contribution = round(topic_counts/topic_counts.sum(), 4)

# Topic Number and Keywords: one row per topic, indexed by topic id so it
# lines up with the two Series above. (Concatenating the per-document frame
# with the per-topic counts aligned document row k with topic k's count.)
topic_num_keywords = (
    df_topic_sents_keywords[['Dominant_Topic', 'Topic_Keywords']]
    .drop_duplicates()
    .set_index('Dominant_Topic')
)

# Join on the topic id and name the new columns explicitly
df_dominant_topics = (
    topic_num_keywords
    .join(topic_counts.rename('Num_Documents'))
    .join(topic_contribution.rename('Perc_Documents'))
    .reset_index()
)

# Show
df_dominant_topics.sort_values(by=['Perc_Documents']).head()



### <font color="green">Research Question 2</font>

In [None]:
!pip install mlflow --quiet
!pip install pyngrok --quiet
!pip install lime --quiet

In [None]:
import os
import re
import numpy as np
import pandas as pd
import string

import nltk

from sklearn.model_selection import train_test_split, KFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
import sklearn.metrics as metrics
import sklearn.ensemble
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline

# Lime
from lime.lime_text import LimeTextExplainer
import mlflow
from pyngrok import ngrok


# Chart
import plotly.graph_objects as go

import warnings
warnings.filterwarnings('ignore', category=DeprecationWarning)


In [None]:
#make all text lowercase
def text_lowercase(text):
    """Return ``text`` converted to lowercase."""
    lowered = text.lower()
    return lowered

#remove numbers
def remove_numbers(text):
    """Return ``text`` with every run of digits deleted."""
    digit_runs = re.compile(r'\d+')
    return digit_runs.sub('', text)

#remove punctuation
def remove_punctuation(text):
    """Return ``text`` with all ``string.punctuation`` characters deleted."""
    return text.translate(str.maketrans('', '', string.punctuation))


#function for all pre-processing steps
def preprocessing(text):
    """Run the cleaning steps in order: lowercase, strip digits, strip punctuation."""
    for step in (text_lowercase, remove_numbers, remove_punctuation):
        text = step(text)
    return text

def process_text_list(text):
  """Preprocess every string in ``text``; non-strings become NaN.

  Args:
      text: iterable of raw values (typically the description column).

  Returns:
      A list of the same length: the ``preprocessing`` result for each string
      item and ``np.nan`` for anything that is not a ``str``.
  """
  pp_text = []
  for text_data in text:
    if isinstance(text_data, str):
      pp_text.append(preprocessing(text_data))
    else:
      # np.nan is the supported spelling; the `np.NaN` alias was removed in
      # NumPy 2.0.
      pp_text.append(np.nan)
  return pp_text

def get_stop_words(extend_list=(), remove_list=()):
    """Return the NLTK English stop words with custom additions and removals.

    Args:
        extend_list: extra words appended to the NLTK list.
        remove_list: words that must not appear in the result.

    Returns:
        A new list: NLTK's English stop words followed by ``extend_list``,
        with every occurrence of each ``remove_list`` word filtered out.
    """
    #prepare stop words
    stop_words = list(nltk.corpus.stopwords.words('english'))
    #add stop words
    stop_words.extend(extend_list)
    # Filter instead of list.remove(): remove() drops only the first match, so
    # a word present in both the base list and extend_list used to survive.
    unwanted = set(remove_list)
    return [w for w in stop_words if w not in unwanted]



In [None]:
# prompt: Change the column name Description to original_description in df1

# Assuming df1 is already defined as in your provided code.
# Rename the column 'Description' to 'original_description' in df1
df1 = df1.rename(columns={'Description': 'original_description'})

In [None]:
df1

In [None]:
# group by label
#df1.groupby(['dataset_type', 'human_label'])['line'].count()

In [None]:
# Split the listings into unlabeled rows and rows with a usable human label.
df_unlabeled = df1.loc[df1['human_label'].isin(['unknown'])]
print('Unlabeled:', len(df_unlabeled))
# .copy() makes the labeled subset an independent frame, so adding a column
# below does not write into a slice of the original (SettingWithCopyWarning).
df1 = df1.loc[df1['human_label'].isin(['distressed', 'not-distressed'])].copy()
# Create Label: 1 = distressed, 0 = not-distressed
df1['label'] = (df1['human_label'] == 'distressed').astype(int)
print('Labeled:', len(df1))
df1.drop_duplicates().groupby(['label'])['line'].count()



In [None]:
# Count how often each whitespace-separated word occurs in df1['label'].
# prompt: Give me the count of different words in column label

# NOTE(review): the labelling cell above stores 0/1 integers in 'label', so
# the isinstance(..., str) guard below skips every value and this cell reports
# zero words. If per-class counts are wanted, 'human_label' is presumably the
# column to read — confirm the intent.
word_counts = {}
for label in df1['label']:  # one value per row of the 'label' column
    if isinstance(label, str):
        words = label.lower().split()
        for word in words:
            if word not in word_counts:
                word_counts[word] = 0
            word_counts[word] += 1

print("Word counts in the 'label' column:")
for word, count in word_counts.items():
    print(word, count)

print("\nTotal unique words:", len(word_counts))

In [None]:
# Set list of text
text = df1.original_description.values.tolist()
text[:10]

In [None]:
#Preprocess text
pp_text = process_text_list(text)
df_preprocessed = df1.copy()
# Add pre-processed column to the dataset
df_preprocessed['pp_text'] = pp_text
df_preprocessed['pp_text'][:10]


In [None]:
df1

In [None]:
#Preprocess Unlabelled
text_u = df_unlabeled.original_description.values.tolist()
pp_text_u = process_text_list(text_u)
df_preprocess_u = df_unlabeled.copy()
# Add pre-processed column to the dataset
df_preprocess_u['pp_text'] = pp_text_u
df_preprocess_u['pp_text'][:10]

In [None]:
# Prepare Stop words
mdl_stop_words = get_stop_words(remove_list=['as', 'is','no','only'])


In [None]:
from sklearn.metrics import accuracy_score, log_loss
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_validate
from sklearn.utils import shuffle
from sklearn.feature_extraction.text import TfidfVectorizer
from xgboost import XGBClassifier
seed = 0
#get data set
df_shuffle = shuffle(df_preprocessed, random_state=seed)
X = df_shuffle['pp_text']
y = df_shuffle['label']

#Set Results
cv_score_dict = {}

#create pipeline
tf_transformer = Pipeline([
    ('tfidf', TfidfVectorizer(strip_accents= 'ascii', stop_words=mdl_stop_words, ngram_range=(1,2)))
])

classifiers_dict = {
    'LogisticRegression': LogisticRegression(verbose=1, random_state=seed, penalty = 'l2', solver = 'newton-cg'),
    'KNeighbors': KNeighborsClassifier(2),
    'SVC': SVC(kernel="rbf", C=0.025, probability=True, random_state=seed),
    'NuSVC': NuSVC(probability=True, random_state=seed),
    'DecisionTree': DecisionTreeClassifier(random_state=seed),
    'RandomForest': RandomForestClassifier(random_state=seed),
    'AdaBoost': AdaBoostClassifier(random_state=seed),
    'XGBoost': GradientBoostingClassifier(random_state=seed),
    'MLP': MLPClassifier(random_state=seed)
}


%time
for c in classifiers_dict.keys():
  print('|--------------------|')
  print(c)
  print('|--------------------|')
  with mlflow.start_run(run_name= 'MLflow on Colab'):

    #enable autologging
    mlflow.sklearn.autolog()
    pipe = Pipeline(steps=[('preprocessor', tf_transformer),
                           ('classifier', classifiers_dict[c])])

    #cross validate
    cv_score = cross_validate(pipe, X, y, cv=3, scoring=('accuracy', 'f1', 'precision', 'recall'))
    cv_score_dict[c] = cv_score
    print(cv_score)
    print('|--------------------|')



In [None]:
import getpass

#Run tracking UI in the background
get_ipython().system_raw("mlflow ui --port 5000 &")

#Create remote tunnel using ngrok.com to allow local port access
#Terminate open tunnels if exist
ngrok.kill()

# Never store the ngrok authtoken in the notebook: read it from the
# NGROK_AUTH_TOKEN environment variable, or prompt for it without echo.
# Get your authtoken from https://dashboard.ngrok.com/auth
# SECURITY: a real token was previously hardcoded in this cell; treat it as
# leaked and revoke/rotate it in the ngrok dashboard.
NGROK_AUTH_TOKEN = os.environ.get("NGROK_AUTH_TOKEN") or getpass.getpass("ngrok authtoken: ")
ngrok.set_auth_token(NGROK_AUTH_TOKEN)

# Open an HTTPs tunnel on port 5000 for tracking UI https://localhost:5000
ngrok_tunnel = ngrok.connect(addr="5000", proto="http", bind_tls=True)
print("MLflow Tracking UI:", ngrok_tunnel.public_url)




In [None]:
# Collapse each classifier's per-fold scores to a single value per metric
# (the median across folds).
avg_cv_score_dict = {
    clfr: {
        metric_type: np.median(fold_scores)
        for metric_type, fold_scores in fold_metrics.items()
    }
    for clfr, fold_metrics in cv_score_dict.items()
}

# One row per model, sorted by F1 ascending so the best model is the last row
df_mdl_metrics = (
    pd.DataFrame(avg_cv_score_dict)
    .T
    .sort_values(by='test_f1', ascending=True)
    .reset_index()
    .rename(columns={'index': 'model'})
)
df_mdl_metrics




In [None]:
#Plot model performance
# Rows are sorted by F1 ascending, so the last bar is the best model: highlight it.
colors = ['lightslategray'] * len(df_mdl_metrics)
colors[-1] = 'crimson'

fig = go.Figure(data=[go.Bar(
            y=df_mdl_metrics['model'],
            x=df_mdl_metrics['test_f1'],
            orientation='h',
            # Label every bar with its own score (the literal ['test_f1']
            # only put the text "test_f1" on the first bar).
            text=df_mdl_metrics['test_f1'].round(3),
            marker_color=colors #Marker color can be single color value or an iterable
        )])
fig.update_layout(title_text='Model Performance: F1 Score')

In [None]:
#vectoriser
tf = TfidfVectorizer(strip_accents= 'ascii', stop_words=mdl_stop_words, ngram_range=(1,2))
#split
X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=True, random_state=0)
#transform and fit the training set with vectoriser
X_train_tf = tf.fit_transform(X_train)
# Transform the test set with Vectoriser
X_test_tf = tf.transform(X_test)


In [None]:
# Train Model
mlp = MLPClassifier(random_state=seed)
mlp.fit(X_train_tf, y_train)
#make prediction
y_pred = mlp.predict(X_test_tf)
y_proba = mlp.predict_proba(X_test_tf)
#Classification Report
target_names = ['not-rehab', 'rehab']
print(classification_report(y_test, y_pred, target_names=target_names))


In [None]:
cv_score_dict


In [None]:
def classification_matrix_labels(x):
  """Name the confusion-matrix cell for one prediction row.

  ``x`` must provide ``'model_label'`` (the prediction) and ``'label'`` (the
  ground truth). Returns ``'tp'``/``'fp'`` for a prediction of 1,
  ``'tn'``/``'fn'`` for a prediction of 0, and ``None`` for any other
  predicted value.
  """
  predicted = x['model_label']
  if predicted == 1:
    outcome = 'p'
  elif predicted == 0:
    outcome = 'n'
  else:
    return None
  # 't' when the prediction agrees with the ground truth, 'f' otherwise
  correctness = 't' if x['label'] == predicted else 'f'
  return correctness + outcome
# One row per test document: text, true label, predicted label and the
# predict_proba vector, plus its confusion-matrix cell.
df_results = pd.DataFrame(zip(X_test, y_test, y_pred, y_proba), columns=['text', 'label', 'model_label', 'prob'])
df_results['classification'] = df_results.apply(classification_matrix_labels, axis=1)
print(df_results.groupby(['classification'])['text'].count())

# False positives, most confident first (last entry of prob = class 1).
# .copy() so the new column is added to an independent frame rather than to a
# slice of df_results; Series.apply also handles an empty selection.
df_fp = df_results.loc[df_results['classification'].isin(['fp'])].copy()
df_fp['class_proba'] = df_fp['prob'].apply(lambda p: p[-1])
df_fp = df_fp.sort_values(by=['class_proba'], ascending=False)

# False negatives, most confident first (first entry of prob = class 0).
df_fn = df_results.loc[df_results['classification'].isin(['fn'])].copy()
df_fn['class_proba'] = df_fn['prob'].apply(lambda p: p[0])
df_fn = df_fn.sort_values(by=['class_proba'], ascending=False)
df_fp.head()


LIME

In [None]:
# Converting the vectoriser and model into a pipeline
# This is necessary as LIME takes a model PIPELINE as an input
c = make_pipeline(tf, mlp)

# Saving a list of strings version of the unlabeled, pre-processed texts
unlabeled_text_pp = df_preprocess_u.pp_text.tolist()
ls_X_test = list(unlabeled_text_pp)

# Class names as a list ordered by class index (0 = not-distressed,
# 1 = distressed). LIME iterates over class_names, and iterating a dict would
# yield its keys (0, 1) rather than the readable names.
class_names = ['not-distressed', 'distressed']

#Create the Lime Explainer
LIME_explainer = LimeTextExplainer(class_names=class_names)


In [None]:
def lime_explain(LIME_explainer, c, pred):
  """Explain one prediction with LIME and render it in the notebook.

  Args:
      LIME_explainer: explainer exposing ``explain_instance(text, predict_fn)``.
      c: fitted pipeline exposing ``predict_proba`` on a list of raw texts.
      pred: the text to explain.

  Returns:
      Whatever the explanation's ``show_in_notebook(text=True)`` returns.
  """
  explanation = LIME_explainer.explain_instance(pred, c.predict_proba)

  # Report the text and the probability of class index 1
  print('Property Description:', pred)
  class_probabilities = c.predict_proba([pred]).round(3)
  print('Probability rehab =', class_probabilities[0][1])

  return explanation.show_in_notebook(text=True)

Rehab (False Positive)

In [None]:
df_fp.head(2).tail(1) #Select one example

In [None]:
#Model is correct, this record is mislabeled
# NOTE(review): this explains df_fp row position 2 (the third false positive),
# while the preceding cell previews df_fp.head(2).tail(1), i.e. position 1 —
# confirm which record the remark above refers to.
lime_explain(LIME_explainer, c, df_fp['text'].iloc[2])