# knowledge-exploration: Text Preprocessing

In the experiment, participants were presented with 5 ill-defined questions and asked to write 5 different answers to each, in random order.

**Goal:** cluster the answers to each question, see how many clusters emerge, and examine the relations between the clusters.

In [1]:
import pandas as pd
import numpy as np
import os

In [143]:
# Load the reference dataset from disk.
df = pd.read_csv(
    filepath_or_buffer="~/knowledge-explore/data/processed_data/reference_dataset_v1.csv",
)

In [144]:
# Preview the first three rows to check the load and the available columns.
df.head(3)

Unnamed: 0.1,Unnamed: 0,problem_id,participant_id,DAT,CRT2,CURIOSITY_SCORE,AGE,GENDER,EDUCATION,EMPLOYMENT,...,POLITICAL,CLIMATE,COVID,NEWS,MEDIA,RELATION_KNOWLEDGE,RELATION_INTEREST,RELATION_CONCERN,ideas,emb_ideas
0,0,Q1,278,74.104969,3,4.229167,24,1,5,1,...,2,1,1,4,5,2,4,4,"['vaccination schemes in these areas', 'increa...","[[-0.011656875722110271, -0.019422005861997604..."
1,1,Q2,278,74.104969,3,4.229167,24,1,5,1,...,2,1,1,4,5,3,5,5,['equal pay for women and men in the same role...,"[[-0.0034376916009932756, 0.006209875456988811..."
2,2,Q3,278,74.104969,3,4.229167,24,1,5,1,...,2,1,1,4,5,3,5,5,"['look into conservation plans ', 'ensure poac...","[[0.03516001999378204, -0.004544570576399565, ..."


In [145]:
# Keep only the columns that are of use: the question, the participant and the raw ideas.
df = df[[
    'problem_id',
    'participant_id',
    'ideas',
]]

## Data cleaning

In [146]:
# Inspect the reduced frame (problem_id, participant_id, ideas).
df

Unnamed: 0,problem_id,participant_id,ideas
0,Q1,00278,"['vaccination schemes in these areas', 'increa..."
1,Q2,00278,['equal pay for women and men in the same role...
2,Q3,00278,"['look into conservation plans ', 'ensure poac..."
3,Q4,00278,"['make it more affordable ', 'make it more rel..."
4,Q5,00278,"['a better work-life balance ', 'more flexible..."
...,...,...,...
1484,Q1,ff2bf,"['education of benefits to vaccination', 'rewa..."
1485,Q2,ff2bf,"['gender identity workshops for staff', 'look ..."
1486,Q3,ff2bf,"['ban ivory imports/exports', 'try to breed in..."
1487,Q4,ff2bf,"['free travel for those most in need', 'less p..."


In [147]:
# Build one frame per question, each holding only the participant id and the raw ideas.
df_1, df_2, df_3, df_4, df_5 = (
    df.loc[df['problem_id'] == question][['participant_id', 'ideas']]
    for question in ('Q1', 'Q2', 'Q3', 'Q4', 'Q5')
)

In [148]:
# The 'ideas' values are stored as strings rather than lists, so each one is
# parsed back into a list (str -> list) and then unlisted so that every idea
# gets its own row.
import ast

df_1, df_2, df_3, df_4, df_5 = (
    frame.assign(ideas=frame['ideas'].map(ast.literal_eval)).explode('ideas')
    for frame in (df_1, df_2, df_3, df_4, df_5)
)

In [149]:
# Inspect the exploded Q1 frame: one idea per row, with the original row index repeated.
df_1

Unnamed: 0,participant_id,ideas
0,00278,vaccination schemes in these areas
0,00278,increase awareness and education
0,00278,mass vaccination programmes
0,00278,gp practices to discuss with their patients
0,00278,advertisement of vaccination needs and availab...
...,...,...
1479,fb9c4,visiting local schools and vaccinating childre...
1479,fb9c4,offering door to door vaccination programmes
1484,ff2bf,education of benefits to vaccination
1484,ff2bf,reward with an incentive eg cash or goods


In [150]:
# text-cleaning/processing

import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import pycld2 as cld2

# Fetch the NLTK resources needed for tokenizing, stopword removal and
# lemmatizing (already-present packages are reported as up-to-date).
# NOTE(review): newer NLTK releases may additionally need 'punkt_tab' for
# word_tokenize — confirm against the installed version.
for resource in ('punkt', 'stopwords', 'wordnet'):
    nltk.download(resource)

# Shared WordNet lemmatizer instance used by the preprocessing function
lemmatizer = WordNetLemmatizer()


[nltk_data] Downloading package punkt to /mnt/home/kim/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /mnt/home/kim/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /mnt/home/kim/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [151]:
# Function to preprocess text
_STOP_WORDS = None  # English stopword set, built on first use and then reused


def preprocess_text(text):
    """Normalize one free-text answer for downstream clustering.

    Steps: strip punctuation, lower-case, tokenize, drop English stopwords,
    lemmatize each token, and re-join the tokens with single spaces.

    Parameters
    ----------
    text : str
        The raw answer. Any non-string value (e.g. None, or the NaN that
        ``DataFrame.explode`` produces for an empty list) is treated as missing.

    Returns
    -------
    str or None
        The cleaned text, or None when ``text`` is not a string so that the
        caller can remove the row with ``dropna``.
    """
    global _STOP_WORDS

    # Missing / non-string values cannot be cleaned; signal them with None
    # instead of letting re.sub raise a TypeError.
    if not isinstance(text, str):
        return None

    # 1. Remove punctuation (everything that is not a word character or whitespace)
    text = re.sub(r'[^\w\s]', '', text)

    # 2. Convert text to lowercase
    text = text.lower()

    # 3. Language detection and filtering of non-English texts is currently
    #    disabled, so string input never yields None.
    #    isReliable, textBytesFound, details = cld2.detect(text)
    #    if details[0][0] != "ENGLISH":
    #        return None

    # 4. Tokenize text
    tokens = word_tokenize(text)

    # 5. Remove stopwords (the set is loop-invariant, so build it only once
    #    rather than re-reading the corpus for every answer)
    if _STOP_WORDS is None:
        _STOP_WORDS = frozenset(stopwords.words('english'))
    tokens = [word for word in tokens if word not in _STOP_WORDS]

    # 6. Lemmatize tokens
    tokens = [lemmatizer.lemmatize(word) for word in tokens]

    # Rejoin tokens into a single string
    return ' '.join(tokens)

In [152]:
# Clean every idea, discard rows whose cleaned text is missing (None/NaN),
# renumber the rows, and keep only the participant id plus the cleaned text
# under the name 'answer'.
df_1 = (
    df_1
    .assign(cleaned_ideas=df_1['ideas'].apply(preprocess_text))
    .dropna(subset=['cleaned_ideas'])
    .reset_index(drop=True)
    .drop(columns=['ideas'])
    .rename(columns={'cleaned_ideas': 'answer'})
)

In [153]:
# Clean every idea, discard rows whose cleaned text is missing (None/NaN),
# renumber the rows, and keep only the participant id plus the cleaned text
# under the name 'answer'.
df_2 = (
    df_2
    .assign(cleaned_ideas=df_2['ideas'].apply(preprocess_text))
    .dropna(subset=['cleaned_ideas'])
    .reset_index(drop=True)
    .drop(columns=['ideas'])
    .rename(columns={'cleaned_ideas': 'answer'})
)

In [154]:
# Clean every idea, discard rows whose cleaned text is missing (None/NaN),
# renumber the rows, and keep only the participant id plus the cleaned text
# under the name 'answer'.
df_3 = (
    df_3
    .assign(cleaned_ideas=df_3['ideas'].apply(preprocess_text))
    .dropna(subset=['cleaned_ideas'])
    .reset_index(drop=True)
    .drop(columns=['ideas'])
    .rename(columns={'cleaned_ideas': 'answer'})
)

In [155]:
# Clean every idea, discard rows whose cleaned text is missing (None/NaN),
# renumber the rows, and keep only the participant id plus the cleaned text
# under the name 'answer'.
df_4 = (
    df_4
    .assign(cleaned_ideas=df_4['ideas'].apply(preprocess_text))
    .dropna(subset=['cleaned_ideas'])
    .reset_index(drop=True)
    .drop(columns=['ideas'])
    .rename(columns={'cleaned_ideas': 'answer'})
)

In [156]:
# Clean every idea, discard rows whose cleaned text is missing (None/NaN),
# renumber the rows, and keep only the participant id plus the cleaned text
# under the name 'answer'.
df_5 = (
    df_5
    .assign(cleaned_ideas=df_5['ideas'].apply(preprocess_text))
    .dropna(subset=['cleaned_ideas'])
    .reset_index(drop=True)
    .drop(columns=['ideas'])
    .rename(columns={'cleaned_ideas': 'answer'})
)

In [157]:
# Export the cleaned Q1 answers as JSON Lines (one record per row).
# NOTE(review): the input CSV is read from ~/knowledge-explore/... while the
# results are written under ~/thesis/... — confirm this is intended.
df_1.to_json(
    "~/thesis/data/processed_data/knowledge_q1_lemmatized.jsonl",
    orient='records',
    lines=True,
)

In [158]:
# Export the cleaned Q2 answers as JSON Lines (one record per row).
df_2.to_json(
    "~/thesis/data/processed_data/knowledge_q2_lemmatized.jsonl",
    orient='records',
    lines=True,
)

In [159]:
# Export the cleaned Q3 answers as JSON Lines (one record per row).
df_3.to_json(
    "~/thesis/data/processed_data/knowledge_q3_lemmatized.jsonl",
    orient='records',
    lines=True,
)

In [160]:
# Export the cleaned Q4 answers as JSON Lines (one record per row).
df_4.to_json(
    "~/thesis/data/processed_data/knowledge_q4_lemmatized.jsonl",
    orient='records',
    lines=True,
)

In [161]:
# Export the cleaned Q5 answers as JSON Lines (one record per row).
df_5.to_json(
    "~/thesis/data/processed_data/knowledge_q5_lemmatized.jsonl",
    orient='records',
    lines=True,
)

### Summary of the preprocessing

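1. Check for non-English answers (the `pycld2` language filter in `preprocess_text` is currently commented out, so no answers are dropped by language).
2. Lower-casing.
3. Removal of English stopwords and of special characters (anything other than letters, digits, underscores and whitespace).
4. Stemming -> stem_data (not performed in this notebook).
5. Lemmatization -> lem_data (saved here as `knowledge_q1_lemmatized.jsonl` … `knowledge_q5_lemmatized.jsonl`).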
