## RQ3 Full Process

General plan:
- Separate the main corpus into two separate ones: one where the dominant consensus class is Non-Consensus (max_class=3) and one where the dominant consensus is Full Conensus Forest (max_class=6).
- Create tokens by performing the same text processing steps as in rq2_step2_text_analysis.ipynb (ie. cleaning, translating, tokenising, filtering)
- Assign each token their cluster from rq2_step2_text_analysis.ipynb
- Isolate the clusters I'm interested in 
- Compare those clusters (cluster membership metric?)

To do: think about how to handle the max class - should it be the max across all classes, the max across class 3, 4, 5, 6 or the max between 3 and 6

In [1]:
# SETUP

# Import packages
import pandas as pd
import pickle

import spacy
from deep_translator import GoogleTranslator
from collections import Counter


### Step 1: Create separate corpora

Separate the main corpus into two separate ones: one where the dominant consensus class is Non-Consensus (max_class=3) and one where the dominant consensus is Full Conensus Forest (max_class=6).

In [3]:
# STEP 1: SEPARATE CORPORA

# Load the master CSV from rq2_step1_data_collection
master = pd.read_csv("./processing/master.csv")

# Separate the master corpus into 2 corpora for class 3 and 6
corpus_3 = master[master["max_class"] == 3]
corpus_6 = master[master["max_class"] == 6]

### Step 2: Create Tokens

Same steps as in RQ2

for now I've just duplicated everything for class3 and class6, but ideally I will come back to this to make it all more efficient

In [4]:
# STEP 2.1: CLEAN

# Add ['None'] to any blank rows
# this is necessary for the next step, but then they will be removed later
corpus_3.fillna("['None']", inplace=True)
corpus_6.fillna("['None']", inplace=True)

# Extract the description and captitions and combine them into a single column
raw_text_c3 = pd.DataFrame()
raw_text_c3["desc_capt"] = corpus_3["description text"] + " " + corpus_3["photo_captions"]
raw_text_c6 = pd.DataFrame()
raw_text_c6["desc_capt"] = corpus_6["description text"] + " " + corpus_6["photo_captions"]

# Now remove all the ['None'] text from both columns
raw_text_c3["desc_capt"] = raw_text_c3["desc_capt"].str.replace(r"\['None'\]", "", regex=True)
raw_text_c6["desc_capt"] = raw_text_c6["desc_capt"].str.replace(r"\['None'\]", "", regex=True)

# Remove certain special characters
raw_text_c3["desc_capt"] = raw_text_c3["desc_capt"].str.replace(r"\[", "", regex=True)
raw_text_c3["desc_capt"] = raw_text_c3["desc_capt"].str.replace(r"\]", "", regex=True)
raw_text_c3["desc_capt"] = raw_text_c3["desc_capt"].str.replace(r"\'", "", regex=True)
raw_text_c3["desc_capt"] = raw_text_c3["desc_capt"].str.replace(r"\|", "", regex=True)
raw_text_c3["desc_capt"] = raw_text_c3["desc_capt"].str.replace(r"\\", "", regex=True)
raw_text_c3["desc_capt"] = raw_text_c3["desc_capt"].str.replace(r"\/", "", regex=True)
raw_text_c3["desc_capt"] = raw_text_c3["desc_capt"].str.replace(r"\+", "", regex=True)
raw_text_c3["desc_capt"] = raw_text_c3["desc_capt"].str.replace(r"=", "", regex=True)

raw_text_c6["desc_capt"] = raw_text_c6["desc_capt"].str.replace(r"\[", "", regex=True)
raw_text_c6["desc_capt"] = raw_text_c6["desc_capt"].str.replace(r"\]", "", regex=True)
raw_text_c6["desc_capt"] = raw_text_c6["desc_capt"].str.replace(r"\'", "", regex=True)
raw_text_c6["desc_capt"] = raw_text_c6["desc_capt"].str.replace(r"\|", "", regex=True)
raw_text_c6["desc_capt"] = raw_text_c6["desc_capt"].str.replace(r"\\", "", regex=True)
raw_text_c6["desc_capt"] = raw_text_c6["desc_capt"].str.replace(r"\/", "", regex=True)
raw_text_c6["desc_capt"] = raw_text_c6["desc_capt"].str.replace(r"\+", "", regex=True)
raw_text_c6["desc_capt"] = raw_text_c6["desc_capt"].str.replace(r"=", "", regex=True)

# This is to address a specific issue in one of the entries
raw_text_c3["desc_capt"] = raw_text_c3["desc_capt"].str.replace(r"\n", " ", regex=True)
raw_text_c6["desc_capt"] = raw_text_c6["desc_capt"].str.replace(r"\n", " ", regex=True)

# Create a list from the column
raw_text_c3_list = raw_text_c3["desc_capt"].astype(str).values.tolist()
raw_text_c6_list = raw_text_c6["desc_capt"].astype(str).values.tolist()

# Convert entries which are just a space (" ") to be empty ("")
raw_text_c3_list = [x.strip(' ') for x in raw_text_c3_list]
raw_text_c6_list = [x.strip(' ') for x in raw_text_c6_list]

# Remove all empty entries
raw_text_c3_list = list(filter(None, raw_text_c3_list))
raw_text_c6_list = list(filter(None, raw_text_c6_list))


# Check
#raw_text_c6_list

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  corpus_3.fillna("['None']", inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  corpus_6.fillna("['None']", inplace=True)


In [5]:
# STEP 2.2: FILTER OUT SHORT TEXTS

# Load the spacy model
nlp = spacy.load("de_core_news_sm")

# Create an empty list to store the unique token counts for each trail
unique_token_counts_c3 = []
unique_token_counts_c6 = []

# Tokenise the text for each trail & count the number of unique tokens for each trail
for trail_text in raw_text_c3_list:
    doc = nlp(trail_text)
    tokens = [token.text.lower() for token in doc if not token.is_punct and not token.is_space]
    unique_tokens = set(tokens)
    unique_token_counts_c3.append(len(unique_tokens))

for trail_text in raw_text_c6_list:
    doc = nlp(trail_text)
    tokens = [token.text.lower() for token in doc if not token.is_punct and not token.is_space]
    unique_tokens = set(tokens)
    unique_token_counts_c6.append(len(unique_tokens))

# Combine results into a df
raw_text_counts_c3 = pd.DataFrame()
raw_text_counts_c3["text"] = raw_text_c3_list
raw_text_counts_c3["unique_tokens"] = unique_token_counts_c3

raw_text_counts_c6 = pd.DataFrame()
raw_text_counts_c6["text"] = raw_text_c6_list
raw_text_counts_c6["unique_tokens"] = unique_token_counts_c6

# Filter df to only include rows where unique_tokens >= 3
raw_text_counts_c3 = raw_text_counts_c3.loc[(raw_text_counts_c3["unique_tokens"] >= 3)]
raw_text_counts_c6 = raw_text_counts_c6.loc[(raw_text_counts_c6["unique_tokens"] >= 3)]

# Save the text column as a list for use in the next steps
raw_text_3token_c3_list = raw_text_counts_c3["text"].astype(str).values.tolist()
raw_text_3token_c6_list = raw_text_counts_c6["text"].astype(str).values.tolist()

# Check
raw_text_3token_c3_list


['Breite Forstwege und Pfade. Auf halber Strecke kann man im Bergwerkstüble einkehren (Montag Ruhetag)',
 'Rinken - Feldberg-Ort Foto, Foto, Foto, Foto, Foto, Foto, Foto, Foto',
 'Schauinsland hike Foto',
 'Feldbergerhof - Feldberg-Ort Foto, Foto, Foto, Foto, Foto, Foto',
 'Ruta circular apta para todo tipo de personas. El pico es redondeado y ofrece una buenas vistas de la selva negra y la vertiente Norte de los Alpes suizos, austriacos y montañas de Liechtenstein.  Feldberg (1493 m)., Techo de la Selva Negra (Alemania)',
 'Bifurcación izquierda, Lago Feldsee, Lago. Banco, Cima, Monolito',
 'Titisee Camping Bankenhof',
 'Estany a Ruhestein',
 'Von St. Bartholomä zunächst zur schönen Eiskapelle am Fuss der Watzmann Ostwand, dann auf gleichem Weg zurück nach St. Bartholomä. Dann entlang des Uferns Richtung Norden, bald beginnt der Rinnkendlsteig durch den Wald und wird oben zunehmend ausgesetzter. Wanderer mit Höhenangst sollten diesen Steig besser nicht gehen. Man erreicht schließlich 

In [6]:
# STEP 2.3: DUPLICATE TEXT HANDLING

# Check for duplicates and preserve order
seen_c3 = set()
raw_text_3token_c3_list_unq = []
for item in raw_text_3token_c3_list:
    if item not in seen_c3:
        seen_c3.add(item)
        raw_text_3token_c3_list_unq.append(item)

len(raw_text_3token_c3_list_unq)

seen_c6 = set()
raw_text_3token_c6_list_unq = []
for item in raw_text_3token_c6_list:
    if item not in seen_c6:
        seen_c6.add(item)
        raw_text_3token_c6_list_unq.append(item)


len(raw_text_3token_c6_list_unq)

1325

In [9]:
# STEP 2.4: COUNT >5000

# Create starting point for count
count_5000_c3 = 0
count_5000_c6 = 0

# Count trails with more than 5000 characters
for trail_text in raw_text_3token_c3_list_unq:
    if len(trail_text) > 5000:
        count_5000_c3 += 1

for trail_text in raw_text_3token_c6_list_unq:
    if len(trail_text) > 5000:
        count_5000_c6 += 1

# Show the count
print(count_5000_c3, count_5000_c6)

0 1


In [10]:
# STEP 2.5: TRUNCATE >5000 (ONLY NEEDED FOR C6)

# Create a new list for the truncated trail text
raw_text_3token_c6_list_unq_trunc = []

# Truncate any text greater than 5000 characters
# Also remove the last word in case it's only a partial word
for trail_text in raw_text_3token_c6_list_unq:
    if len(trail_text) > 5000:
        trail_text_trunc = trail_text[:5000]
        trail_text_trunc = trail_text_trunc.rsplit(' ', 1)[0]
        raw_text_3token_c6_list_unq_trunc.append(trail_text_trunc)
    else:
        raw_text_3token_c6_list_unq_trunc.append(trail_text)

# Check to make sure removal was successful
# Create starting point for check count
check_5000_c6 = 0

# Count trails with more than 5000 characters in the new list
for trail_text in raw_text_3token_c6_list_unq_trunc:
    if len(trail_text) > 5000:
        check_5000_c6 += 1

# Show the check count
check_5000_c6

0

In [None]:
# STEP 2.6: TRANSLATE TO GERMAN (RUN ONCE)
# TAKES ABOUT 10 MIN

raw_text_de_c3 = []
translated_count_c3 = 0
skipped_count_c3 = 0

raw_text_de_c6 = []
translated_count_c6 = 0
skipped_count_c6 = 0

# Use deep translator to automatically detect language and translate to German
# If German is detected it will skip the entry (ie. it will not try to translate)
for trail_text in raw_text_3token_c3_list_unq:
    translated_c3 = GoogleTranslator(source='auto', target='de').translate(text=trail_text)

    # Check if translation changed anything
    if translated_c3.strip() == trail_text.strip():
        skipped_count_c3 += 1
    else:
        translated_count_c3 += 1

    raw_text_de_c3.append(translated_c3)

for trail_text in raw_text_3token_c6_list_unq_trunc:
    translated_c6 = GoogleTranslator(source='auto', target='de').translate(text=trail_text)

    # Check if translation changed anything
    if translated_c6.strip() == trail_text.strip():
        skipped_count_c6 += 1
    else:
        translated_count_c6 += 1

    raw_text_de_c6.append(translated_c6)

print(f"Translated for C3: {translated_count_c3}")
print(f"Skipped for C3 (already German or unchanged): {skipped_count_c3}")
print(f"Percent translated for C3: {round(((translated_count_c3/(translated_count_c3+skipped_count_c3))*100),2)}")

print(f"Translated for C6: {translated_count_c6}")
print(f"Skipped for C6 (already German or unchanged): {skipped_count_c6}")
print(f"Percent translated for C6: {round(((translated_count_c6/(translated_count_c6+skipped_count_c6))*100),2)}")


Translated for C3: 28
Skipped for C3 (already German or unchanged): 39
Percent translated for C3: 41.79
Translated for C6: 540
Skipped for C6 (already German or unchanged): 785
Percent translated for C6: 40.75


In [17]:
# STEP 2.7: SAVE TRANSLATED TEXT

pickle.dump(raw_text_de_c3, open("./processing/raw_text_de_c3.p", "wb"))
pickle.dump(raw_text_de_c6, open("./processing/raw_text_de_c6.p", "wb"))

In [2]:
# STEP 2.8: LOAD TRANSLATED TEXT

raw_text_de_c3 = pickle.load(open("./processing/raw_text_de_c3.p", "rb"))
raw_text_de_c6 = pickle.load(open("./processing/raw_text_de_c6.p", "rb"))

In [None]:
# STEP 2.9 TOKENISATION

# Load the spacy model
nlp_rq3 = spacy.load("de_core_news_sm")

# Create an empty list to store the tokens
doc_c3 = []
doc_c6 = []

# Tokenise the raw_text input
for string in raw_text_de_c3:
    doc_c3.extend(nlp_rq3(string))

for string in raw_text_de_c6:
    doc_c6.extend(nlp_rq3(string))

In [None]:
# STEP 2.10: FILTERING TOKENS (STOP WORDS ETC)

# Add words to stop list
nlp_rq3.vocab["Foto"].is_stop = True
nlp_rq3.vocab["foto"].is_stop = True
nlp_rq3.vocab["FOTO"].is_stop = True
nlp_rq3.vocab["Fotos"].is_stop = True
nlp_rq3.vocab["Photo"].is_stop = True
nlp_rq3.vocab["null"].is_stop = True
nlp_rq3.vocab["Waypoint"].is_stop = True

# Filter out tokens that are stop words (is_stop), puncutation (is_punct), 
# numbers (is_digit & like_num) OR spaces (is_space)
# option use token.lemma_ to extract the lemma for the final tokens
filtered_tokens_c3 = [token.text for token in doc_c3 if not token.is_stop | token.is_punct | 
                     token.is_digit | token.like_num | token.is_space]

filtered_tokens_c6 = [token.text for token in doc_c6 if not token.is_stop | token.is_punct | 
                     token.is_digit | token.like_num | token.is_space]

# Empty list for lower-case versions
filtered_tokens_lc_c3 = []
filtered_tokens_lc_c6 = []

# Convert to lower-case 
for token in filtered_tokens_c3:
    token_lc_c3 = token.lower()
    filtered_tokens_c3.append(token_lc_c3)

for token in filtered_tokens_c6:
    token_lc_c6 = token.lower()
    filtered_tokens_c6.append(token_lc_c6)

# Check
#print(filtered_tokens_c3, filtered_tokens_c6)

In [None]:
# STEP 2.11: CHECK SOME RESULTS :)

word_freq_c3 = Counter(filtered_tokens_c3)
common_words_c3 = word_freq_c3.most_common(20)

word_freq_c6 = Counter(filtered_tokens_c6)
common_words_c6 = word_freq_c6.most_common(20)


### Step 3: Pre-processing

Assign each token their cluster from rq2_step2_text_analysis.ipynb

In [None]:
# STEP 3: LOAD TOKEN/CLUSTER LOOKUP FROM RQ2

token_cluster_lookup = pickle.load(open("./processing/token_cluster_all.p", "rb"))