<a href="https://colab.research.google.com/github/mdb2000/UNFCCC-Human-mobility/blob/Python-codes/Dataset_sentence.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os
import spacy
import pandas as pd
from tqdm import tqdm
import re
import nlp
# Run configuration: the sub-folder to process and the abbreviation that is
# embedded in the generated document and sentence identifiers.
folder = 'COP decisions'
abbr = 'COP'

In [None]:
# Increase the max_length limit
# NOTE(review): at this point `nlp` is the module brought in by `import nlp`
# in the first cell; the spaCy pipeline is only bound to the name `nlp` by the
# `spacy.load(...)` call in a later cell, so this assignment sets an attribute
# on that module and does not reach the pipeline.
nlp.max_length = 1000000

In [None]:
# Load the spaCy English model
nlp = spacy.load("en_core_web_sm")
# Set the character limit on the loaded pipeline itself: anything assigned to
# the name `nlp` before this cell is discarded when the name is rebound above,
# so the limit has to be applied after loading to take effect.
nlp.max_length = 1000000

In [None]:
# Terms searched for (as substrings) in every sentence of the documents
keywords = {
    'migration',
    'displacement',
    'immigration',
    'relocation',
    'refugee',
    'migrant',
}

In [None]:
# Directory holding the extracted .txt files for the selected `folder`.
# NOTE(review): absolute, machine-specific Windows path; adjust before running elsewhere.
folder_path = f'C:\\Users\\3104470\\Desktop\\data\\text extraction\\{folder}'

In [None]:
# Function to process documents
def process_documents(folder_path, keywords):
    """Collect keyword-bearing sentences with one sentence of context on each side.

    Every ``.txt`` file directly inside ``folder_path`` is read as UTF-8, split
    into sentences with the module-level spaCy pipeline ``nlp`` and scanned for
    the keywords. Each hit is stored together with the sentence before and after
    it (when they exist).

    Texts longer than ``nlp.max_length`` are cut into fixed-size character
    chunks before parsing; a sentence that straddles a chunk boundary is
    therefore split, and context sentences never cross a chunk boundary.

    Relies on the module-level names ``nlp`` (spaCy pipeline) and ``abbr``
    (suffix used in the document identifier).

    Args:
        folder_path: Directory containing the ``.txt`` documents.
        keywords: Iterable of substrings to look for. Matching is
            case-insensitive, so a capitalised occurrence such as "Migration"
            at the start of a sentence is found as well.

    Returns:
        list[dict]: one record per matching sentence with the keys
        ``"Document_ID"`` (``"<n>_<abbr>"``, where ``n`` is the 1-based position
        of the file in the sorted listing) and ``"Sentence"`` (previous
        sentence, matching sentence and next sentence joined by spaces).
    """
    data = []

    # os.listdir() returns entries in arbitrary order; sorting keeps the numeric
    # part of Document_ID stable between runs and machines.
    files = sorted(f for f in os.listdir(folder_path) if f.endswith(".txt"))

    # Lower-case the keywords once so the substring test below ignores case.
    lowered_keywords = [keyword.lower() for keyword in keywords]

    # Iterate over files with a progress bar; numbering starts at 1 and counts
    # every file, including those that yield no matching sentence.
    for doc_id, filename in enumerate(tqdm(files, desc="Processing Documents"), start=1):
        file_path = os.path.join(folder_path, filename)
        with open(file_path, 'r', encoding='utf-8') as file:
            text = file.read()
        # The file is closed here; the slow NLP work below no longer holds it open.

        doc_id_str = f"{doc_id}_{abbr}"

        # Split texts that exceed the pipeline's character limit
        if len(text) > nlp.max_length:
            chunks = [text[i:i + nlp.max_length] for i in range(0, len(text), nlp.max_length)]
        else:
            chunks = [text]

        for chunk in chunks:
            doc = nlp(chunk)
            sentences = list(doc.sents)
            for i, sent in enumerate(sentences):
                sent_lower = sent.text.lower()
                if any(keyword in sent_lower for keyword in lowered_keywords):
                    # Include the previous and next sentence for context
                    prev_sent = sentences[i-1].text if i > 0 else ''
                    next_sent = sentences[i+1].text if i < len(sentences) - 1 else ''
                    combined_sent = f"{prev_sent} {sent.text.strip()} {next_sent}".strip()
                    data.append({"Document_ID": doc_id_str, "Sentence": combined_sent})

    return data

In [None]:
# Process the documents
# The result is a list of dicts, one per matching sentence, each with the keys
# "Document_ID" and "Sentence".
processed_data = process_documents(folder_path, keywords)

Processing Documents: 100%|████████████████████████████████████████████████████████████| 28/28 [01:46<00:00,  3.80s/it]


In [None]:
# Create a DataFrame
# The columns are named explicitly so the frame still has "Document_ID" and
# "Sentence" when no sentence matched: an empty list would otherwise produce a
# column-less frame and the later df['Sentence'] lookups would raise KeyError.
df = pd.DataFrame(processed_data, columns=["Document_ID", "Sentence"])

In [None]:
# Function to clean unnecessary spaces
def clean_sentence(sentence):
    """Strip link- and file-like tokens from ``sentence`` and tidy its whitespace.

    The removals run in a fixed order, each on the output of the previous one:
    web URLs, lower-case ``fccc/...`` document symbols, tokens containing
    ``%20``, ``.pdf`` file references and finally any remaining
    ``name.ext/path``-shaped token. Whitespace runs are then collapsed to a
    single space and the ends are trimmed.

    Args:
        sentence: Text to clean.

    Returns:
        str: The cleaned text.
    """
    noise_patterns = (
        r'http[s]?://\S+|www\.\S+',        # URLs
        r'\d*fccc/\S+/\d{4}/\d+(\.\d+)?',  # FCCC symbols with an optional ".1"-style suffix
        r'\S+%20\S+',                      # percent-encoded file names
        r'\S+_\S+\.pdf|\S+/\S+\.pdf',      # other .pdf references
        r'\S+\.\S+/\S+',                   # URL-like leftovers
    )
    cleaned = sentence
    for pattern in noise_patterns:
        cleaned = re.sub(pattern, '', cleaned)
    # Only whitespace is normalised here; punctuation is left as it is
    collapsed = re.sub(r'\s+', ' ', cleaned)
    return collapsed.strip()

In [None]:
# Remove specific unwanted characters
def remove_unwanted_characters(sentence):
    """Delete selected ASCII punctuation and every non-ASCII character.

    The characters are removed, not replaced by a space, so text on either
    side of a removed character is joined together.

    Args:
        sentence: Text to strip.

    Returns:
        str: ``sentence`` without the characters ``, . ! ? : ; _ - / \\ % & < > @ # + * "``
        and without any character outside the ASCII range.
    """
    # Only ASCII symbols are listed here. Garbled (mis-decoded) multi-byte
    # characters do not need to be spelled out in the pattern: they are all
    # non-ASCII and are removed by the second substitution, which keeps this
    # source line independent of the file's own encoding.
    sentence = re.sub(r'[,.!?:;_\-/\\%&<>@#+*"]', '', sentence)
    # Remove any remaining non-ASCII characters
    sentence = re.sub(r'[^\x00-\x7F]+', '', sentence)
    return sentence

In [None]:
# Function to keep only sentences with at least `min_length` characters (90 by default)
def filter_sentence(sentence, min_length=90):
    """Return True when ``sentence`` is long enough to keep.

    Args:
        sentence: Text whose length is checked.
        min_length: Minimum number of characters required; defaults to 90.

    Returns:
        bool: ``True`` if ``len(sentence) >= min_length``, otherwise ``False``.
    """
    return len(sentence) >= min_length

In [None]:
# Give every row a running identifier of the form "<abbr>_sent_<n>", with n starting at 1
df['Sentence_ID'] = [f"{abbr}_sent_{row_number}" for row_number in range(1, len(df) + 1)]

In [None]:
# Clean each sentence first, then strip the unwanted characters from the cleaned text
df['Sentence'] = (
    df['Sentence']
    .apply(clean_sentence)
    .apply(remove_unwanted_characters)
)


In [None]:
# Keep only the sentences that pass filter_sentence (its minimum-length check).
# .copy() makes the result an independent frame, so columns added to `df`
# afterwards do not trigger pandas' SettingWithCopyWarning.
df = df[df['Sentence'].apply(filter_sentence)].copy()

In [None]:
# Save the cleaned DataFrame to a CSV file
# (written without the index; the file name ends with the current `folder` value)
# NOTE(review): absolute, machine-specific output path.
df.to_csv(f'C:/Users/3104470/Desktop/output/sentences datasets/df_clean_sentences_long_{folder}.csv', index=False)

In [None]:
# Now filter for English sentences and remove duplicates

In [None]:
import pandas as pd
from langdetect import detect, DetectorFactory
from nltk.tokenize import sent_tokenize


# Ensure consistent results from langdetect
# (a fixed seed is set on the factory before any sentence is classified below)
DetectorFactory.seed = 0



# Define a function to detect if a sentence is in English
def is_english(Sentence):
    """Return True when langdetect classifies ``Sentence`` as English.

    Args:
        Sentence: Text to classify.

    Returns:
        bool: ``True`` if ``detect`` returns ``'en'``; ``False`` for any other
        language or when detection raises an error.
    """
    try:
        return detect(Sentence) == 'en'
    except Exception:
        # Best effort: a sentence that cannot be classified is treated as
        # "not English". Catching Exception instead of using a bare except
        # lets KeyboardInterrupt/SystemExit propagate, so a long run over the
        # whole column can still be interrupted.
        return False

# Flag every row by detected language, then keep only the English ones
df['is_english'] = df['Sentence'].apply(is_english)
english_sentences = df[df['is_english']]

# De-duplicate on the sentence text and discard the helper flag column
unique_english_sentences = (
    english_sentences
    .drop_duplicates(subset=['Sentence'])
    .drop(columns=['is_english'])
)

# Write the filtered dataset to disk (without the index)
unique_english_sentences.to_csv(f'C:\\Users\\3104470\\Desktop\\output\\sentences datasets\\long version filtered eng&duplicates\\{folder}_filtered_dataset.csv', index=False)




In [None]:
# Put everything inside one single dataset
# Define the folder path
# NOTE(review): this rebinds `folder_path`, which earlier cells use for the
# input .txt directory; re-run the earlier path cell before re-processing documents.
folder_path = 'C:\\Users\\3104470\\Desktop\\output\\sentences datasets\\long version filtered eng&duplicates'

# The combined file is written into the same folder it is built from, so it
# must be excluded from the inputs; otherwise every re-run would read the
# previous combined file back in and duplicate all of its rows.
output_file_name = 'combined_final_datasets.csv'
output_file_path = os.path.join(folder_path, output_file_name)

# Read every per-folder CSV. The listing is sorted so the row order of the
# combined file is reproducible. Each file's first line is its header, which is
# what pd.read_csv assumes by default, so all files are read the same way.
dataframes = []
for file_name in sorted(os.listdir(folder_path)):
    if file_name.endswith('.csv') and file_name != output_file_name:  # Ensure we only read input CSV files
        file_path = os.path.join(folder_path, file_name)
        dataframes.append(pd.read_csv(file_path))

# Concatenate all dataframes into one
combined_df = pd.concat(dataframes, ignore_index=True)

# Save the combined dataframe to a new CSV file
combined_df.to_csv(output_file_path, index=False)

print(f"Combined CSV file has been saved to {output_file_path}")


Combined CSV file has been saved to C:\Users\3104470\Desktop\output\sentences datasets\long version filtered eng&duplicates\combined_final_datasets.csv
