In [11]:
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Load the merged journal dataset from a hard-coded absolute path on the
# author's machine; the "Introduction" column of this frame is cleaned below.
data = pd.read_csv("/Users/Magnus/Documents/GitHub/Merge/Journal_merged_v1_11-8.csv")


# English stop words held in a set so the per-token membership test in
# clean_text_with_name_removal_v2 is O(1).
# NOTE(review): presumably requires the NLTK "stopwords" corpus to have been
# downloaded beforehand (nltk.download('stopwords')) -- confirm on a fresh setup.
stop_words = set(stopwords.words('english'))


def basic_tokenizer(text):
    """Split *text* into word tokens with a regular expression.

    A token is a maximal run of ``\\w`` characters (letters, digits and
    underscore) delimited by word boundaries; everything else is skipped.

    Args:
        text: The string to tokenize.

    Returns:
        A list of the tokens in the order they appear in *text*.
    """
    word_pattern = re.compile(r'\b\w+\b')
    return [match.group(0) for match in word_pattern.finditer(text)]


def remove_names_v2(text):
    """Drop capitalized words that are not the first word of a sentence.

    This is a crude proper-name filter: within each sentence the first word
    is always kept, and every later word whose first character is uppercase
    is removed.

    Sentences are split at a whitespace character that follows ``.``, ``?``
    or ``!``. Two negative lookbehinds suppress the split after
    abbreviation-like patterns such as ``e.g.`` (word char, dot, word char,
    any char) and ``Dr.`` (uppercase letter, lowercase letter, dot).

    Args:
        text: The string to filter.

    Returns:
        The filtered text. Words are re-joined with single spaces, so any
        run of whitespace in the input collapses to one space and empty
        sentences are dropped.
    """
    # '!' is included as a terminator; without it the capitalized first word
    # of the sentence following an exclamation would be removed as a "name".
    sentences = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=[.?!])\s', text)

    cleaned_sentences = []
    for sentence in sentences:
        words = sentence.split()
        if not words:
            # Nothing but whitespace between two terminators.
            continue
        # Keep the sentence-initial word unconditionally; filter the rest.
        kept = [words[0]]
        kept.extend(word for word in words[1:] if not word[0].isupper())
        cleaned_sentences.append(' '.join(kept))

    return ' '.join(cleaned_sentences)

def clean_text_with_name_removal_v2(text):
    """Normalize one introduction text into a space-separated token string.

    Steps, in order:
      1. remove capitalized mid-sentence words via ``remove_names_v2``;
      2. strip a leading ``"1. Introduction"`` heading;
      3. lowercase;
      4. delete digit runs;
      5. delete every character that is neither ``\\w`` nor whitespace;
      6. tokenize with ``basic_tokenizer``;
      7. drop tokens found in the module-level ``stop_words`` set.

    Args:
        text: The raw text. Non-string values (e.g. ``None`` or the float NaN
            that pandas uses for a missing cell) are treated as empty text.

    Returns:
        The cleaned text as a single string of tokens joined by spaces;
        ``''`` when *text* is not a string.
    """
    # Missing CSV cells reach this function as float NaN when it is applied
    # to a DataFrame column; the regex calls below raise TypeError on them.
    if not isinstance(text, str):
        return ''

    # Remove names
    text = remove_names_v2(text)

    # Remove the prefix "1. Introduction"
    text = re.sub(r'^1\. Introduction', '', text)

    # Convert to lowercase
    text = text.lower()

    # Remove numbers
    text = re.sub(r'\d+', '', text)

    # Remove punctuation (the character is deleted, not replaced by a space,
    # so hyphenated words are fused: "all-or-nothing" -> "allornothing")
    text = re.sub(r'[^\w\s]', '', text)

    # Tokenize the text
    tokens = basic_tokenizer(text)

    # Remove stopwords and return the cleaned text
    return ' '.join(word for word in tokens if word not in stop_words)

# Apply the improved cleaning function element-wise to the "Introduction"
# column and store the result in a new "Cleaned_Introduction" column.
data["Cleaned_Introduction"] = data["Introduction"].apply(clean_text_with_name_removal_v2)

# Display the first few rows of the new cleaned column
# NOTE(review): the value of this bare expression is discarded because it is
# not the last expression of the cell; wrap it in print()/display() if the
# preview is actually wanted.
data["Cleaned_Introduction"].head()

# Write the frame, including the new column, to a hard-coded absolute path;
# index=False keeps the row index out of the CSV.
data.to_csv("/Users/Magnus/Documents/GitHub/Merge/Clean_Journal_merged_v1_11-8.csv", index=False)


In [10]:
# Print the DataFrame for a quick visual check; pandas abbreviates the output
# (rows elided with "...", wide frames wrapped across column groups).
print(data)

                                            Introduction Article_Gender  \
0      1. IntroductionThe all-or-nothing mechanism is...           Male   
1      1. IntroductionA bilateral exchange rate is th...           Male   
2      1. IntroductionPerformance-contingent bonuses ...           Male   
3      1. IntroductionA series of theoretical models ...           Male   
4      1. IntroductionMeasuring competitiveness is an...           Male   
...                                                  ...            ...   
14297  A trip chain is a series of trips taken in whi...           Male   
14298  Reliability of public transport (PT) operation...           Male   
14299  Since the advent of the four-step transport pl...           Male   
14300  A recent societal trend that made its way into...           Male   
14301  Urban rail transit systems such as metro is ha...           Male   

       gen_dummy  Journal Year       Journal Name  Citations  \
0              0  October 2023  Eco