In [2]:
import pandas as pd

def load_and_check_data(file_path: str, text_column: str):
    """Load a CSV and print a diagnostic report for a text column and 'year'.

    The file at ``file_path`` is read with pandas. The function then checks
    that both ``text_column`` and ``'year'`` exist, prints their dtypes, drops
    rows where either value is missing, lower-cases the text, casts the years
    to ``int``, sorts by year, and prints the list lengths together with the
    first five documents and timestamps.

    Errors raised while loading, converting or processing are caught, reported
    with ``print`` and end the check early. The function only prints; it
    always returns ``None``.

    Args:
        file_path: Path of the CSV file to inspect.
        text_column: Name of the column holding the documents.
    """
    year_column = 'year'

    # Step 1: read the file; any failure is reported and ends the check.
    try:
        frame = pd.read_csv(file_path)
        print("CSV file loaded successfully.")
    except Exception as err:
        print(f"Error loading CSV file: {err}")
        return None

    # Step 2: both columns must be present before anything else is attempted.
    has_required_columns = text_column in frame.columns and year_column in frame.columns
    if not has_required_columns:
        print(f"Error: Required columns '{text_column}' or 'year' not found in the file.")
        return None

    # Step 3: report the dtypes exactly as they were parsed from the file.
    parsed_text_dtype = frame[text_column].dtype
    parsed_year_dtype = frame[year_column].dtype
    print(f"Data types - {text_column}: {parsed_text_dtype}, year: {parsed_year_dtype}")

    # Step 4: keep only rows where both the text and the year are available.
    frame = frame.dropna(subset=[text_column, year_column])
    print(f"Dataframe shape after dropping NaNs: {frame.shape}")

    # Step 5: normalise the text to lower-case strings and the years to ints.
    try:
        lowered_text = frame[text_column].astype(str).str.lower()
        frame[text_column] = lowered_text
        frame[year_column] = frame[year_column].astype(int)
        print("Columns converted to correct types.")
    except Exception as err:
        print(f"Error converting column types: {err}")
        return None

    # Step 6: order chronologically and preview the resulting parallel lists.
    try:
        ordered = frame.explode(text_column)
        ordered = ordered.sort_values(year_column)
        docs = ordered[text_column].tolist()
        timestamps = ordered[year_column].tolist()
        print(f"Docs length: {len(docs)}, Timestamps length: {len(timestamps)}")
        print(f"First few docs: {docs[:5]}")
        print(f"First few timestamps: {timestamps[:5]}")
    except Exception as err:
        print(f"Error processing dataframe: {err}")
        return None

    return None

# Path to the files
data_flower_path = 'full_NL.csv'
data_t_flower_path = 'translated_sentences_10k_NL.csv'

# Run the same check on both files; each entry is (heading, CSV path, text column).
checks = (
    ("Checking data_flower:", data_flower_path, 'pre_text'),
    ("\nChecking data_t_flower:", data_t_flower_path, 'modern_sentences'),
)
for heading, csv_path, column in checks:
    print(heading)
    load_and_check_data(csv_path, column)


Checking data_flower:
CSV file loaded successfully.
Data types - pre_text: object, year: int64
Dataframe shape after dropping NaNs: (46236, 2)
Columns converted to correct types.
Docs length: 46236, Timestamps length: 46236
First few docs: ['yet it is almost impossible to finde one like another and though it happen somtimes that one resem ble another yet there neuer wanteth some difference of di uersity besides this behold the difference of trees plants hearbs and flowers which in each countrey groweth with such diuersity of colour tast smell property and vertue and if these things because we see them daily with our eyes and handle them with our hands as thinges common doe not a maze vs why should wee then so much wonder in seeing some things which passe this common agreement and order of nature which for all that doe not exceede nature neither are vnnaturall though the conceite thereof passe the gros nes of our reach and vnderstanding ', ' the nimphes meeting their may queene entertai

In [15]:
import pandas as pd

# Load the CSV files
# NOTE(review): the variable names and printed labels say "NL" while the
# translated file name ends in "_EN" - confirm which language these files hold.
full_nl_path = 'full_flowers.csv'
translated_nl_path = 'translated_sentences_10k_EN.csv'

full_nl_df = pd.read_csv(full_nl_path)
translated_nl_df = pd.read_csv(translated_nl_path)

# Display the data types of the columns and the first few rows for both dataframes.
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly; wrapping it in print() added a stray "None" line to the output.
print("Full NL DataFrame Info:")
full_nl_df.info()
print(full_nl_df.head())

print("\nTranslated NL DataFrame Info:")
translated_nl_df.info()
print(translated_nl_df.head())

# Check for missing values (count of NaN cells per column)
print("\nMissing values in Full NL DataFrame:")
print(full_nl_df.isna().sum())

print("\nMissing values in Translated NL DataFrame:")
print(translated_nl_df.isna().sum())

Full NL DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 46236 entries, 0 to 46235
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   year      46236 non-null  int64 
 1   pre_text  46236 non-null  object
dtypes: int64(1), object(1)
memory usage: 722.6+ KB
None
   year                                           pre_text
0  1927  â the head of flowers sent us contained twenty...
1  1927  in about a month after it has been at sea when...
2  1927  in a perfectly healthy date of the digedive or...
3  1927   strasburg turpentine this resin is geneâ rall...
4  1855  on the western side of the straits â vegetatio...

Translated NL DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 15 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   Unnamed: 0.1        10000 non-null  int64 
 1   Unnamed: 0       

In [16]:
# Read the translated-sentences CSV into its own frame and preview the first five rows.
tdf = pd.read_csv(filepath_or_buffer=translated_nl_path)
tdf.head(n=5)

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,Lang,year,Smell_Source_lemma,Quality_lemma,Sentence_lemma,Corpus,Smell_Source,Quality,Sentence,source_list,pre_text,bloem,modern_sentences
0,517426,517426,en,1784,flower |,pleafant,` ` juice faid purge apply externally belly oi...,medical-heritage,Flower|they,very pleafant,The juice is faid to purge when applied extern...,"['flower', '']",the juice is faid to purge when applied extern...,True,The juice is said to cleanse when applied exte...
1,26911,26911,en,1949,fruit | flower,fragrant,"neatness , freshness , entire abâ ¬ sence ever...",medical-heritage,Fruit|Flower,fragrant,"The neatness , freshness , and entire abÂ ¬ se...","['fruit', 'flower']",the neatness freshness and entire abâ sence of...,True,"The cleanliness, freshness, and complete absen..."
2,451441,451441,en,1893,flower | book | slipper | holder | needle | st...,,"ilkley depository establish january , 1835 , p...",medical-heritage,Flower|Book|slippers|holders|needle|stands|baby,,The Ilkley Depository was established in Janua...,"['flower', 'book', 'slipper', 'holder', 'needl...",the ilkley depository was established in janua...,True,The Ilkley Depository was established in Janua...
3,1070334,109368,en,1888,flower | male - fern,scented | sweet,"beech , horse - chestnut , scotch fir , alder ...",british-library,Flower|male - ferns,scented|sweet,"Beeches , horse - chestnuts , Scotch firs , al...","['flower', 'male - fern']",beeches horse chestnuts scotch firs alders syc...,True,"Beeches, horse chestnuts, Scotch firs, alders,..."
4,964196,3230,en,1895,| | wicker | wicker interstice | interstice | ...,mingle | vitriolic | dreadful | noxious | perf...,basket contain square package carefully cord h...,british-library,through|through the|the wicker|wicker intersti...,mingled|vitriolic|dreadful|noxious|perfumed|fr...,Each basket contained a square package of care...,"['', '', 'wicker', 'wicker interstice', 'inter...",each basket contained a square package of care...,True,Each basket contained a square package careful...


In [17]:
import re  # imported here so the definition does not depend on a later cell


def preprocess_text(sentence: object) -> str:
    """Normalise a sentence to lower-case words separated by single spaces.

    Digits are removed, every non-word character (punctuation, symbols) is
    replaced by a space, whitespace runs are collapsed, a leading lower-case
    "b" followed by whitespace is dropped, and the result is lower-cased and
    stripped of surrounding whitespace. No tokenisation, stop-word removal,
    stemming or lemmatisation is performed.

    Args:
        sentence: The raw sentence. Anything that is not a ``str`` (for
            example NaN from a missing CSV field) is treated as empty.

    Returns:
        The cleaned sentence, or "" when the input is not a string or holds
        nothing but digits, punctuation and whitespace.
    """
    if not isinstance(sentence, str):
        return ""
    # Removes numbers
    sentence = re.sub(r'\d+', '', sentence)
    # Replaces every non-word character (punctuation, symbols) with a space
    sentence = re.sub(r'\W', ' ', sentence)
    # NOTE: an earlier step used r'\^[a-zA-Z]\s+' to "remove single characters
    # from the start". The escaped caret matches a literal '^', which cannot
    # exist after the substitution above, so the step never matched. It is
    # dropped here; leading one-letter words such as "A" or "I" are kept, as before.
    # Substitutes multiple spaces with single space
    sentence = re.sub(r'\s+', ' ', sentence)
    # Removes prefixed 'b' (only a lower-case b at position 0 followed by whitespace)
    sentence = re.sub(r'^b\s+', '', sentence)
    # Converts to lower case and trims the spaces left behind by removed
    # punctuation, so punctuation-only input becomes "" rather than " " and
    # can be caught by an equality check against the empty string.
    return sentence.lower().strip()

In [18]:
import re
import pandas as pd
# Clean every modernised sentence element-wise and store the result in a new 'pre_mod' column.
tdf['pre_mod'] = tdf['modern_sentences'].map(preprocess_text)

In [19]:
# Preview the first five rows, now including the 'pre_mod' column.
tdf.head(n=5)

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,Lang,year,Smell_Source_lemma,Quality_lemma,Sentence_lemma,Corpus,Smell_Source,Quality,Sentence,source_list,pre_text,bloem,modern_sentences,pre_mod
0,517426,517426,en,1784,flower |,pleafant,` ` juice faid purge apply externally belly oi...,medical-heritage,Flower|they,very pleafant,The juice is faid to purge when applied extern...,"['flower', '']",the juice is faid to purge when applied extern...,True,The juice is said to cleanse when applied exte...,the juice is said to cleanse when applied exte...
1,26911,26911,en,1949,fruit | flower,fragrant,"neatness , freshness , entire abâ ¬ sence ever...",medical-heritage,Fruit|Flower,fragrant,"The neatness , freshness , and entire abÂ ¬ se...","['fruit', 'flower']",the neatness freshness and entire abâ sence of...,True,"The cleanliness, freshness, and complete absen...",the cleanliness freshness and complete absence...
2,451441,451441,en,1893,flower | book | slipper | holder | needle | st...,,"ilkley depository establish january , 1835 , p...",medical-heritage,Flower|Book|slippers|holders|needle|stands|baby,,The Ilkley Depository was established in Janua...,"['flower', 'book', 'slipper', 'holder', 'needl...",the ilkley depository was established in janua...,True,The Ilkley Depository was established in Janua...,the ilkley depository was established in janua...
3,1070334,109368,en,1888,flower | male - fern,scented | sweet,"beech , horse - chestnut , scotch fir , alder ...",british-library,Flower|male - ferns,scented|sweet,"Beeches , horse - chestnuts , Scotch firs , al...","['flower', 'male - fern']",beeches horse chestnuts scotch firs alders syc...,True,"Beeches, horse chestnuts, Scotch firs, alders,...",beeches horse chestnuts scotch firs alders syc...
4,964196,3230,en,1895,| | wicker | wicker interstice | interstice | ...,mingle | vitriolic | dreadful | noxious | perf...,basket contain square package carefully cord h...,british-library,through|through the|the wicker|wicker intersti...,mingled|vitriolic|dreadful|noxious|perfumed|fr...,Each basket contained a square package of care...,"['', '', 'wicker', 'wicker interstice', 'inter...",each basket contained a square package of care...,True,Each basket contained a square package careful...,each basket contained a square package careful...


In [20]:
# Filter instances where pre_mod is empty.
# Whitespace-only values count as empty too: preprocessing replaces punctuation
# with spaces, so a sentence without any words may come out as " " rather than "".
empty_pre_mod_instances = tdf[tdf['pre_mod'].str.strip() == ""]

# Display the results
print(empty_pre_mod_instances)

Empty DataFrame
Columns: [Unnamed: 0.1, Unnamed: 0, Lang, year, Smell_Source_lemma, Quality_lemma, Sentence_lemma, Corpus, Smell_Source, Quality, Sentence, source_list, pre_text, bloem, modern_sentences, pre_mod]
Index: []


In [21]:
# Remove instances where pre_mod is empty.
# The comparison is made after stripping whitespace so that values consisting
# only of spaces (what preprocessing may leave for a punctuation-only sentence)
# are dropped as well instead of being written to the output file.
filtered_tdf = tdf[tdf['pre_mod'].str.strip() != ""]
filtered_tdf.to_csv('10k_EN_mod.csv', index=False)