In [1]:
import re
import pandas as pd
from tqdm import tqdm

In [2]:
df = pd.read_csv('Dataset_en.tsv', sep='\t')

#filter appropriate years
df = df[(df['year'] >= 1600) & (df['year'] <= 1980)]

In [4]:
#make the smell source lemma column a list of strings
df['source_list'] = df['Smell_Source_lemma'].str.split('|')
print(1)
df = df[df['source_list'].apply(lambda x: isinstance(x, list))]
print(2)
df = df[df['source_list'].apply(lambda x: all(isinstance(word, str) for word in x))]
print(3)
df['source_list'] = df['source_list'].apply(lambda x: [word.strip() for word in x])
print(4)
df.head()

1
2
3
4


Unnamed: 0.1,Unnamed: 0,Lang,year,Smell_Source_lemma,Quality_lemma,Sentence_lemma,Corpus,Smell_Source,Quality,Sentence,source_list
0,0,en,1940,hydrochloric acid,,"stomach normally acid reaction , putrefaction ...",medical-heritage,Hydrochloric acid,,"The stomach normally has an acid reaction , bu...",[hydrochloric acid]
1,1,en,1863,pepsine | motion,,"357 \ solid matter copious , pale , fetid , co...",medical-heritage,Pepsine|the motions,,"357 T \ here the solid matter is copious , pal...","[pepsine, motion]"
2,2,en,1894,riolous | perfon | principle,,thefe rather prove vifibi - lity animal vapour...,medical-heritage,riolous|Perfons|Principles,,But thefe rather prove the Vifibi - lity of th...,"[riolous, perfon, principle]"
3,3,en,1959,lov,,- ' cause great deal disturbance supply give r...,medical-heritage,lov,,T - ' is caused a great deal of disturbance to...,[lov]
4,4,en,1893,hawthorn | manual,,"268 .312 manual bagteriolooy odour hawthorn , ...",medical-heritage,Hawthorn|MANUAL,,268 .312 A MANUAL OF BAGTERIOLOOY the odour of...,"[hawthorn, manual]"


In [5]:
def preprocess_text(sentence):
    # Remove numbers, abbreviations, and unknown/special characters
    # Removes numbers
    sentence = re.sub(r'\d+', '', sentence)
    # Removes single characters and special characters
    sentence = re.sub(r'\W', ' ', sentence)
    # Removes single characters from the start
    sentence = re.sub(r'\^[a-zA-Z]\s+', ' ', sentence) 
    # Substitutes multiple spaces with single space
    sentence = re.sub(r'\s+', ' ', sentence, flags=re.I)
    # Removes prefixed 'b'
    sentence = re.sub(r'^b\s+', '', sentence)
    # Converts to lower case
    sentence = sentence.lower()
    return sentence

In [6]:
df['pre_text'] = df['Sentence'].apply(preprocess_text)

In [7]:
df[['pre_text', 'Sentence', 'year', 'Smell_Source', 'source_list']]

Unnamed: 0,pre_text,Sentence,year,Smell_Source,source_list
0,the stomach normally has an acid reaction but ...,"The stomach normally has an acid reaction , bu...",1940,Hydrochloric acid,[hydrochloric acid]
1,t here the solid matter is copious pale and f...,"357 T \ here the solid matter is copious , pal...",1863,Pepsine|the motions,"[pepsine, motion]"
2,but thefe rather prove the vifibi lity of the ...,But thefe rather prove the Vifibi - lity of th...,1894,riolous|Perfons|Principles,"[riolous, perfon, principle]"
3,t is caused a great deal of disturbance to the...,T - ' is caused a great deal of disturbance to...,1959,lov,[lov]
4,a manual of bagteriolooy the odour of hawthor...,268 .312 A MANUAL OF BAGTERIOLOOY the odour of...,1893,Hawthorn|MANUAL,"[hawthorn, manual]"
...,...,...,...,...,...
1360685,then pendulous like amethysts and rubies purpl...,"Then pendulous , like amethysts And rubies , p...",1917,goldenrods|blossoms of the haw|calmias,"[goldenrod, blossom haw, calmias]"
1360686,no doubt in time he would prefer it but he wis...,No doubt in time he would prefer it ; but he w...,1892,bygone cabbage|kindred,"[bygone cabbage, kindre]"
1360688,then at an odor stealing inward through the ke...,"Then , at an odor stealing inward through the ...",1923,Ether|of,"[ether, ]"
1360691,thereafter the giantesses headed by kuntî said...,Thereafter the giantesses headed by Kuntî said...,1884,Garland|Flower|Powder|Cloth|Ointment|Incense|J...,"[garland, flower, powder, cloth, ointment, inc..."


In [28]:
def filter_df_by_sources(source_lists, match_sources):
    match_sources_set = set(match_sources)  # Convert list to set for faster operations
    # Check if any item in the row's source list is in the match sources
    return [any(item in match_sources_set for item in source_list) for source_list in source_lists]

bloemen = ['flower', 'flowers']
df['bloem'] = filter_df_by_sources(df['source_list'], bloemen)
flower = df[df['bloem']]
flower

Unnamed: 0.1,Unnamed: 0,Lang,year,Smell_Source_lemma,Quality_lemma,Sentence_lemma,Corpus,Smell_Source,Quality,Sentence,source_list,pre_text,bloem
72,72,en,1927,inflammable air | fcecal matter | flower,dangerous | unnatural | agreeable | noisome | ...,â   head flower send we contain twenty - thr...,medical-heritage,inflammable air|the fcecal matter|Flower,dangerous|unnatural|more agreeable|noisome|var...,â   The head of flowers sent us contained tw...,"[inflammable air, fcecal matter, flower]",â the head of flowers sent us contained twenty...,True
73,73,en,1927,inflammable air | fcecal matter | flower,dangerous | unnatural | agreeable | noisome | ...,"month .after sea , itlie bung take cask , send...",medical-heritage,inflammable air|the fcecal matter|Flower,dangerous|unnatural|more agreeable|noisome|var...,"In about a month .after it has been at sea , w...","[inflammable air, fcecal matter, flower]",in about a month after it has been at sea when...,True
74,74,en,1927,inflammable air | fcecal matter | flower,dangerous | unnatural | agreeable | noisome | ...,"perfectly healthy date digedive organ , probab...",medical-heritage,inflammable air|the fcecal matter|Flower,dangerous|unnatural|more agreeable|noisome|var...,In a perfectly healthy date of the digedive or...,"[inflammable air, fcecal matter, flower]",in a perfectly healthy date of the digedive or...,True
75,75,en,1927,inflammable air | fcecal matter | flower,dangerous | unnatural | agreeable | noisome | ...,) strasburg turpentine .this resin geneâ ¬ ral...,medical-heritage,inflammable air|the fcecal matter|Flower,dangerous|unnatural|more agreeable|noisome|var...,) Strasburg Turpentine .This resin is geneÂ ¬ ...,"[inflammable air, fcecal matter, flower]",strasburg turpentine this resin is geneâ rall...,True
81,81,en,1855,flower,,"western side strait , â   vegetation stunt ,...",medical-heritage,Flower,,"On the western side of the Straits , â   veg...",[flower],on the western side of the straits â vegetatio...,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1360656,80502,en,1919,flower,odorous | spiky | miasmic,"feel journey , back security coast ragged unio...",wikisource,the very flowers,odorous|spiky|miasmic,"But what they had felt on that journey , back ...",[flower],but what they had felt on that journey back to...,True
1360661,80507,en,1924,flower | shiraz wine,scented | flower | golden,porter pass singly twos three .the last tall n...,wikisource,Flower|Shiraz wine,scented|flower|golden,The porters passed in singly and by twos and t...,"[flower, shiraz wine]",the porters passed in singly and by twos and t...,True
1360667,80513,en,1919,flower |,lovely | fragrant,", value married success happiness...",wikisource,Flower|they,lovely|fragrant,""" Don ' t do that , as you value your married ...","[flower, ]",don t do that as you value your married succe...,True
1360691,80537,en,1884,garland | flower | powder | cloth | ointment |...,lotus - scented | vârshika - scented | - scent...,thereafter giantess head kuntî say unto lord :...,wikisource,Garland|Flower|Powder|Cloth|Ointment|Incense|J...,lotus - scented|Vârshika - scented|- scente...,Thereafter the giantesses headed by Kuntî said...,"[garland, flower, powder, cloth, ointment, inc...",thereafter the giantesses headed by kuntî said...,True


In [31]:
import pandas as pd

random_rows = flower.sample(n=3)

# Print the entire sentence for each randomly selected row
for index, row in random_rows.iterrows():
    print(row['Sentence'])


â   42 .back , with acute angular margins , pale green , sometimes rugged , with rust - coloured dots , sometimes without , and quite smooth , from an inch and a half to six inches in length ; scape round , slender , from a foot to two feet in length ; flowers sweet - smelling , elegantly variegated with purple , violet , yel - low , and red ; petals four .â   Native of the woods in Jamaica and Hispaniola .
Sympathy Must call her in Love ' s name ! and then , I know , She rises up , and brightens , as she should , And lights her smile for comfort , and is slow In nothing of high -  hearted fortitude .To smell this flower , come near it : such can grow In that sole garden where Christ ' s brow dropped blood .  66 SONNETS .
Yet let us take courage , illustrious nobles and chieftains , true friends and loyal subjects , — let us aspire to that heaven , where all is 109 Aztec Civilisation Various musical instruments were placed on the top of the tower , and the sound of them , accompani

In [30]:
flower[['pre_text', 'Sentence', 'year', 'Smell_Source', 'source_list']]

Unnamed: 0,pre_text,Sentence,year,Smell_Source,source_list
72,â the head of flowers sent us contained twenty...,â   The head of flowers sent us contained tw...,1927,inflammable air|the fcecal matter|Flower,"[inflammable air, fcecal matter, flower]"
73,in about a month after it has been at sea when...,"In about a month .after it has been at sea , w...",1927,inflammable air|the fcecal matter|Flower,"[inflammable air, fcecal matter, flower]"
74,in a perfectly healthy date of the digedive or...,In a perfectly healthy date of the digedive or...,1927,inflammable air|the fcecal matter|Flower,"[inflammable air, fcecal matter, flower]"
75,strasburg turpentine this resin is geneâ rall...,) Strasburg Turpentine .This resin is geneÂ ¬ ...,1927,inflammable air|the fcecal matter|Flower,"[inflammable air, fcecal matter, flower]"
81,on the western side of the straits â vegetatio...,"On the western side of the Straits , â   veg...",1855,Flower,[flower]
...,...,...,...,...,...
1360656,but what they had felt on that journey back to...,"But what they had felt on that journey , back ...",1919,the very flowers,[flower]
1360661,the porters passed in singly and by twos and t...,The porters passed in singly and by twos and t...,1924,Flower|Shiraz wine,"[flower, shiraz wine]"
1360667,don t do that as you value your married succe...,""" Don ' t do that , as you value your married ...",1919,Flower|they,"[flower, ]"
1360691,thereafter the giantesses headed by kuntî said...,Thereafter the giantesses headed by Kuntî said...,1884,Garland|Flower|Powder|Cloth|Ointment|Incense|J...,"[garland, flower, powder, cloth, ointment, inc..."


In [32]:
flower.to_csv('flowers.csv')