In [2]:
import pandas as pd
import csv
import os

In [24]:
# Read in the data
with open("text-variants/01 Objects.txt", "r") as file:
    objects = file.readlines()

with open("text-variants/02 Food.txt", "r") as file:
    food = file.readlines()

with open("text-variants/03 Rooms.txt", "r") as file:
    rooms = file.readlines()

In [55]:
# Prepare dataframe for transforming txt files into csv
df = pd.DataFrame(columns=["category", "title", "portrait"])

In [56]:
# Read txts, and format into dataframe. Categories are objects, food, and rooms. Titles appear in the txts as lines in all caps. Portraits are the lines that follow the titles. Combine portrait lines into one string per title.
def read_txt(txt, category):
    title = ""
    portrait = ""
    title_counts = {}
    for line in txt:
        line = line.strip()
        if line.isupper():
            if title != "":
                df.loc[len(df)] = [category, title, portrait]
            title = line
            if title in title_counts:
                title_counts[title] += 1
                title = f"{title} [{title_counts[title]}]"
            else:
                title_counts[title] = 1
            portrait = ""
        else:
            if portrait != "":
                portrait += "\n"
            portrait += line
    if title != "":
        df.loc[len(df)] = [category, title, portrait]
            
read_txt(objects, "objects")
read_txt(food, "food")
read_txt(rooms, "rooms")

In [57]:
df

Unnamed: 0,category,title,portrait
0,objects,"A CARAFE, THAT IS A BLIND GLASS.","A kind in glass and a cousin, a spectacle and ..."
1,objects,GLAZED GLITTER.,"Nickel, what is nickel, it is originally rid o..."
2,objects,A SUBSTANCE IN A CUSHION.,The change of color is likely and a difference...
3,objects,A BOX.,Out of kindness comes redness and out of ruden...
4,objects,A PIECE OF COFFEE.,More of double.\n\nA place in no new table.\n\...
...,...,...,...
105,food,ORANGE IN.,Go lack go lack use to her.\n\nCocoa and clear...
106,food,SALAD DRESSING AND AN ARTICHOKE.,"Please pale hot, please cover rose, please acr..."
107,food,SALAD DRESSING AND AN ARTICHOKE. [2],It was please it was please carriage cup in an...
108,food,A CENTRE IN A TABLE.,"It was a way a day, this made some sum. Suppos..."


In [28]:
# Write dataframe to csv
df.to_csv("list.csv", index=False)

In [30]:
%pip install nltk contractions

Collecting contractions
  Downloading contractions-0.1.73-py2.py3-none-any.whl (8.7 kB)
Collecting textsearch>=0.0.21 (from contractions)
  Downloading textsearch-0.0.24-py2.py3-none-any.whl (7.6 kB)
Collecting anyascii (from textsearch>=0.0.21->contractions)
  Downloading anyascii-0.3.2-py3-none-any.whl (289 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m289.9/289.9 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hCollecting pyahocorasick (from textsearch>=0.0.21->contractions)
  Downloading pyahocorasick-2.1.0-cp311-cp311-macosx_10_9_universal2.whl (63 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m63.7/63.7 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pyahocorasick, anyascii, textsearch, contractions
Successfully installed anyascii-0.3.2 contractions-0.1.73 pyahocorasick-2.1.0 textsearch-0.0.24
Note: you may need to restart the kernel to use updated packages.


In [58]:
# Now we are going to create some word lists.
# We need to clean the text first, removing punctuation and making everything lowercase.
import nltk
import re
import contractions
from nltk.tokenize import word_tokenize

df_clean = df

# Define a function to expand contractions
def expand_contractions(text):
    return contractions.fix(text)

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer

nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

def preprocess_words(text):
    # Convert to lowercase
    text = text.lower()

    # Expand contractions
    text = expand_contractions(text)

    # Remove punctuation
    text = re.sub(r'[^\w\s]', '', text)

    # Tokenize the text
    words = word_tokenize(text)

    # Remove stopwords
    words = [word for word in words if word not in stop_words]

    return words

# Apply the preprocessing to the 'title' and 'portrait' columns
df_clean['title'] = df_clean['title'].apply(preprocess_words)
df_clean['portrait'] = df_clean['portrait'].apply(preprocess_words)

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/mderdun/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/mderdun/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /Users/mderdun/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/mderdun/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /Users/mderdun/nltk_data...


AttributeError: 'list' object has no attribute 'lower'

In [59]:
df_clean

Unnamed: 0,category,title,portrait
0,objects,"[carafe, blind, glass]","[kind, glass, cousin, spectacle, nothing, stra..."
1,objects,"[glazed, glitter]","[nickel, nickel, originally, rid, cover, chang..."
2,objects,"[substance, cushion]","[change, color, likely, difference, little, di..."
3,objects,[box],"[kindness, comes, redness, rudeness, comes, ra..."
4,objects,"[piece, coffee]","[double, place, new, table, single, image, spl..."
...,...,...,...
105,food,[orange],"[go, lack, go, lack, use, cocoa, clear, soup, ..."
106,food,"[salad, dressing, artichoke]","[please, pale, hot, please, cover, rose, pleas..."
107,food,"[salad, dressing, artichoke, 2]","[please, please, carriage, cup, icecream, icec..."
108,food,"[centre, table]","[way, day, made, sum, suppose, cod, liver, cod..."


In [61]:
# Initialize a list to store the data
data = []

# Iterate over the existing DataFrame
for index, row in df_clean.iterrows():
    title_words = row['title']
    portrait_words = row['portrait']

    # Add title words to the data list
    for word in title_words:
        key = f"{index}to"  # Create the key
        if any(d['word'] == word for d in data):
            for d in data:
                if d['word'] == word:
                    if key not in d['appears_in']:
                        d['appears_in'].append(key)
        else:
            data.append({'word': word, 'appears_in': [key]})

    # Add portrait words to the data list
    for word in portrait_words:
        key = f"{index}po"  # Create the key
        if any(d['word'] == word for d in data):
            for d in data:
                if d['word'] == word:
                    if key not in d['appears_in']:
                        d['appears_in'].append(key)
        else:
            data.append({'word': word, 'appears_in': [key]})

# Convert the data list to a DataFrame
word_df = pd.DataFrame(data)

In [62]:
word_df

Unnamed: 0,word,appears_in
0,carafe,[0to]
1,blind,"[0to, 26po, 61po, 109po]"
2,glass,"[0to, 0po, 11po, 47po, 59po, 80po, 109po]"
3,kind,"[0po, 2po, 11po, 24po, 38po, 54po, 58po, 60po,..."
4,cousin,[0po]
...,...,...
2511,breathing,[109po]
2512,incredible,[109po]
2513,justice,[109po]
2514,magnificent,[109po]


In [63]:
# Write the word DataFrame to a CSV file
word_df.to_csv("words.csv", index=False)

In [64]:
# Copy all the words to clipboard
word_df['word'].to_clipboard(index=False, header=False)

In [68]:
# Extract all the unique words in square brackets from word_edit.txt.
# This is a list of words in capital letters that are not in the dictionary, followed by words in square brackets that are in the dictionary.
with open('word_edit.txt', 'r') as file:
    content = file.read()

lowercase_content = content.lower()

with open('word_edit.txt', 'w') as file:
    file.write(lowercase_content)

In [71]:
import pandas as pd
import re

# Read the words.csv file into a DataFrame
df = pd.read_csv('words.csv')

# Open word_edit.txt and process each line
with open('word_edit.txt', 'r') as file:
    for line in file:
        # Search for the unbracketed word and the bracketed word
        match = re.search(r'(\w+)\s*\[(\w+)\]', line)

        # If a match was found
        if match:
            unbracketed_word, bracketed_word = match.groups()

            # Find the row in the DataFrame that matches the unbracketed word
            df.loc[df['word'] == unbracketed_word, 'word'] += f' [{bracketed_word}]'

# Save the updated DataFrame back to words.csv
df.to_csv('words.csv', index=False)

In [73]:
# Load the CSV file into a DataFrame
sm_norms = pd.read_csv('sensorimotor_norms.csv')
words = pd.read_csv('words.csv')

# Convert all string data to lowercase
sm_norms = sm_norms.applymap(lambda s: s.lower() if type(s) == str else s)

In [75]:
# Save the updated DataFrame back to sensorimotor_norms.csv
sm_norms.to_csv('sensorimotor_norms.csv', index=False)

In [76]:
# Iterate over the 'Word' column in sm_norms
for index, row in sm_norms.iterrows():
    word = row['Word']
    # Get all columns other than 'Word'
    info_cols = row[sm_norms.columns.difference(['Word'])]

    # Find the corresponding word in the 'words' DataFrame
    # This could be either the unbracketed word or the word in square brackets
    mask = words['word'].str.contains(rf'\b{word}\b', regex=True)

    # Append all the information columns in sm_norms to the corresponding row in words
    for col in info_cols.index:
        if col not in words.columns:
            words[col] = None
        words.loc[mask, col] = info_cols[col]

In [80]:
# Save the updated DataFrame to new words_with_sm_norms.csv
words.to_csv('words_with_sm_norms.csv', index=False)