# Text Preprocessing with NLTK

### Explore and Clean Data

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Read the essays dataset
essays = pd.read_csv('sweep8_essays.csv')

In [4]:
# Quickly explore the data
essays.head()
essays.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22146 entries, 0 to 22145
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   ID          22146 non-null  object
 1   Essay Text  22146 non-null  object
dtypes: object(2)
memory usage: 346.2+ KB


Check for duplicates in the dataset

In [10]:
# Identify the column(s) with the observations
observation_columns = ['ID', 'Essay Text']  

# Check for duplicates
duplicates = essays.duplicated(subset=observation_columns)

# Determine the result
if duplicates.any():
    print("Duplicate observations found.")
else:
    print("No duplicate observations.")

No duplicate observations.


In [15]:
# Drop the duplicate observations
essays.drop_duplicates(subset=observation_columns, inplace=True)

# Save the modified data to a new CSV file
essays.to_csv('essays.csv', index=False)

In [16]:
essays.info()

<class 'pandas.core.frame.DataFrame'>
Index: 14752 entries, 0 to 14762
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   ID          14752 non-null  object
 1   Essay Text  14752 non-null  object
dtypes: object(2)
memory usage: 345.8+ KB


### Text Preprocessing

In [66]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
import string

nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/ninabucekova/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/ninabucekova/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/ninabucekova/nltk_data...


True

In [34]:
# Text of essays into string
essays["Essay Text"] = essays["Essay Text"].astype(str)

In [67]:
# Write functions for text preprocessing

def preprocess_text_column(csv_file, text_column, new_column):
    # Read the CSV file into a Pandas DataFrame
    df = pd.read_csv(csv_file)
    
    # Preprocess the text in the specified column
    df[new_column] = df[text_column].apply(preprocess_text)
    
    # Save the updated DataFrame back to the CSV file
    df.to_csv(csv_file, index=False)

    
def preprocess_text(text):
    # Convert the text to lowercase
    text = text.lower()

    # Remove punctuation
    text = "".join([char for char in text if char not in string.punctuation])

    # Tokenize the text
    tokens = word_tokenize(text)

    # Remove stop words
    stop_words = stopwords.words('english')
    filtered_words = [word for word in tokens if word not in stop_words]

    # Lemmatize the words
    lemmatizer = WordNetLemmatizer()
    lemmatized_words = [lemmatizer.lemmatize(word) for word in filtered_words]

    # Perform POS tagging
    pos_tags = pos_tag(filtered_words)

    # Return the preprocessed text
    return lemmatized_words

In [68]:
preprocess_text_column('essays.csv', 'Essay Text', 'Preprocessed Text')

In [69]:
essays_prep = pd.read_csv('essays.csv')
essays_prep.head()

Unnamed: 0,ID,Essay Text,Preprocessed Text
0,N28280Y,"I am happily married, we are grand-parents. Ou...","['happily', 'married', 'grandparent', 'two', '..."
1,N13960Q,"I am retired, not living in London, probably i...","['retired', 'living', 'london', 'probably', 'n..."
2,N23786Z,I imagine I'll still be teaching french at Pri...,"['imagine', 'ill', 'still', 'teaching', 'frenc..."
3,N17606R,I am retired from work. I enjoy leisurely time...,"['retired', 'work', 'enjoy', 'leisurely', 'tim..."
4,N19466F,"Retired and moved further away from London, Su...","['retired', 'moved', 'away', 'london', 'sussex..."


### Some Basic Text Statistics

In [80]:
# Perform basic text statistics on the 'text_column' column
text_essays = nltk.Text(essays_prep['Preprocessed Text'])
type(text_essays)

nltk.text.Text

In [74]:
# Most Frequent words
from nltk import FreqDist
freq_distribution = FreqDist(text_essays)
freq_distribution.most_common(20)

[('[]', 18),
 ("['comment']", 15),
 ("['idea']", 13),
 ("['dont', 'know']", 12),
 ("['much']", 10),
 ("['hope']", 6),
 ("['change']", 5),
 ("['enjoying', 'life']", 4),
 ("['much', '50']", 4),
 ("['hopefully', 'today']", 4),
 ("['response']", 4),
 ("['dont', 'want', 'think']", 4),
 ("['fortune', 'teller']", 4),
 ("['hopefully']", 4),
 ("['dont', 'think', 'ill', 'make', '60']", 4),
 ("['today']", 4),
 ("['happily', 'married', 'grandparent', 'two', 'daughter', 'happy', 'fulfilling', 'career', 'retirement', 'enjoying', 'self', 'ache', 'pain', 'exercising', 'taking', 'long', 'walk', 'enjoying', 'retirement', 'enjoying', 'hobby', 'staying', 'active']",
  2),
 ("['retired', 'living', 'london', 'probably', 'nottingham', 'partner', 'current', 'partner', 'health', 'quite', 'good', 'havent', 'succumbed', 'something', 'acute', 'illness', 'like', 'cancer', 'playing', 'piano', 'guitar', 'pleasure', 'perhaps', 'studying', 'mathematics', 'physic', 'still', 'money', 'worry', 'mainly', 'unfair', 'tax', 

In [79]:
# Finding collocations
text_essays.collocations(3)

['1', 'able', 'spend', 'time', 'hobby', 'le', 'time', 'work', '2',
'probably', 'starting', 'slow', 'due', 'health', '3', 'interested',
'keeping', 'happy', 'spending', 'much', 'time', 'possible', 'wife',
'4', 'trying', 'accomplish', 'thing', 'missed', 'bringing', 'child']
['wife', 'would', 'health', 'permitting', 'would', 'scratcing',
'living', 'work', 'normal', 'outlook', 'life', 'would', 'plot',
'land', 'lie', 'equal', 'ha', 'ha', 'nice', 'chit', 'chatting']; ['1',
'car', 'racing', 'motor', 'bike', 'animal', '2', 'wish', 'could',
'better', 'let', 'hope', '3', 'could', 'better'] ['work', 'parttime',
'spend', 'half', 'walking', 'husband', 'long', 'weekend', 'away',
'attending', 'course', 'art', 'ot', 'textile', 'interested',
'meeting', 'friend', 'visiting', 'daughter', 'home', 'healthy',
'age']; ['1', 'changed', 'direction', 'careerjob', 'maybe',
'finished', 'work', 'altogether', '2', 'continue',
'involvedinterested', 'church', 'religious', 'duty', '3', 'continue',
'icemake', 'cake', '4