# Preprocessing - "The Office" dataset
This notebook aims to provide parameterizable functions to preprocess the "The Office" dataset for further NLP analysis. 

In [42]:
import numpy as np
import pandas as pd

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, MWETokenizer
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
import contractions

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer


PATH = "../data/"
FILE = "the-office-lines_scripts.csv"

In [24]:
df = pd.read_csv(PATH+FILE, sep=",", index_col="id")

In [25]:
# concatenate line_text for each scene
def concatenate_scenes(df):
    df = df.groupby(["season", "episode", "scene"])["line_text"].apply(lambda x: " ".join(x)).reset_index()
    return df

In [26]:
def extract_directorals(df):
    # extract text from line_text in square brackets, put it in new column called "directionals", multiple square brackets will be extracted as a list
    df["directionals"] = df["line_text"].str.extractall(r"\[(.*?)\]").unstack().apply(lambda x: ", ".join(x.dropna()), axis=1)
    # delete the extracted text from line_text
    df["line_text"] = df["line_text"].str.replace(r"\[(.*?)\]", "", regex=True).str.strip()
    return df


In [27]:
# bare string preprocessing
def remove_punctuation(df):
    return df["line_text"].str.replace(r"[^\w\s]", "", regex=True)

def lower(df):
    return df["line_text"].apply(lambda x: x.lower())

def remove_stopwords(df):
    stop_words = set(stopwords.words('english'))
    return df["line_text"].apply(lambda x: " ".join([word for word in word_tokenize(x) if word not in stop_words]))

def expanding_contractions(df):
    return df["line_text"].apply(lambda x: contractions.fix(x))


In [87]:
def tokenize(df, tokenizer="TreeBankWord", tokenize_specialwords=True):
    if tokenizer=="TreeBankWord":
        t = nltk.tokenize.TreebankWordTokenizer()
    elif tokenizer=="WordPunct":
        t = nltk.tokenize.WordPunctTokenizer()
    elif tokenizer=="Whitespace":
        t = nltk.tokenize.WhitespaceTokenizer()
    else:
        raise ValueError(f"Tokenizer {tokenizer} does not exist.")

    tmp = df["line_text"].apply(lambda x: t.tokenize(x))

    if tokenize_specialwords:
        names = pd.read_csv(PATH+"character_names.csv", sep=";", encoding='cp1252').Character.values
        names = names.tolist()
        names.extend([name.lower() for name in names])
        with open(PATH+"compound_words_the-office_by_chatgpt.txt", "r") as f:
            compound_words = f.read().split(",")
        compound_words = [word.strip() for word in compound_words]
        compound_words.extend([w.lower() for w in compound_words])
        special_words = names + compound_words
        special_tokenizer = MWETokenizer([w.split(" ") for w in special_words])
        return tmp.apply(lambda x: special_tokenizer.tokenize(x))
    
    return tmp

def lemmatize(df):
    wordnet_lemmatizer = WordNetLemmatizer()
    # is working, but not very good results because of the simple speech of the characters
    return df["line_text"].apply(lambda x: " ".join([wordnet_lemmatizer.lemmatize(word) for word in word_tokenize(x)]))

def stem(df):
    porter_stemmer = PorterStemmer()
    return df["line_text"].apply(lambda x: " ".join([porter_stemmer.stem(word) for word in word_tokenize(x)]))

# tagging
def pos_tag(df):
    return df["line_text"].apply(lambda x: nltk.pos_tag(word_tokenize(x)))

In [89]:
def preprocess(
        df, 
        concat_scenes=False, 
        extract_direc=False, 
        remove_punct=False, 
        rmv_stopwords=False, 
        lwr=False, 
        exp_contractions=False, 
        conversion:str=None,
        tokenizer=(None, False) # parameter for tokenize function (tokenizer(string), tokenize_specialwords(bool)), only used if conversion is "tokenize"
        )->pd.DataFrame:
    if (concat_scenes):
        df = concatenate_scenes(df)
    if (extract_direc):
        df = extract_directorals(df)

    if (remove_punct):
        df['line_text'] = remove_punctuation(df)
    if (lwr):
        df['line_text'] = lower(df)
    if (rmv_stopwords):
        df['line_text'] = remove_stopwords(df)
    if (exp_contractions):
        df['line_text'] = expanding_contractions(df)   

    if (conversion == "tokenize"):
        df['line_text']  = tokenize(df, tokenizer[0], tokenizer[1])
    elif (conversion == "lemmatize"):
        df['line_text'] = lemmatize(df)
    elif (conversion == "stem"):
        df['line_text'] = stem(df)
    elif (conversion == "pos_tag"):
        df['line_text'] = pos_tag(df)


    return df
        

In [77]:
# Parameters
param_dict = {
    "concat_scenes": True,
    "extract_direc": True, 
    "remove_punct": True, 
    "rmv_stopwords": True,
    "lwr": True, 
    "exp_contractions": True,
    "conversion": "tokenize",
    "tokenizer": ("TreeBankWord", True)
}


In [90]:
preprocessed_df = preprocess(df, **param_dict)

pd.set_option("display.max_colwidth", None)
preprocessed_df

Unnamed: 0,season,episode,scene,line_text,directionals
0,1,1,1,"[right, jim, quarterlies, look, good, things, library, oh, told, could, close, come, master, guidance, saying, grasshopper, actually, called, yeah, right, well, let, show, done]",
1,1,1,2,"[phone, yes, id, like, speak, office, manager, please, yes, hello, michael_scott, regional_manager, dunder_mifflin, paper, products, wanted, talk, manageramanger, quick, cut, scene, right, done, deal, thank, much, sir, gentleman, scholar, oh, sorry, ok, sorry, mistake, hangs, woman, talking, low, voice, probably, smoker, clears, throat, way, done]",
2,1,1,3,"[uh, dunder_mifflin, 12, years, last, four, regional_manager, want, come, see, entire, floor, kingdom, far, eye, see, receptionist, pam, pam, pampam, pam_beesly, pam, us, forever, right, pam, well, know, think, cute, seen, couple, years, ago, growls, messages, uh, yeah, fax, oh, pam, corporate, many, times, told, special, filing, cabinet, things, corporate, told, called, wastepaper, basket, look, look, face]",
3,1,1,4,"[people, say, best, boss, go, god, never, worked, place, like, hilarious, get, best, us, shows, camera, worlds, best, boss, mug, think, pretty, much, sums, found, spencer, gifts]",
4,1,1,5,"[singing, shall, play, pa, rum, pump, um, pum, imitates, heavy, drumming, gifts, pa, rum, pump, um, pum, imitates, heavy, drumming]",
...,...,...,...,...,...
9156,9,23,112,"[seems, arbitrary, applied, job, company, hiring, took, desk, back, empty, chuckles, matter, get, end, human, beings, miraculous, gift, make, place, home, standing, two, cops, let]",
9157,9,23,113,"[feel, lucky, got, chance, share, crummy, story, anyone, thinks, one, take, dump, paper, shredder, alone, sister, let, get, beer, sometime]",
9158,9,23,114,"[happy, filmed, remember, everyone, worked, paper, company, years, never, wrote, anything]",
9159,9,23,115,"[sold, paper, company, 12, years, job, speak, clients, phone, quantities, types, copier, paper, even, love, every, minute, everything, owe, job, stupid, wonderful, boring, amazing, job]",


In [32]:
# feature extraction
def extract_features(df, vectorizer):
    if vectorizer == "binary":
        vectorizer = CountVectorizer(binary=True)
    elif vectorizer == "count":
        vectorizer = CountVectorizer() 
    elif vectorizer == "tfidf":
        vectorizer = TfidfVectorizer()
    elif vectorizer == "hashing":
        vectorizer = HashingVectorizer()

    result = vectorizer.fit_transform(df["line_text"])
    return result

def feature_selection (feature_df, selection_method):
    # TODO: add feature selection e.g. DF (document frequency)
    print("nothin here yet")

In [33]:
# feature extraction
param_dict = {
    "concat_scenes": False,
    "extract_direc": False, 
    "remove_punct": True, 
    "rmv_stopwords": False,
    "lwr": True, 
    "exp_contractions": True,
    "conversion": "lemmmatize"
}
test = preprocess(df, **param_dict)
feature_df = extract_features(test, "count")
feature_df.shape

(59911, 22850)

In [34]:
# save the preprocessed data
df.to_csv(PATH+"preprocessed_"+FILE, sep=",", index=True)
feature_df.to_csv(PATH+"feature_"+FILE, sep=",", index=True)

AttributeError: to_csv not found