# Preprocessing - "The Office" dataset
This notebook demonstrates parameterizable functions for preprocessing the dataset of "The Office" script lines ahead of further NLP analysis.

In [1]:
import pandas as pd

# Name of the CSV file that holds the dataset.
FILE = "the-office-lines_scripts.csv"
# Relative directory that contains FILE (kept with its trailing slash so
# PATH + FILE forms the full path).
PATH = "../data/"

In [2]:
# Load the CSV into a DataFrame, using its "id" column as the index.
df = pd.read_csv(
    f"{PATH}{FILE}",
    sep=",",
    index_col="id",
)

In [3]:
# Parameters: options that are forwarded as keyword arguments to `preprocess`.
# See `preprocessing_nlp` for the meaning of each flag.
param_dict = dict(
    concat_scenes=True,
    extract_direc=True,
    remove_punct=True,
    rmv_stopwords=True,
    lwr=True,
    exp_contractions=True,
    conversion="tokenize",
    tokenizer=("TreeBankWord", True),
)


In [4]:
from preprocessing_nlp import preprocess

# Lift the column-width limit so long cell values are rendered in full
# rather than truncated when the frame is displayed.
pd.set_option("display.max_colwidth", None)

# Run the preprocessing on the loaded frame with the options in `param_dict`.
preprocessed_df = preprocess(df, **param_dict)

# Last expression of the cell: renders the resulting frame.
preprocessed_df

Unnamed: 0,season,episode,scene,line_text,directionals
0,1,1,1,"[right, jim, quarterlies, look, good, things, library, oh, told, could, not, close, you, have, come, master, guidance, you, are, saying, grasshopper, actually, called, yeah, right, well, let, show, done]",
1,1,1,2,"[yes, id, like, speak, office, manager, please, yes, hello, michael_scott, regional_manager, dunder_mifflin, paper, products, wanted, talk, manageramanger, right, done, deal, thank, much, sir, you, are, gentleman, scholar, oh, i, am, sorry, ok, i, am, sorry, mistake, woman, talking, low, voice, probably, smoker, that, is, way, done]","on the phone, quick cut scene, hangs up, Clears throat"
2,1,1,3,"[i, have, uh, i, have, dunder_mifflin, 12, years, last, four, regional_manager, want, come, see, entire, floor, kingdom, far, eye, see, receptionist, pam, pam, pampam, pam_beesly, pam, us, forever, right, pam, well, do, not, know, think, she, is, cute, seen, couple, years, ago, messages, uh, yeah, fax, oh, pam, corporate, many, times, told, there, is, special, filing, cabinet, things, corporate, have, not, told, called, wastepaper, basket, look, look, face]",growls
3,1,1,4,"[people, say, best, boss, go, god, we, have, never, worked, place, like, you, are, hilarious, get, best, us, think, pretty, much, sums, found, spencer, gifts]",shows the camera his WORLD'S BEST BOSS mug
4,1,1,5,"[shall, play, pa, rum, pump, um, pum, gifts, pa, rum, pump, um, pum]","singing, Imitates heavy drumming, Imitates heavy drumming"
...,...,...,...,...,...
8844,9,23,112,"[seems, arbitrary, applied, job, company, hiring, took, desk, back, empty, matter, get, end, human, beings, miraculous, gift, make, place, home, let]","chuckles, standing with two cops"
8845,9,23,113,"[feel, lucky, got, chance, share, crummy, story, anyone, thinks, one, take, dump, paper, shredder, alone, sister, let, get, beer, sometime]",
8846,9,23,114,"[happy, filmed, remember, everyone, worked, paper, company, years, never, wrote, anything]",
8847,9,23,115,"[sold, paper, company, 12, years, job, speak, clients, phone, quantities, types, copier, paper, even, love, every, minute, everything, owe, job, stupid, wonderful, boring, amazing, job]",


In [5]:
from preprocessing_nlp import extract_features

# Feature extraction.
# These options are kept under their own name so that the `param_dict` defined
# for the earlier preprocessing cell is not overwritten; otherwise re-running
# that cell after this one would silently pick up these settings instead.
feature_param_dict = {
    "concat_scenes": False,
    "extract_direc": False,
    "remove_punct": True,
    "rmv_stopwords": False,
    "lwr": True,
    "exp_contractions": True,
    # NOTE(review): previously spelled "lemmmatize" (three m's); confirm that
    # "lemmatize" is the value `preprocess` expects.
    "conversion": "lemmatize",
}
test = preprocess(df, **feature_param_dict)

# NOTE(review): the features are built from `df`, not from `test` computed
# above -- confirm whether the preprocessed frame was meant to be passed here.
feature_df = extract_features(df, "count")

# Last expression of the cell: displays the (rows, columns) of the feature frame.
feature_df.shape

(59911, 20866)

In [6]:
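# Save the preprocessed data (disabled; uncomment the lines below to write the CSV files).
# NOTE(review): the first line originally wrote the raw `df` under the "preprocessed_" name;
# `preprocessed_df` looks like the intended frame -- confirm before enabling.
# preprocessed_df.to_csv(PATH+"preprocessed_"+FILE, sep=",", index=True)
# feature_df.to_csv(PATH+"feature_"+FILE, sep=",", index=True)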