## The CBS Secret Sauce Investigation: NCIS:NLP -- Scraping & Cleaning

#### Creator: Mitch Brinkman

In [1]:
from bs4 import BeautifulSoup
import requests

In [16]:
import pandas as pd
import numpy as np
import datetime as dt
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.formula.api as smf
import patsy
import re
import pickle
import string
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import word_tokenize, TreebankWordTokenizer
from nltk.stem import PorterStemmer, WordNetLemmatizer 
from nltk.util import ngrams
from nltk.corpus import stopwords
import nltk
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /Users/user/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/user/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [17]:
from sklearn.feature_extraction import text 
from sklearn.feature_extraction.text import CountVectorizer

In [4]:
import sys
sys.setrecursionlimit(20000)

In [31]:
# from cbs_func import NLPProcessor

ImportError: cannot import name 'NLPProcessor' from 'cbs_func' (/Users/user/Desktop/notes_chi20_ds14/CBS_TV_project4/cbs_tv_nlp_proj4/cbs_func.py)

In [6]:
from cbs_func import make_ep_list
from cbs_func import clean_tv_scripts
from cbs_func import ncis_data
from cbs_func import shield_data
from cbs_func import clean_sentiment

### Web Scraping

In [None]:
# Series landing pages on subslikescript.com: NCIS first, The Shield second.
url_list = [
    'https://subslikescript.com/series/NCIS-364845',
    'https://subslikescript.com/series/The_Shield-286486',
]

#### NCIS scripts

In [None]:
# Build the NCIS list from the first URL in url_list; make_ep_list comes from
# cbs_func (presumably it returns the episode links found on that page).
ncis_list = make_ep_list(url_list[0])

In [None]:
# Drop the series path prefix from every entry, updating the list in place.
ncis_list[:] = [entry.replace('/series/NCIS-364845/', '') for entry in ncis_list]

In [None]:
# Build the NCIS frame from the cleaned-up list; ncis_data comes from cbs_func
# (presumably it downloads each episode's script — verify in cbs_func).
ncis_df = ncis_data(ncis_list)

In [None]:
# Index the frame by 'ep_id', then cast every column to str.
ncis_df = ncis_df.set_index('ep_id').astype(str)

In [None]:
# Persist the raw NCIS frame; the context manager closes the file even if the
# dump raises.
with open("./data/raw/ncis_series.pkl", "wb") as raw_file:
    pickle.dump(ncis_df, raw_file)

#### The Shield scripts

In [None]:
# Build The Shield list from the second URL in url_list; make_ep_list comes
# from cbs_func (presumably it returns the episode links found on that page).
shield_list = make_ep_list(url_list[1])

In [None]:
# Drop the series path prefix from every entry, updating the list in place.
shield_list[:] = [entry.replace('/series/The_Shield-286486/', '') for entry in shield_list]

In [None]:
# Build The Shield frame from the cleaned-up list; shield_data comes from
# cbs_func (presumably it downloads each episode's script — verify in cbs_func).
shield_df = shield_data(shield_list)

In [None]:
# Index the frame by 'ep_id', then cast every column to str.
shield_df = shield_df.set_index('ep_id').astype(str)

In [None]:
# Persist the raw Shield frame; the context manager closes the file even if
# the dump raises.
with open("./data/raw/shield_series.pkl", "wb") as raw_file:
    pickle.dump(shield_df, raw_file)

### Cleaning

In [None]:
# Extra words to drop on top of scikit-learn's built-in English stop words.
add_stop_words = ['tony','abby','dinozzo','uh','um']
# CountVectorizer's stop_words accepts 'english', a list or None, so pass a
# list (sorted for a stable order) instead of the frozenset .union() returns.
stop_words = sorted(text.ENGLISH_STOP_WORDS.union(add_stop_words))

In [None]:
def clean_tv_scripts(text):
    '''Normalise one raw script string for tokenisation.

    Steps, in order: blank out runs of two or more capital letters,
    lowercase, blank out text inside [], <> and () (single-line spans only),
    delete punctuation, delete words containing digits, turn newlines into
    spaces, delete music-note symbols and collapse repeated spaces.

    Args:
        text: Raw script text as a single string.

    Returns:
        The cleaned string. Leading/trailing single spaces are not stripped.
    '''
    # Must run before lowercasing, otherwise the capitals are gone.
    text = re.sub(r'[A-Z]{2,}', ' ', text)
    text = text.lower()
    # Bracketed spans are removed while punctuation is still present.
    text = re.sub(r'\[.*?\]', ' ', text)
    text = re.sub(r'<.*?>', ' ', text)
    text = re.sub(r'\(.*?\)', ' ', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub(r'\w*\d\w*', '', text)
    text = text.replace('\n', ' ')
    text = text.replace('♪', '')
    # One pass collapses any run of spaces; fixed-width replacements left
    # double spaces behind (e.g. four spaces became two).
    text = re.sub(r' {2,}', ' ', text)
    return text

def big_wash(x):
    '''Run clean_tv_scripts on a single value and return the result.'''
    return clean_tv_scripts(x)

#### Corpus Prep for Sentiment Analysis

In [7]:
# Reload the raw pickled frames so the notebook can be resumed from this cell.

ncis_df = pd.read_pickle('./data/raw/ncis_series.pkl')
shield_df = pd.read_pickle('./data/raw/shield_series.pkl')

In [8]:
#index reset and str conversion if necessary

# ncis_df.set_index('ep_id',inplace=True)
# shield_df.set_index('ep_id',inplace=True)
# ncis_df = ncis_df.astype(str)
# shield_df = shield_df.astype(str)

In [9]:
# Thin wrapper around clean_sentiment (from cbs_func) for use with Series.apply.
def first_wash(x):
    return clean_sentiment(x)

In [10]:
# Apply the sentiment cleaner to each show's dialogue column and wrap the
# resulting Series in a DataFrame.
clean_ncis = pd.DataFrame(data=ncis_df['dialogue'].apply(first_wash))
clean_shield = pd.DataFrame(data=shield_df['dialogue'].apply(first_wash))

In [12]:
# Persist the sentiment-ready frames; each file is closed as soon as its dump
# finishes (or fails).
with open("./data/processed/shield_sentiment.pkl", "wb") as shield_file:
    pickle.dump(clean_shield, shield_file)
with open("./data/processed/ncis_sentiment.pkl", "wb") as ncis_file:
    pickle.dump(clean_ncis, ncis_file)

#### NLP Pipeline Processing

In [None]:
# Reload the raw pickled frames so the notebook can be resumed from this cell.

ncis_df = pd.read_pickle('./data/raw/ncis_series.pkl')
shield_df = pd.read_pickle('./data/raw/shield_series.pkl')

In [None]:
# Preview the first rows of the reloaded NCIS frame.
ncis_df.head()

In [18]:
class NLPProcessor:
    '''Clean, tokenize, lemmatize and vectorize a corpus of documents.

    The same preprocessing is applied when fitting the vectorizer and when
    building a document-term matrix, so both see identically prepared text.
    '''

    def __init__(self, vectorizer_class, tokenizer_function, cleaning_function, lemmer_function):
        '''Store the pipeline components.

        Args:
            vectorizer_class: A vectorizer *instance* (despite the name)
                exposing ``fit``, ``transform`` and either
                ``get_feature_names_out`` or ``get_feature_names``.
            tokenizer_function: Callable turning one cleaned document into
                an iterable of tokens.
            cleaning_function: Callable turning one raw document into a
                cleaned string.
            lemmer_function: Callable applied to each individual token.
        '''
        self.vectorizer = vectorizer_class
        self.tokenizer = tokenizer_function
        self.cleaning_function = cleaning_function
        self.lemmer = lemmer_function

    def _preprocess(self, corpus):
        '''Return one space-joined string of processed tokens per document.'''
        prepared = []
        for document in corpus:
            tokens = self.tokenizer(self.cleaning_function(document))
            prepared.append(' '.join(self.lemmer(token) for token in tokens))
        return prepared

    def _feature_names(self):
        '''Return the fitted vocabulary, across scikit-learn versions.'''
        # get_feature_names() was removed in scikit-learn 1.2; prefer the
        # replacement and fall back only for older releases.
        getter = getattr(self.vectorizer, 'get_feature_names_out', None)
        if getter is None:
            getter = self.vectorizer.get_feature_names
        return getter()

    def fit(self, corpus_list_to_fit):
        '''Fit the vectorizer on the preprocessed corpus.

        Args:
            corpus_list_to_fit: Iterable of raw document strings.

        Returns:
            Whatever the vectorizer's ``fit`` returns.
        '''
        return self.vectorizer.fit(self._preprocess(corpus_list_to_fit))

    def transform(self, corpus_list_to_clean):
        '''Build a dense document-term DataFrame for the given corpus.

        Args:
            corpus_list_to_clean: Iterable of raw document strings.

        Returns:
            pandas.DataFrame with one row per document and one column per
            vocabulary term. Rows carry a default integer index; the index
            of the input is not preserved.
        '''
        matrix = self.vectorizer.transform(self._preprocess(corpus_list_to_clean))
        return pd.DataFrame(matrix.toarray(), columns=self._feature_names())

In [None]:
# NOTE(review): doc_word, ex_label and vectorizer are not defined anywhere in
# the visible notebook, so this preview cell looks like a leftover — confirm
# before running.
pd.DataFrame(doc_word.toarray(), index=ex_label, columns=vectorizer.get_feature_names()).head(10)

##### NCIS Pipeline

In [34]:
# NCIS document-term matrix using only scikit-learn's English stop words.
nlp = NLPProcessor(
    vectorizer_class=CountVectorizer(stop_words='english'),
    tokenizer_function=TreebankWordTokenizer().tokenize,
    cleaning_function=clean_tv_scripts,
    lemmer_function=WordNetLemmatizer().lemmatize,
)

# Fit the vectorizer on the dialogue column, then vectorize the same column.
nlp.fit(ncis_df['dialogue'])
ncis_dtm = nlp.transform(ncis_df['dialogue'])

In [None]:
ncis_big_vocab = nlp.vectorizer
# NOTE(review): written to the working directory, while the Shield vectorizer
# goes under ./data/cross_vec/ — confirm the intended location.
# The context manager closes the file even if the dump raises.
with open("ncis_cv.pkl", "wb") as cv_file:
    pickle.dump(ncis_big_vocab, cv_file)

In [None]:
# NCIS document-term matrix with the extended stop-word collection and
# document-frequency bounds of 5% and 95%.
nlp = NLPProcessor(
    vectorizer_class=CountVectorizer(stop_words=stop_words, max_df=.95, min_df=.05),
    tokenizer_function=TreebankWordTokenizer().tokenize,
    cleaning_function=clean_tv_scripts,
    lemmer_function=WordNetLemmatizer().lemmatize,
)

# Fit the vectorizer on the dialogue column, then vectorize the same column.
nlp.fit(ncis_df['dialogue'])
ncis_dtm_stopmm = nlp.transform(ncis_df['dialogue'])

In [None]:
ncis_vocab = nlp.vectorizer
# NOTE(review): written to the working directory, while the Shield vectorizers
# go under ./data/cross_vec/ — confirm the intended location.
# The context manager closes the file even if the dump raises.
with open("ncis_cv_stopmm.pkl", "wb") as cv_file:
    pickle.dump(ncis_vocab, cv_file)

##### The Shield Pipeline

In [19]:
# Shield document-term matrix using only scikit-learn's English stop words.
nlp = NLPProcessor(
    vectorizer_class=CountVectorizer(stop_words='english'),
    tokenizer_function=TreebankWordTokenizer().tokenize,
    cleaning_function=clean_tv_scripts,
    lemmer_function=WordNetLemmatizer().lemmatize,
)

# Fit the vectorizer on the dialogue column, then vectorize the same column.
nlp.fit(shield_df['dialogue'])
shield_dtm = nlp.transform(shield_df['dialogue'])

In [21]:
shield_big_vocab = nlp.vectorizer
# Persist the fitted vectorizer; the context manager closes the file even if
# the dump raises.
with open("./data/cross_vec/shield_cv.pkl", "wb") as cv_file:
    pickle.dump(shield_big_vocab, cv_file)

In [None]:
# Shield document-term matrix with scikit-learn's English stop words and
# document-frequency bounds of 5% and 95%.
nlp = NLPProcessor(
    vectorizer_class=CountVectorizer(stop_words='english', max_df=.95, min_df=.05),
    tokenizer_function=TreebankWordTokenizer().tokenize,
    cleaning_function=clean_tv_scripts,
    lemmer_function=WordNetLemmatizer().lemmatize,
)

# Fit the vectorizer on the dialogue column, then vectorize the same column.
nlp.fit(shield_df['dialogue'])
shield_dtm_stopmm = nlp.transform(shield_df['dialogue'])

In [None]:
shield_vocab = nlp.vectorizer
# Persist the fitted vectorizer; the context manager closes the file even if
# the dump raises.
with open("./data/cross_vec/shield_cv_stopmm.pkl", "wb") as cv_file:
    pickle.dump(shield_vocab, cv_file)

##### Pickling & Indexing Each DTM for Modeling

In [35]:
# NLPProcessor.transform builds its frames without an index argument, so copy
# the index of each source frame onto the matching document-term matrix.
# All four pipeline cells must have been run first: the NameError recorded
# below this cell comes from ncis_dtm_stopmm never having been created.
ncis_dtm.index = ncis_df.index
ncis_dtm_stopmm.index = ncis_df.index
shield_dtm.index = shield_df.index
shield_dtm_stopmm.index = shield_df.index

NameError: name 'ncis_dtm_stopmm' is not defined

In [36]:
# Persist each document-term matrix; every file is closed as soon as its dump
# finishes (or fails), so no handles are left open.
with open("./data/dtm/ncis_dtm.pkl", "wb") as dtm_file:
    pickle.dump(ncis_dtm, dtm_file)
with open("./data/dtm/ncis_dtm_stopmm.pkl", "wb") as dtm_file:
    pickle.dump(ncis_dtm_stopmm, dtm_file)
with open("./data/dtm/shield_dtm.pkl", "wb") as dtm_file:
    pickle.dump(shield_dtm, dtm_file)
with open("./data/dtm/shield_dtm_stopmm.pkl", "wb") as dtm_file:
    pickle.dump(shield_dtm_stopmm, dtm_file)

NameError: name 'ncis_dtm_stopmm' is not defined