In [1]:
import pandas as pd
import numpy as np
import string
import re
from collections import defaultdict
import truecase
import pickle

import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk import pos_tag

import spacy
nlp = spacy.load('en_core_web_sm')

from corenlp_pywrap import pywrap

%load_ext autoreload
%autoreload 2

In [2]:
# Load the v4 dataframe from the Data folder. The context manager closes the
# file handle as soon as the load finishes.
# NOTE: pickle.load can execute arbitrary code - only load pickles you trust.
with open('../Data/df_videos_cleaned_v4.pickle', 'rb') as f_video_data:
    df_videos_cleaned_v4 = pickle.load(f_video_data)

### Preprocessing functions (TODO: move these into a separate .py module)

In [3]:
def text_preprocessing_pipeline_1(df):
    '''
    Input: Dataframe with the raw transcript text in a string format
    Output: New dataframe whose 'Transcript' text has line breaks replaced by
            spaces, punctuation removed (apostrophes are kept), and every word
            that contains a digit removed. The input dataframe is not modified.
    '''
    # Built once instead of once per row. The apostrophe is excluded from the
    # set of characters to strip.
    punctuation_table = str.maketrans('', '', string.punctuation.replace("'", ""))
    # Raw string: '\w' and '\d' are invalid escape sequences in a plain literal.
    digit_word_pattern = re.compile(r'\w*\d\w*')

    def initial_preprocessing(raw_text):
        text = raw_text.replace('\n', ' ')
        text = text.translate(punctuation_table)
        return digit_word_pattern.sub('', text)

    # assign() returns a new frame, so the caller's dataframe keeps its raw
    # transcripts and re-running the calling cell gives the same result.
    return df.assign(Transcript=df['Transcript'].apply(initial_preprocessing))

In [4]:
def text_preprocessing_pipeline_2(df):
    '''
    Input: Dataframe after applying text_processing_pipeline_1
    Output: New dataframe with the transcript further preprocessed - tokenization,
            stopwords removal, lemmatization. Each 'Transcript' entry becomes a
            list of lowercased lemmas. The input dataframe is not modified.
    '''
    # Row-independent helpers are built once rather than once per transcript.
    stopwords = spacy.lang.en.stop_words.STOP_WORDS
    lemmatizer = WordNetLemmatizer()
    # Keyed by the first letter of the tag returned by pos_tag; any other
    # letter falls back to noun.
    tag_map = defaultdict(lambda: wordnet.NOUN)
    tag_map['V'] = wordnet.VERB
    tag_map['J'] = wordnet.ADJ
    tag_map['R'] = wordnet.ADV

    def second_preprocessing(preprocessed_text_3):
        doc = nlp(preprocessed_text_3)
        # is_space drops whitespace tokens of any length. Comparing the text
        # with ' ' only caught a single space, so runs of spaces were kept
        # as if they were words.
        kept_words = [token.text for token in doc
                      if not token.is_space and token.lower_ not in stopwords]

        return [lemmatizer.lemmatize(word.lower(), tag_map[tag[0]])
                for word, tag in pos_tag(kept_words)]

    # assign() returns a new frame, so the caller's dataframe still holds the
    # string transcripts and the calling cell can be re-run safely.
    return df.assign(Transcript=df['Transcript'].apply(second_preprocessing))

In [5]:
def text_preprocessing_pipeline_3(df):
    '''
    Input: Dataframe after applying text_preprocessing_pipeline_2
    Output: New dataframe with the transcript further preprocessed - truecasing,
            part of speech tagging. Each 'Transcript' entry becomes a list of
            (token text, part-of-speech) tuples. The input dataframe is not modified.
    '''
    def third_preprocessing(preprocessed_text_6):
        # Single-character tokens are dropped before the words are re-joined.
        sentence = ' '.join(word for word in preprocessed_text_6 if len(word) > 1)
        if not sentence:
            # Nothing left to truecase or tag (e.g. an empty token list).
            return []

        doc = nlp(truecase.get_true_case(sentence))
        return [(token.text, token.pos_) for token in doc]

    # assign() returns a new frame, so the caller's dataframe keeps its token
    # lists and re-running the calling cell does not operate on tagged tuples.
    return df.assign(Transcript=df['Transcript'].apply(third_preprocessing))

In [6]:
def pickle_df(df_str, df=None):
    '''
    Input: Name of a dataframe in a string format; optionally the dataframe itself
    Output: Pickle the dataframe into the Data folder as '<df_str>.pickle'

    When df is omitted, df_str is looked up among the global names of the module
    in which this function is defined. Raises NameError if no such name exists.
    Pass df explicitly when the dataframe lives in a different namespace.
    '''
    if df is None:
        # Plain name lookup instead of eval(): df_str is never executed as code.
        namespace = globals()
        if df_str not in namespace:
            raise NameError("name {!r} is not defined".format(df_str))
        df = namespace[df_str]

    # The object is resolved before the file is opened: mode 'wb' truncates, so
    # a failed lookup must not leave an empty pickle behind.
    with open('../Data/'+ df_str +'.pickle', 'wb') as f_video_data:
        pickle.dump(df, f_video_data)

### Remove line breaks, punctuation, and numbers

In [7]:
# Stage 1: replace line breaks with spaces, strip punctuation (apostrophes are
# kept), and remove every word that contains a digit from each transcript.
df_videos_cleaned_v5 = text_preprocessing_pipeline_1(df_videos_cleaned_v4)

In [8]:
# df_videos_cleaned_v5['Transcript'][1]

### Tokenization, stopwords removal and lemmatization

In [9]:
# Stage 2: tokenize with spaCy, drop stop words, then lemmatize with WordNet;
# each transcript becomes a list of lowercased lemmas.
df_videos_cleaned_v6 = text_preprocessing_pipeline_2(df_videos_cleaned_v5)

In [10]:
# ' '.join(df_videos_cleaned_v6['Transcript'][1])

### Truecasing, POS tagging

In [11]:
# Stage 3: re-join the lemmas, truecase the text, and pair every spaCy token
# with its part-of-speech tag.
df_videos_cleaned_v7 = text_preprocessing_pipeline_3(df_videos_cleaned_v6)

In [12]:
# df_videos_cleaned_v7['Transcript'][0]

### Pickle the cleaned dataframe 

In [13]:
# Writes ../Data/df_videos_cleaned_v7.pickle; the dataframe is found by the
# variable name passed in as a string.
pickle_df('df_videos_cleaned_v7')