In [1]:
import os
import re
import numpy as np
import pandas as pd
import gensim
import collections
from nltk.corpus import stopwords
import nltk

import json

from sqlalchemy import create_engine
from sqlalchemy_utils import create_database, database_exists, drop_database
import psycopg2
from psycopg2.extensions import ISOLATION_LEVEL_AUTOCOMMIT

  "is going to be overriden.".format(identifier))


### Import packages to perform Text Summarization on Youtube Video Comments

In [2]:
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from string import punctuation
from heapq import nlargest

# build a list of stopwords
stopwords = list(STOP_WORDS)
# load spacy english language
nlp = spacy.load('en')

In [3]:
# function to summarize comments for 
# each video with >=N number of comments
def text_summarizer(document):
    """
    input: concatenated comments >= 10 min read time
          which is approximately 2000 characters
    return: a summarized version of the comments of
           about 2-4 minutes read time (400-800 characters)
    """
    # raw_text = document
    doc = nlp(document)
    # build word frequency
    # word.text is tokenization in spacy
    word_frequencies = {}  
    for word in doc:  
        if word.text not in stopwords:
            if word.text not in word_frequencies.keys():
                word_frequencies[word.text] = 1
            else:
                word_frequencies[word.text] += 1

    maximum_frequncy = max(word_frequencies.values())

    for word in word_frequencies.keys():
        word_frequencies[word] = (word_frequencies[word]/maximum_frequncy)
    # sentence tokens
    sentence_list = [ sentence for sentence in doc.sents ]

    # calculate sentence score and ranking
    sentence_scores = {}  
    for sent in sentence_list:  
        for word in sent:
            if word.text.lower() in word_frequencies.keys():
                if len(sent.text.split(' ')) < 30:
                    if sent not in sentence_scores.keys():
                        sentence_scores[sent] = word_frequencies[word.text.lower()]
                    else:
                        sentence_scores[sent] += word_frequencies[word.text.lower()]

    # find N largest
    summary_sentences = nlargest(6, sentence_scores, key=sentence_scores.get)
    
    summarized_sentences_cleaned = []
    for ss in summary_sentences:
        if ss.text.find(', ')==0:
            idx = ss.text.find(', ')
            new_ss = ss[idx+1:].text.capitalize()
        else:
            new_ss = ss.text.capitalize()
            new_ss = ' '.join(new_ss.split())
        summarized_sentences_cleaned.append(new_ss)
    
#     final_sentences = []
#     for w in summarized_sentences_cleaned:
#         for i, a in enumerate(w.text):
#             if a.isalpha():
#                 break
#         final_sentences.append(w.text[i:].capitalize())
    
    summary = ' '.join(summarized_sentences_cleaned)
    summary = summary.replace(u'\xa0', u'')
#     print("Original Document\n")
#     print(document)
#     print("Total Length:",len(document))
#     print('\n\nSummarized Document\n')
#     print(summary)
#     print("Total Length:",len(summary))
    return summary

### Load original csv file of video titles, subtitles, comments, labels, etc.

In [4]:
from_csv_filename = 'data/csv_files/original_data.csv'
data_df = pd.read_csv(from_csv_filename, sep=',')

In [5]:
#data_df.sort_values('txt_len', ascending=False).head()
cols = list(data_df.columns)
cols.append('summary')
#cols

In [6]:
new_df = pd.DataFrame(columns=cols)

In [7]:
data_df = data_df[(data_df['txt_len']>=2000) & (data_df['txt_len']<100000)].copy()

In [8]:
data_df.reset_index(inplace=True, drop=True)

In [9]:
video_id_list = list(data_df.video_id)
len(video_id_list)

1037

In [10]:
data_df.head()

Unnamed: 0,video_id,title,subtitles,primary_category,text,txt_len
0,09Q4JQ3p8yg,How to remove popcorn stipple ceiling,hi Shannon here from health improvements and t...,drywall_repair,"asmr **, I love you, Im here from Instagram lm...",90480
1,0Aip_xxpia4,How to install carpet tiles,were going to replace this tired old wall-to-w...,carpet_flooring,Interface do a product called tactiles which a...,2183
2,0COOF3BwgKI,Fix small nail holes in walls fast and make th...,so you like to hang pictures on your wall if y...,drywall_repair,Glad I could help then. Be sure and subscribe...,8009
3,0J8q_Lsh4fU,Hog wire deck rail installation,hi im paul from Elkins diy.com today Im going ...,build_deck,"Thanks Naegling, off topic but where are u get...",8102
4,0RuwaSU71rY,Replacing a section of drywall after a pipe leak,hi Im Mike Thompson last night was a bit of an...,drywall_repair,"Great Job , Mike Thompson, good instructional ...",4012


In [11]:
import sys
for i, vid in enumerate(video_id_list):
    document = str(data_df.iloc[i].text)
    summary = text_summarizer(document)
    new_df.loc[i] = [str(data_df.loc[i].video_id), str(data_df.loc[i].title), 
                     str(data_df.loc[i].subtitles), str(data_df.loc[i].primary_category), 
                     str(data_df.loc[i].text), str(data_df.loc[i].txt_len), str(summary)]

In [12]:
new_df['summary'].str.len()

0       656
1       398
2       658
3       522
4       623
5       593
6       596
7       464
8       561
9       555
10      495
11      688
12      505
13      719
14      745
15      668
16      584
17      533
18      343
19      636
20      601
21      559
22      476
23      692
24      631
25      630
26      627
27      895
28      624
29      634
       ... 
1007    589
1008    650
1009    661
1010    443
1011    674
1012    559
1013    558
1014    558
1015    555
1016    680
1017    736
1018    626
1019    374
1020    506
1021    579
1022    692
1023    652
1024    579
1025    677
1026    423
1027    552
1028    755
1029    523
1030    670
1031    641
1032    729
1033    757
1034    509
1035    844
1036    566
Name: summary, Length: 1037, dtype: int64

In [13]:
new_df['summary_len'] = new_df['summary'].str.len()

In [14]:
new_df.summary_len

0       656
1       398
2       658
3       522
4       623
5       593
6       596
7       464
8       561
9       555
10      495
11      688
12      505
13      719
14      745
15      668
16      584
17      533
18      343
19      636
20      601
21      559
22      476
23      692
24      631
25      630
26      627
27      895
28      624
29      634
       ... 
1007    589
1008    650
1009    661
1010    443
1011    674
1012    559
1013    558
1014    558
1015    555
1016    680
1017    736
1018    626
1019    374
1020    506
1021    579
1022    692
1023    652
1024    579
1025    677
1026    423
1027    552
1028    755
1029    523
1030    670
1031    641
1032    729
1033    757
1034    509
1035    844
1036    566
Name: summary_len, Length: 1037, dtype: int64

In [15]:
new_df.summary_len = new_df.summary_len.astype(int)

In [16]:
new_df.txt_len = new_df.txt_len.astype(int)

In [17]:
new_df['prcnt_reduce'] = ((new_df.txt_len - new_df.summary_len) / new_df.txt_len) * 100

In [18]:
new_df.prcnt_reduce.mean()

91.91334399199954

In [19]:
new_df['prcnt_reduce'].hist()

<matplotlib.axes._subplots.AxesSubplot at 0x7fd11fb41d68>

In [20]:
to_csv_filename = 'data/csv_files/summarized_data.csv'

In [21]:
new_df.to_csv(to_csv_filename, sep=',', encoding='utf-8', index=False)