In [12]:
import pandas as pd
import json
import requests
import re
import pickle
import numpy as np
from keras.models import load_model
from tensorflow.keras import backend 
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

# loading tokenizer
with open('model/tokenizer.pickle', 'rb') as handle:
    tokenizer = pickle.load(handle)
# loading model
model = load_model('model/model_weights.hdf5')

# loading sentiment tokenizer
with open('model/sentiment_tokenizer.pickle', 'rb') as handle:
    sentiment_tokenizer = pickle.load(handle)
# loading sentiment model
sentiment_model = load_model('model/sentiment_model.h5')

In [2]:
def predict_sentiment(input_text):
    sequence = sentiment_tokenizer.texts_to_sequences(input_text)
    padded_sequence = pad_sequences(sequence,maxlen=121)
    prediction = sentiment_model.predict(padded_sequence)
    return prediction

def process_sentiment_predictions(predictions):
    output = []
    for pred in predictions:
        if pred[0] > .5:
            output.append(1)
        else:
            output.append(0)
    return output

def predict_output(input_text):
    sequence = tokenizer.texts_to_sequences(input_text)
    padded_sequence = pad_sequences(sequence,maxlen=100)
    prediction = model.predict(padded_sequence)
    return prediction

def process_prediction(input_array):
    indices = np.nonzero(input_array > .5)
    labels = ['toxic','severe_toxic','obscene','threat','insult','identity_hate']
    output_labels = []
    for index in indices[0]:
        output_labels.append(labels[index])
    return output_labels

In [30]:
def fetch_comments_from_all_video(list_of_video_ids):
    output = []
    for video_id in list_of_video_ids:
        resp = fetch_comments_from_video(video_id)
        output.extend(resp)
    return output

def fetch_comments_from_video(video_id):
    first = True
    key = "AIzaSyDjfPt6jcTQUDeY1nylb6l1c3LMITa57OI"
    output = []
    while True:
        if first:
            uri = "https://www.googleapis.com/youtube/v3/commentThreads?part=snippet&videoId={}&key={}&maxResults=100".format(video_id,key)
            first = False
        else:
            uri = "https://www.googleapis.com/youtube/v3/commentThreads?part=snippet&videoId={}&key={}&maxResults=100&pageToken={}".format(video_id,key,next_page_token)
        resp = requests.get(uri)
        if resp.status_code == 200:
            result = resp.json()
            total_result_returned = result['pageInfo']['totalResults']
            next_page_token = result.get('nextPageToken')
            comments = result['items']
            comments = [{'video_id':video_id,'text':clean_text(comment['snippet']['topLevelComment']['snippet']['textDisplay']),'user':comment['snippet']['topLevelComment']['snippet']['authorDisplayName']} for comment in comments]
            output.extend(comments)
        if not next_page_token:
            break
    return output

def clean_text(input_text):
    # remove user (@user)
    text = re.sub(r'@[\w]*', ' ',input_text)
    
    # remove special characters, numbers, punctuations
    text = re.sub(r'[^a-zA-Z#]', ' ',text)
    
    # remove two or more spaces by one space
    text = re.sub(r' +', ' ',text)
    
    text = text.lower()
    
    return text.strip()
        
    
def create_dataframe_with_sentiments_and_toxic_comments(output):
    df = pd.DataFrame(output)
    comments = df['text'].values
    toxic_comments_predictions = predict_output(comments)
    sentiment_predictions = predict_sentiment(comments)
    result_toxic = []
    for pred in toxic_comments_predictions:
        label = process_prediction(pred)
        if not label:
            result_toxic.append(False)
        else:
            result_toxic.append(','.join(label))

    result_sentiment = process_sentiment_predictions(sentiment_predictions)
    print(len(result_toxic))
    print(len(result_sentiment))
    df['toxic_comment'] = result_toxic
    df['sentiment'] = result_sentiment
    return df

# process dataframe with predictions
def return_top_n_records_with_toxic_comments(channel_id,num_videos,num_records,sorted_by):
    list_of_video_ids = get_vidoes_id_list(channel_id,num_videos)
    output = fetch_comments_from_all_video(list_of_video_ids)
    df = create_dataframe_with_sentiments_and_toxic_comments(output)
    grouped = df.groupby('user')
    processed_data = []
    for name , group in grouped:
        positive = 0
        negative = 0
        toxic_comment_count = 0
        for index,record in group.iterrows():
            if record['sentiment'] == 0:
                negative += 1
            if record['sentiment'] == 1:
                positive += 1
            if record['toxic_comment'] != False:
                toxic_comment_count+=1
        processed_data.append({'user':name,'positive':positive,'negative':negative,'toxic_comment_count':toxic_comment_count})
    new_df = pd.DataFrame(processed_data)
    new_df = new_df.sort_values(by=[sorted_by], ascending=False)
    return new_df

def get_vidoes_id_list(channel_id, record_limit):
    youkeys = 'AIzaSyDjfPt6jcTQUDeY1nylb6l1c3LMITa57OI'
    params = { "part": "snippet" , "channelId": channel_id , "maxResults": record_limit , "order": "date" , "type": "video", "key": youkeys }
    url_youtube = 'https://www.googleapis.com/youtube/v3/search'
    video_list = []
    headers = { "Connect-Type": "appication/json" }
    reponse = requests.get(url=url_youtube, verify = False, headers=headers , params=params)
    number_of_videos = len(reponse.json()['items'])
    if reponse.status_code == 200:
        if number_of_videos >= int(record_limit):
            for i in range(int(record_limit)):
                reponse_video_id = reponse.json()['items'][i]['id']['videoId']
                video_list.append(reponse_video_id)
        else:
            for i in range(number_of_videos):
                reponse_video_id = reponse.json()['items'][i]['id']['videoId']
                video_list.append(reponse_video_id)
    else:
        print("Please check your API")
    return video_list

In [32]:
resp = return_top_n_records_with_toxic_comments('UC-ewOmM-a2mrqiuaBFALtAw',20,50,'toxic_comment_count')
resp.head(20)



5268
5268


Unnamed: 0,negative,positive,toxic_comment_count,user
485,2,0,2,Bro_Jules
365,2,0,2,BP 009
656,2,0,2,Crasher 47
4061,1,1,2,meeko chris
294,1,2,2,Aponyx
3866,1,1,2,es gibt keinen vornamen/nachnamen
437,3,1,2,Bike Rocks
2246,1,2,2,Max
486,0,2,2,Bronco scruffybone
634,1,1,1,Cloud_of _Cookies


In [28]:
resp.head(20).t

Unnamed: 0,negative,positive,toxic_comment_count,user
27,4,1,0,Ameen prince
202,4,2,0,Ikueze Chidera
60,4,5,1,Bee S.C
317,4,3,0,Mike Mullay
504,3,0,0,cesar garduno
354,3,3,0,P nats
25,3,3,0,Alpha chic
388,2,3,0,RoilNavE
35,2,1,0,AnnieNorthman89
468,2,0,1,Vepř Domácí
