# Application of Toxic-Community Model

Here we are applying the trained toxic comment identification model!

In [1]:
import pandas as pd
import numpy as np
import pickle
import time
import requests
import matplotlib.pyplot as plt

from sklearn.preprocessing import normalize
from sklearn.feature_extraction.text import TfidfVectorizer

from selenium.webdriver import Chrome
from contextlib import closing
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

import warnings 
warnings.simplefilter('ignore')

# Get Comments
Now we'll be getting the comments from the community we'd like to analyze. As a test I've selected a channel that is proud of it's community and believes it to be largely non-toxic. This community is [KindaFunny](https://www.youtube.com/channel/UCb4G6Wao_DeFr1dm8-a9zjg).

We'll be accessing the Youtube API to get a list of the 50 most recent video IDs.

In [3]:
API_KEY = #put your API key here

channel_ids= ['UCb4G6Wao_DeFr1dm8-a9zjg'] #input at some point

for channel in channel_ids:
    channel_url = f'https://www.googleapis.com/youtube/v3/channels?part=contentDetails&id={channel}&key={API_KEY}'
channel_videos = requests.get(channel_url).json()

#pull upload playlist ID
playlist_id = channel_videos['items'][0]['contentDetails']['relatedPlaylists']['uploads']

#generate playlist url
playlist_url = f'https://www.googleapis.com/youtube/v3/playlistItems?part=snippet&maxResults=50&playlistId={playlist_id}&key={API_KEY}'

#pulling video ids from uploads playlist
video_ids = requests.get(playlist_url).json()

#.json of video IDs
video_ids

#print video IDs from .json
video_id_values = []
for i in video_ids['items']:
    try:
        video_id_values.append(i['snippet']['resourceId']['videoId'])
    except:
        print('No video ids')
        pass
print(f'We have video ids for {len(video_id_values)} videos')

We have video ids for 50 videos


Now we will iterate through this list to pull comments from each of these 50 videos. This takes some time and if this process were ever productionized for the public then some adjustments would have to be made as I don't think the customer would be interested in waiting about 30 minutes for results. If it were productionized provately then it could be adjusted to get even more comments across these videos.

In [6]:
start_time = time.time()

newest_comments = []

for v in video_id_values:
    options = Options()
    #options.headless = True
    driver = webdriver.Chrome(chrome_options=options)
    driver.maximize_window()
    try:
        driver.get(f'https://www.youtube.com/watch?v={v}')
        wait = WebDriverWait(driver,20)
        time.sleep(1)
        driver.find_element_by_tag_name('body').send_keys('m')
        #down arrows to move down page
        wait.until(EC.visibility_of_element_located((By.TAG_NAME, "body"))).send_keys(Keys.DOWN)
        for i in range(15):
            driver.find_element_by_tag_name('body').send_keys(Keys.DOWN)    
        #select and click 'sort by' drop down menu
        time.sleep(1)
        wait.until(EC.visibility_of_element_located((By.XPATH, '//*[@id="trigger"]'))).click()
        time.sleep(1)
        #select 'Newest Comments' from drop down menu
        wait.until(EC.visibility_of_element_located((By.XPATH, '//*[@id="menu"]/a[2]/paper-item/paper-item-body/div[1]'))).click()

        for item in range(8): #by increasing the highest range you can get more content 
                #scroll down the page further
                driver.find_element_by_tag_name('body').send_keys(Keys.END)                            
                time.sleep(1) #higher means more time to load the page as we scroll

        for comment in wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, "#comment #content-text"))):
                newest_comments.append(comment.text)
                #print(comment.text)
        driver.quit()
        
    except:
        driver.quit()
        pass

print ("done!")
print(start_time-time.time(), 'secs')


done!
-1371.5600349903107 secs


In [7]:
#check the number of comments we gathered
len(newest_comments)

3577

In [30]:
#load into dataframe
input_comments = pd.DataFrame(newest_comments)
#name comments column
input_comments.columns = ['comments']
#clean odd characters
weird = [':','%',';',',','.','!','\"','\'','*','&','#','@','$','+','=','-','_']
for item in weird:
    input_comments['comments'] = input_comments['comments'].apply(lambda x : x.replace(item,''))

#input_comments.to_csv('kf_comments.csv')
print(input_comments.head())

                                            comments
0  Videogames seem to catchup to high end cgi so ...
1                      THE CUT TO WHEN GREG SAID CAT
2  Yea this movie sucks everyone except Andy are ...
3  Clapping at the theater shouldnt be used as a ...
4  Andys wheeze laugh is one of my favorite thing...


In [14]:
#load saved .csv to save time later
input_com = pd.read_csv('kf_comments.csv')

In [27]:
#import vectorizer
file = open('tf_idf_vectorize.sav','rb')
vectorizer = pickle.load(file)
file.close()

#Applying the vectorizer
toxic_tfidf=vectorizer.transform(input_com)

#Reshapes the vectorizer output into something people can read
X_tfidf_csr = toxic_tfidf.tocsr()

#number of sentences
n = X_tfidf_csr.shape[0]
#A list of dictionaries, one per sentence
tfidf_bysent = [{} for _ in range(0,n)]
#List of features
terms = vectorizer.get_feature_names()
#for each sentence, lists the feature words and their tf-idf scores
for i, j in zip(*X_tfidf_csr.nonzero()):
    tfidf_bysent[i][terms[j]] = X_tfidf_csr[i, j]

# Normalize the data.
input_X_norm = normalize(X_tfidf_csr)

print('done!')

done!


In [24]:
#load the pickled fit-model
filename = 'toxic_ETC.sav'
file = open(filename, 'rb')
model = pickle.load(file)
file.close()

In [29]:
#Next we take the mean of the probability of each comment being toxic
#[1] is the probaility of being a toxic comment, [0] is the probability for not toxic
new_data_result = model.predict_proba(input_X_norm).mean(axis=0)[1]

#For [1] value closer to 0 is less toxic, for [0] value closer to 1 is less toxic
print(f"The probability of this community being toxic is {round(new_data_result*100,2)}%")

The probability of this community being toxic is 18.69%


This is congruent with what the community anticipated, claiming that their community is largely non-toxic.

### Consider checking out these other channels:
* FoxNews = "UCXIJgqnII2ZOINSWNOGFThA"
* JoeRogan = 'UCzQUP1qoWDoEbmsQxvdjxgQ' 
* PewDiePie = 'UC-lHJZR3Gqxm24_Vd_AJ5Yw' - 
* CriticalRole = 'UCpXBGqwsBkpvcYjsJBQ7LEQ' 
* PhillipDeFranco = "UClFSU9_bUb4Rc6OYfTt5SPw" 
* RetroReplay = "UCx42AsHSKnjEfPXk0YiGCyw"
#### or pick your own!