[toxic words data](https://www.kaggle.com/c/jigsaw-toxic-comment-classification-challenge/data)

In [86]:
import pandas as pd
import numpy as np
import pickle
import time
import requests

from sklearn.decomposition import PCA
from sklearn.preprocessing import normalize
from sklearn.mixture import GaussianMixture
from sklearn import neighbors, ensemble,tree
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.naive_bayes import BernoulliNB, GaussianNB
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedShuffleSplit

from selenium.webdriver import Chrome
from contextlib import closing
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys

import warnings 
warnings.simplefilter('ignore')

In [66]:
#import test.csv
test = pd.read_csv('test.csv')

#import test labels.csv
test_labels = pd.read_csv('test_labels.csv')

#merge test and test_labels as test_full
test_full = test.merge(test_labels, on="id")

#import train.csv
train = pd.read_csv('train.csv')

#merge train and test_full as toxic
toxic = test_full.merge(train, how='outer')
toxic.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,Yo bitch Ja Rule is more succesful then you'll...,-1,-1,-1,-1,-1,-1
1,0000247867823ef7,== From RfC == \n\n The title is fine as it is...,-1,-1,-1,-1,-1,-1
2,00013b17ad220c46,""" \n\n == Sources == \n\n * Zawe Ashton on Lap...",-1,-1,-1,-1,-1,-1
3,00017563c3f7919a,":If you have a look back at the source, the in...",-1,-1,-1,-1,-1,-1
4,00017695ad8997eb,I don't anonymously edit articles at all.,-1,-1,-1,-1,-1,-1


In [67]:
toxic['toxic_total'] = (toxic['toxic'] + toxic['severe_toxic'] +
                        toxic['obscene'] + toxic['threat'] + 
                        toxic['insult'] + toxic['identity_hate'])
toxic['toxic_bool'] = np.where((toxic['toxic'] + toxic['severe_toxic'] +
                        toxic['obscene'] + toxic['threat'] + 
                        toxic['insult'] + toxic['identity_hate'])<0,-1,0)
toxic.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,toxic_total,toxic_bool
0,00001cee341fdb12,Yo bitch Ja Rule is more succesful then you'll...,-1,-1,-1,-1,-1,-1,-6,-1
1,0000247867823ef7,== From RfC == \n\n The title is fine as it is...,-1,-1,-1,-1,-1,-1,-6,-1
2,00013b17ad220c46,""" \n\n == Sources == \n\n * Zawe Ashton on Lap...",-1,-1,-1,-1,-1,-1,-6,-1
3,00017563c3f7919a,":If you have a look back at the source, the in...",-1,-1,-1,-1,-1,-1,-6,-1
4,00017695ad8997eb,I don't anonymously edit articles at all.,-1,-1,-1,-1,-1,-1,-6,-1


# TF-IDF

In [70]:
# Define the features and the outcome.
X = toxic['comment_text']
y = toxic['toxic_bool']

X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y, 
                                                    test_size=0.25, random_state=0)

vectorizer = TfidfVectorizer(max_df=0.5, # drop words that occur in more than half the paragraphs
                             min_df=3, # only use words that appear at least three times
                             stop_words='english', 
                             lowercase=True, #convert everything to lower case (since Alice in Wonderland has the HABIT of CAPITALIZING WORDS for EMPHASIS)
                             use_idf=True,#we definitely want to use inverse document frequencies in our weighting
                             norm=u'l2', #Applies a correction factor so that longer paragraphs and shorter paragraphs get treated equally
                             smooth_idf=True #Adds 1 to all document frequencies, as if an extra document existed that used every word once.  Prevents divide-by-zero errors
                            )

#Applying the vectorizer
toxic_tfidf=vectorizer.fit_transform(X)
#print("Number of features: %d" % paras_tfidf.get_shape()[1])

#splitting into training and test sets
X_train_tfidf, X_test_tfidf= train_test_split(toxic_tfidf, test_size=0.25, random_state=0)

#Reshapes the vectorizer output into something people can read
X_train_tfidf_csr = X_train_tfidf.tocsr()
X_test_tfidf_csr = X_test_tfidf.tocsr()

#number of sentences
n = X_train_tfidf_csr.shape[0]
#A list of dictionaries, one per sentence
tfidf_bysent = [{} for _ in range(0,n)]
#List of features
terms = vectorizer.get_feature_names()
#for each sentence, lists the feature words and their tf-idf scores
for i, j in zip(*X_train_tfidf_csr.nonzero()):
    tfidf_bysent[i][terms[j]] = X_train_tfidf_csr[i, j]

# Normalize the data.
X_train_norm = normalize(X_train_tfidf_csr)
X_test_norm = normalize(X_test_tfidf_csr)

# Convert y to categorical
y_train = to_categorical(y_train)
y_test = to_categorical(y_test)

In [71]:

pickle.dump(vectorizer,open('tf_idf_vectorize.sav','wb'))

# Modeling

In [45]:
# Spot Check Algorithms
start_time = time.time()
models = []
models.append(('NBB', BernoulliNB()))
models.append(('RFC', ensemble.RandomForestClassifier()))
models.append(('KNN', neighbors.KNeighborsClassifier()))
models.append(('DTC', tree.DecisionTreeClassifier()))
models.append(('SVC', SVC()))
models.append(('GBC', ensemble.GradientBoostingClassifier()))
models.append(('ABC', ensemble.AdaBoostClassifier()))
models.append(('ETC', ensemble.ExtraTreesClassifier()))

# evaluate each model in turn
results = []
names = []
for name, model in models:
    split = StratifiedShuffleSplit(n_splits=3, random_state=1337)
    model = model.fit(X_train_norm,y_train)
    cv_results = cross_val_score(model, X_test_tfidf_csr, y_test, cv=split, scoring='accuracy')
    results.append(cv_results)
    names.append(name)
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)
    
t= round((time.time() - start_time),4)
print("\n -- %s seconds for results--" % t)

NBB: 0.687940 (0.006286)
RFC: 0.701906 (0.004486)
KNN: 0.540389 (0.007919)


KeyboardInterrupt: 

In [85]:
split = StratifiedShuffleSplit(n_splits=3, random_state=1337)
DTC = tree.DecisionTreeClassifier()
DTC.fit(X_train_norm,y_train)
cv_results = cross_val_score(SVC, X_test_tfidf_csr, y_test, cv=split, scoring='accuracy')
msg = "%s: %f (%f)" % ('DTC', cv_results.mean(), cv_results.std())
print(msg)

TypeError: get_params() missing 1 required positional argument: 'self'

In [72]:
print(DTC.score())

NameError: name 'DTC' is not defined

In [None]:
split = StratifiedShuffleSplit(n_splits=3, random_state=1337)
SVC = SVC()
SVC.fit(X_train_norm,y_train)
cv_results = cross_val_score(SVC, X_test_tfidf_csr, y_test, cv=split, scoring='accuracy')
msg = "%s: %f (%f)" % ('SVC', cv_results.mean(), cv_results.std())
print(msg)

# Get Dem Comments

In [78]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

In [76]:
API_KEY = 'AIzaSyCtz3r1zXv0gEVa2R_D7xTHDLmkOKoTvvI'

channel_ids= ['UCb4G6Wao_DeFr1dm8-a9zjg'] #input at some point

for channel in channel_ids:
    channel_url = f'https://www.googleapis.com/youtube/v3/channels?part=contentDetails&id={channel}&key={API_KEY}'
channel_videos = requests.get(channel_url).json()

#pull upload playlist ID
playlist_id = channel_videos['items'][0]['contentDetails']['relatedPlaylists']['uploads']

#generate playlist url
playlist_url = f'https://www.googleapis.com/youtube/v3/playlistItems?part=snippet&maxResults=50&playlistId={playlist_id}&key={API_KEY}'

#pulling video ids from uploads playlist
video_ids = requests.get(playlist_url).json()

#.json of video IDs
video_ids

#print video IDs from .json
video_id_values = []
for i in video_ids['items']:
    try:
        video_id_values.append(i['snippet']['resourceId']['videoId'])
    except:
        print('No video ids')
        pass
len(video_id_values)

50

In [77]:
video_id_values[:10]

['fQQIOx8yzgM',
 'CQWlCRVNJ3E',
 'C0SLsjNI2eI',
 'NXCFjJ5aaPM',
 'flKO72R7RPw',
 'cN1hO-akbuw',
 'OUAT-Ij6Ep4',
 'GKgpAfUcW90',
 'kNtIrtx1_0Q',
 'aWRVqPN68Ls']

In [28]:
options = Options()
options.set_headless(headless=True)
driver = webdriver.Chrome(options=options)
driver.get("http://google.com/")
print ("Headless Chrome Initialized")
driver.quit()

Headless Chrome Initialized


In [108]:
chrome_options = Options()
chrome_options.set_headless(headless=True)
#chrome_options.add_argument("--headless")
comment_list = []

for v in video_id_values[:10]:
    with closing(Chrome(chrome_options=chrome_options)) as driver:
        wait = WebDriverWait(driver,10)
        driver.get(f"https://www.youtube.com/watch?v={v}")

        for item in range(3): #by increasing the highest range you can get more content 
            wait.until(EC.visibility_of_element_located((By.TAG_NAME, "body"))).send_keys(Keys.END)
            time.sleep(5) #5 second delay before iteration, more means more time to scrap

        for comment in wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, "#comment #content-text"))):
            comment_list.append(comment.text)
            #print(comment.text)
print('done!')
#this may take a few minutes

done!


In [109]:
for v in video_id_values[10:20]:
    with closing(Chrome(chrome_options=chrome_options)) as driver:
        wait = WebDriverWait(driver,10)
        driver.get(f"https://www.youtube.com/watch?v={v}")

        for item in range(3): #by increasing the highest range you can get more content 
            wait.until(EC.visibility_of_element_located((By.TAG_NAME, "body"))).send_keys(Keys.END)
            time.sleep(5) #5 second delay before iteration, more means more time to scrap

        for comment in wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, "#comment #content-text"))):
            comment_list.append(comment.text)
            #print(comment.text)
print('done!')
#this may take a few minutes

done!


In [None]:
for v in video_id_values[20:30]:
    with closing(Chrome(chrome_options=chrome_options)) as driver:
        wait = WebDriverWait(driver,10)
        driver.get(f"https://www.youtube.com/watch?v={v}")

        for item in range(3): #by increasing the highest range you can get more content 
            wait.until(EC.visibility_of_element_located((By.TAG_NAME, "body"))).send_keys(Keys.END)
            time.sleep(5) #5 second delay before iteration, more means more time to scrap

        for comment in wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, "#comment #content-text"))):
            comment_list.append(comment.text)
            #print(comment.text)
print('done!')
#this may take a few minutes

In [None]:
for v in video_id_values[30:40]:
    with closing(Chrome(chrome_options=chrome_options)) as driver:
        wait = WebDriverWait(driver,10)
        driver.get(f"https://www.youtube.com/watch?v={v}")

        for item in range(3): #by increasing the highest range you can get more content 
            wait.until(EC.visibility_of_element_located((By.TAG_NAME, "body"))).send_keys(Keys.END)
            time.sleep(5) #5 second delay before iteration, more means more time to scrap

        for comment in wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, "#comment #content-text"))):
            comment_list.append(comment.text)
            #print(comment.text)
print('done!')
#this may take a few minutes

In [None]:
for v in video_id_values[40:50]:
    with closing(Chrome(chrome_options=chrome_options)) as driver:
        wait = WebDriverWait(driver,10)
        driver.get(f"https://www.youtube.com/watch?v={v}")

        for item in range(3): #by increasing the highest range you can get more content 
            wait.until(EC.visibility_of_element_located((By.TAG_NAME, "body"))).send_keys(Keys.END)
            time.sleep(5) #5 second delay before iteration, more means more time to scrap

        for comment in wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, "#comment #content-text"))):
            comment_list.append(comment.text)
            #print(comment.text)
print('done!')
#this may take a few minutes

In [88]:
len(comment_list)

762

In [89]:
input_comments = pd.DataFrame(comment_list)
input_comments.columns = ['comments']
weird = [':','%',';',',','.','!','\"','\'','*','&','#','@','$','+','=','-','_']
for item in weird:
    input_comments['comments'] = input_comments['comments'].apply(lambda x : x.replace(item,''))

input_comments['comments']

0      I saw this live and I just want to say  I alre...
1                             We love you Kinda Funny <3
2                       Thanks for coming by Brian  Greg
3      Fantastic episode Please get Greg on Between t...
4      Hey have you guys considered having Sam   Rieg...
5      So does this count as the Brian episode for Be...
6      Ive been waiting for Brian to get a Between th...
7      Can we just get a 12 hour episode of these 2 j...
8      He flicked the Bluetooth  Ive never admired a ...
9      How the hell did you get Brian on the stage wi...
10     This is Brian’s Between the Sheets\n\nAll of t...
11     Holy shit I need to see more of this guy What ...
12     Brian became a hero of mine today That story o...
13     When I saw this was happening I freaked out Th...
14                                      Bidet Critters 😊
15     Dude I hit 2 years clean on Friday and this st...
16     Kinda funny introduced me to crit role Thanks ...
17     Brian  randomly namedrop

In [None]:
# run it through TF-IDF

In [90]:
# Define the features and the outcome.
X = input_comments

#Applying the vectorizer
toxic_tfidf=vectorizer.transform(X)
#print("Number of features: %d" % paras_tfidf.get_shape()[1])

#splitting into training and test sets
#X_train_tfidf, X_test_tfidf= train_test_split(toxic_tfidf, test_size=0.25, random_state=0)

#Reshapes the vectorizer output into something people can read
X_tfidf_csr = toxic_tfidf.tocsr()

#number of sentences
n = X_tfidf_csr.shape[0]
#A list of dictionaries, one per sentence
tfidf_bysent = [{} for _ in range(0,n)]
#List of features
terms = vectorizer.get_feature_names()
#for each sentence, lists the feature words and their tf-idf scores
for i, j in zip(*X_tfidf_csr.nonzero()):
    tfidf_bysent[i][terms[j]] = X_tfidf_csr[i, j]

# Normalize the data.
X_norm = normalize(X_tfidf_csr)

In [104]:
filename = 'toxic_ABC.sav'
model = pickle.load(open(filename, 'rb'))
test_result = model.score(X_test_norm, y_test)
print(f"The accuracy of this model on test data is {round(test_result*100,2)}% ")

The accuracy of this model on test data is 69.46% 


In [107]:
#filename = ''
#model = pickle.load(open(filename, 'rb'))
#new_data_result = model.predict(X_norm).mean()
new_data_result = model.predict_proba(X_norm)[0].mean() #probaility of being a toxic comment, [0] for not toxic
print(f"The probability of this community being toxic is predicted to be {round(new_data_result*100,2)}%")


The probability of this community being toxic is predicted to be 50.0%
