In [41]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split

# read the dataset into pandas dataframe
df = pd.read_csv('./../datasets/downsampled_data', delim_whitespace=False).dropna()

In [42]:
# display the first five rows
retrieval_time = df['time_retrieved']
publish_time = df['publishedAt']
channel_publish_time = df['Channel_publishedAt']
retrieval_time_11_19_14 = df['11_19_14_update_timestamp']
columns_to_drop = ['definition', 'publishedAt', 'time_retrieved', '11_19_14_update_timestamp', 'Channel_publishedAt', 'video_id', 'channelId', 'thumbnail_link', 'Channel_country']
df = df.drop(columns_to_drop, axis = 1)
df.columns

Index(['Unnamed: 0', 'title', 'channelTitle', 'categoryId', 'duration', 'tags',
       'ratings_disabled', 'description', 'Channel_viewCount',
       'Channel_subscriberCount', 'Channel_hiddenSubscriberCount',
       'Channel_videoCount', 'Channel_title', 'Channel_description',
       'view_count_update_11_19_14', 'likes_update_11_19_14',
       'dislikes_update_11_19_14', 'comment_count_update_11_19_14',
       'trending?', 'engagement_rate', 'INTL', 'UNK', 'USA'],
      dtype='object')

In [43]:
## time updates
import dateutil.parser as parser
age = []
age_update = []
channel_age = []
for i in df.index:
    channel_publish_time[i] = channel_publish_time[i].replace("\"", "")
    age.append(parser.isoparse(retrieval_time[i]) - parser.isoparse(publish_time[i]))
    age_update.append(parser.isoparse(retrieval_time_11_19_14[i]) - parser.isoparse(publish_time[i]))
    channel_age.append(parser.isoparse(channel_publish_time[i]) - parser.isoparse(publish_time[i]))
    

In [44]:
## sentiment values
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import nltk

titles = df['title']
channel_title = df['Channel_title']
description = df['description']
channel_description = df['Channel_description']

title_sentiment_vals = []
channel_title_sentiment_vals = []
description_sentiment_vals = []
channel_description_sentiment_vals = []

nltk.download('vader_lexicon')
sid = SentimentIntensityAnalyzer()
for sentence in titles:
    ss = sid.polarity_scores(str(sentence))
    title_sentiment_vals.append(ss['pos']-ss['neg'])
    
for sentence in channel_title:
    ss = sid.polarity_scores(str(sentence))
    channel_title_sentiment_vals.append(ss['pos']-ss['neg'])
    
for sentence in description:
    ss = sid.polarity_scores(str(sentence))
    description_sentiment_vals.append(ss['pos']-ss['neg'])
    
for sentence in channel_description:
    ss = sid.polarity_scores(str(sentence))
    channel_description_sentiment_vals.append(ss['pos']-ss['neg'])

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/cameronyuen/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [46]:
from sklearn import preprocessing

df['title'] = title_sentiment_vals
df['channelTitle'] = channel_title_sentiment_vals
df['description'] = description_sentiment_vals
df['Channel_description'] = channel_description_sentiment_vals

le = preprocessing.LabelEncoder()
df['trending?'] = le.fit_transform(df['trending?'])
df['ratings_disabled'] = le.fit_transform(df['ratings_disabled'])
df.head()

Unnamed: 0.1,Unnamed: 0,title,channelTitle,categoryId,duration,tags,ratings_disabled,description,Channel_viewCount,Channel_subscriberCount,...,Channel_description,view_count_update_11_19_14,likes_update_11_19_14,dislikes_update_11_19_14,comment_count_update_11_19_14,trending?,engagement_rate,INTL,UNK,USA
0,0,0.0,0.0,24.0,-0.0707230866960673,36.0,0,0.034,0.8199399529639629,0.504608,...,0.122,6.152175,3.391273,10.316363,2.855296,1,0.028492,0.0,0.0,1.0
1,1,0.0,0.0,25.0,-0.0856798827419832,28.0,0,0.048,-0.2749697772865863,-0.350255,...,0.0,3.001911,0.578677,4.999903,3.14225,1,0.090361,0.0,0.0,1.0
2,2,0.0,0.0,25.0,0.0948815828809148,13.0,0,0.062,0.6501905221829396,0.461214,...,0.107,1.368339,0.537524,2.314506,1.066382,1,0.006721,0.0,0.0,1.0
3,3,0.274,0.0,26.0,-0.0710470606176394,0.0,0,0.276,-0.2582170040256576,0.002973,...,0.0,1.882086,2.655548,2.774959,0.721674,1,0.045112,0.0,1.0,0.0
4,4,0.0,0.0,17.0,-0.0728829128398818,12.0,0,0.0,-0.3307112245402615,-0.407708,...,0.188,0.280922,-0.071268,0.049159,0.022729,1,0.029918,0.0,0.0,1.0


In [47]:
# dummy values for now - get these interactively later...
import tensorflow as tf

best_model = True
if best_model:
    num_hidden_layers = 3
    num_hidden_layer_nodes = [20, 10, 5]
    train_ratio = .7
    hidden_layer_activations = ['sigmoid', 'sigmoid', 'sigmoid', 'sigmoid', 'sigmoid']
    optimizer = 'sgd'
    learning_rate = .005
    loss = 'mean_squared_error'
    metrics = [tf.keras.metrics.Accuracy(),tf.keras.metrics.Recall(),tf.keras.metrics.Precision()]
    metrics_names = ["accuracy","recall","precision"]
    epochs = 300
    batch_size = 200
else:
    # build a custom model
    num_hidden_layers = 3
    num_hidden_layer_nodes = [20, 10, 5]
    train_ratio = .7
    hidden_layer_activations = ['sigmoid', 'sigmoid', 'sigmoid', 'sigmoid', 'sigmoid']
    optimizer = 'sgd'
    learning_rate = .005
    loss = 'mean_squared_error'
    metrics = ["accuracy"]
    metrics_names = ["accuracy"]
    epochs = 300
    batch_size = 200

In [48]:
# Split the data into training and testing set by 70:30
ratio = 0.7
train, test = train_test_split(df, train_size=ratio, random_state=42)
train.reset_index(inplace=True, drop=True)
test.reset_index(inplace=True, drop=True)

In [49]:
# separate data into x and y - just random y for now..
train_X = train.loc[:,train.columns != 'trending?']
train_Y = train['trending?']
test_X = test.loc[:,test.columns != 'trending?']
test_Y = test['trending?']

In [50]:
# build the logistic regression model - need clean data...
from sklearn.linear_model import LogisticRegression
LR_model = LogisticRegression(multi_class='ovr')
LR_model.fit(train_X, train_Y)

ValueError: could not convert string to float: '"Awkward Puppets"'

In [17]:
# confusion matrix
from sklearn.metrics import confusion_matrix

pred_Y = LR_model.predict(test_X);
confusion_matrix(test_Y, pred_Y)

NameError: name 'test_X' is not defined

In [18]:
# evaluate the accuracy of the LR model
accuracy = LR_model.score(test_X, test_Y)
print(accuracy)

NameError: name 'test_X' is not defined

In [None]:
# build the ANN model
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Input

ANN_model = keras.Sequential()

# add input layerr
ANN_model.add(Input(shape=train_X.shape[1]))

# add hidden layers
for i in range(num_hidden_layers):
    ANN_model.add(Dense(num_hidden_layer_nodes[i], activation=hidden_layer_activations[i + 1]))

# add output layers
ANN_model.add(Dense(1, activation=hidden_layer_activations[len(hidden_layer_activations) - 1]))

ANN_model.compile(optimizer=optimizer, loss=loss, metrics=metrics)
ANN_model.summary()

In [None]:
# train the model
ANN_model.fit(train_X, train_Y, epochs=epochs, batch_size=batch_size)

In [None]:
# confusion matrix
pred_Y = ANN_model.predict_classes(test_X);
confusion_matrix(test_Y, pred_Y)

In [None]:
# report evaluation metrics 
evaluated_metrics = ANN_model.evaluate(test_X, test_Y)
for i in range(len(metrics)):
    print(metrics_names[i] + ": %.2f" % evaluated_metrics[i])