In [16]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns 
import re
import datetime as dt

import keras as ks
import tensorflow as tf

import sklearn
import sklearn.preprocessing
import sklearn.model_selection
import sklearn.neighbors
import sklearn.metrics as metrics
from sklearn.feature_extraction.text import HashingVectorizer

In [17]:
def open_and_preprocess(filename):
    df = pd.read_csv(filename)
    df['abstract'] = df['abstract'].astype(str)
    df['week_day'] = df['pub_date'].map(lambda x: pd.Timestamp.to_pydatetime(pd.Timestamp(x)).weekday())
    df['pub_hour'] = df['pub_date'].map(lambda x: pd.Timestamp.to_pydatetime(pd.Timestamp(x)).hour)
    
    # List of the column names with features that should be ordinal encoded
    ordinal_features = ['newsdesk', 'section', 'material']

    # Ordinal encode all of these features
    ordinal = sklearn.preprocessing.OrdinalEncoder()
    df[ordinal_features] = ordinal.fit_transform(df[ordinal_features])

    vectorizer = HashingVectorizer(n_features=2**4)

#     # abstract
#     encoded_abstract = pd.DataFrame(vectorizer.fit_transform(df['abstract']).A)
#     df = pd.concat([df.reset_index(drop=True), encoded_abstract.reset_index(drop=True)], axis=1)

    # headline
    encoded_headline = pd.DataFrame(vectorizer.fit_transform(df['headline']).A)
    df = pd.concat([df.reset_index(drop=True), encoded_headline.reset_index(drop=True)], axis=1)
    
    df = df.drop(['uniqueID', 'subsection', 'pub_date', 'headline', 'keywords', 'abstract'], axis=1)
    print(df.columns)
    
    return df

In [18]:
df = open_and_preprocess("train.csv")

Index([  'newsdesk',    'section',   'material', 'word_count', 'is_popular',
       'n_comments',   'week_day',   'pub_hour',            0,            1,
                  2,            3,            4,            5,            6,
                  7,            8,            9,           10,           11,
                 12,           13,           14,           15],
      dtype='object')


In [19]:
y = df['is_popular']
# NOTE: REMOVING word_count DRASTICALLY IMPROVES ACCURACY
X = df.drop(['is_popular', 'word_count', 'n_comments'], axis=1)

In [20]:
X.columns

Index(['newsdesk',  'section', 'material', 'week_day', 'pub_hour',          0,
                1,          2,          3,          4,          5,          6,
                7,          8,          9,         10,         11,         12,
               13,         14,         15],
      dtype='object')

In [21]:
X.shape

(12792, 21)

In [22]:
# PREDICTING WITH KNN

# 80/20 train/test split
X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(X, y, test_size=0.2, shuffle=True)

# create classifiers
knn = sklearn.neighbors.KNeighborsClassifier(n_neighbors=5)

# train classifiers
knn.fit(X_train, y_train)

# predict
y_test_pred = knn.predict(X_test)

accuracy = sklearn.metrics.accuracy_score(y_test, y_test_pred)
print(accuracy)

0.7311449785072294


In [23]:
print(X_train.shape)
print(X_test.shape)

print(len(y_train))
print(len(y_test))

print(np.unique(y_train, return_counts=True)[1])
print(np.unique(y_test, return_counts=True)[1])

(10233, 21)
(2559, 21)
10233
2559
[5152 5081]
[1292 1267]


In [24]:
shape = X_train.shape[1]

In [25]:
model = ks.models.Sequential()
model.add(ks.layers.Flatten(input_shape=[shape]))
model.add(ks.layers.Dense(256, activation="relu"))
model.add(ks.layers.Dense(128, activation="relu"))
model.add(ks.layers.Dense(64, activation="relu"))
model.add(ks.layers.Dense(32, activation="relu"))
model.add(ks.layers.Dense(2, activation="softmax"))
model.compile(loss="sparse_categorical_crossentropy", optimizer="adam", metrics=["accuracy"])
model.fit(X_train, y_train, batch_size=128, epochs=30, validation_split=0.1)
test_predictions = np.argmax(model.predict(X_test), axis=1)
test_accuracy = metrics.accuracy_score(y_test, test_predictions)
print(f"The test accuracy is {test_accuracy}")

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
The test accuracy is 0.722547870261821


In [26]:
# Predicting on the test set:
ts = open_and_preprocess("test.csv")

Index([  'newsdesk',    'section',   'material', 'word_count', 'is_popular',
         'week_day',   'pub_hour',            0,            1,            2,
                  3,            4,            5,            6,            7,
                  8,            9,           10,           11,           12,
                 13,           14,           15],
      dtype='object')


In [27]:
labels = ts['is_popular']
ts = ts.drop(['is_popular', 'word_count'], axis=1)

In [28]:
# SHALLOW MODEL PREDICTION
y_test_pred = knn.predict(ts)
accuracy = sklearn.metrics.accuracy_score(labels, y_test_pred)
print(f"The test accuracy for the shallow model on the test set is {accuracy}")

The test accuracy for the shallow model on the test set is 0.609261576971214


In [29]:
# SEQUENTIAL MODEL PREDICTION
test_predictions = np.argmax(model.predict(ts), axis=1)
test_accuracy = metrics.accuracy_score(labels, test_predictions)
print(f"The test accuracy for the sequential model on the test set is {test_accuracy}")

The test accuracy for the sequential model on the test set is 0.7158948685857321
