In [1]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns 
import re
import datetime as dt

import keras as ks
import tensorflow as tf

import sklearn
import sklearn.preprocessing
import sklearn.model_selection
import sklearn.neighbors
import sklearn.ensemble
import sklearn.metrics as metrics
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

In [2]:
def get_popular_words(corpus, top_n):
    vectorizer = CountVectorizer(stop_words='english')
    X = vectorizer.fit_transform(corpus)
    feature_array = vectorizer.get_feature_names()
    top_words = sorted(list(zip(vectorizer.get_feature_names(), X.sum(0).getA1())), key=lambda x: x[1], reverse=True)[:top_n]
    result = [x[0] for x in top_words]
    print(top_words)
    return result

In [3]:
def get_pw_from_file(filename, column, n_top):
    df = pd.read_csv(filename)
    
    lemmatizer = WordNetLemmatizer()
    stop_words = set(stopwords.words('english')) 
    df[column] = df[column].astype(str)
    df = lemmatize_column(df, column, lemmatizer, stop_words)
    popular_words = get_popular_words(df[column], n_top)
    
    return popular_words

In [4]:
def process_sentence(sentence, lemmatizer, stop_words):
    sentence = sentence.lower()
    tokens = list(set(word_tokenize(sentence)))   
    words = [lemmatizer.lemmatize(word) for word in tokens if word.isalpha() and word not in stop_words]
    return " ".join(words)

In [5]:
def ord_encode(df, ordinal_features):
    # Ordinal encode all of these features
    ordinal = sklearn.preprocessing.OrdinalEncoder()
    df[ordinal_features] = ordinal.fit_transform(df[ordinal_features])
    return df

In [6]:
def encode_language_column(df, col_name, popular_words = []):
    vectorizer = CountVectorizer()
    nc = vectorizer.fit_transform(df[col_name])
    encoded_col = pd.DataFrame(nc.A, columns=vectorizer.get_feature_names())[popular_words]
    df = pd.concat([df.reset_index(drop=True), encoded_col.reset_index(drop=True)], axis=1)
    return df

In [7]:
def lemmatize_column(df, col_name, lemmatizer, stop_words):
    df[col_name] = df[col_name].map(lambda x: process_sentence(x, lemmatizer, stop_words))
    return df

In [8]:
def open_and_preprocess(filename):
    df = pd.read_csv(filename)
    # create 2 new columns
    df['week_day'] = df['pub_date'].map(lambda x: pd.Timestamp.to_pydatetime(pd.Timestamp(x)).weekday())
    df['pub_hour'] = df['pub_date'].map(lambda x: pd.Timestamp.to_pydatetime(pd.Timestamp(x)).hour)
    # ordinal encode
    df = ord_encode(df, ['newsdesk', 'section', 'material'])
    
    df = df.drop(['uniqueID', 'subsection', 'pub_date', 'headline', 'abstract'], axis=1)
    
    return df

In [9]:
def process_column(df, column, n_top, popular_words = []):
    lemmatizer = WordNetLemmatizer()
    stop_words = set(stopwords.words('english')) 
    df[column] = df[column].astype(str)
    df = lemmatize_column(df, column, lemmatizer, stop_words)
    if len(popular_words) == 0:
        popular_words = get_popular_words(df[column], n_top)
    df = encode_language_column(df, column, popular_words)
    df = df.drop([column], axis=1)
    return df

In [10]:
df = open_and_preprocess("train.csv")
df = process_column(df, 'keywords', 100)

[('state', 3521), ('government', 2529), ('donald', 2080), ('politics', 2015), ('election', 1496), ('york', 1116), ('life', 1075), ('party', 1069), ('city', 870), ('jr', 823), ('people', 706), ('joseph', 686), ('estate', 669), ('ny', 638), ('housing', 632), ('protest', 620), ('residential', 613), ('service', 556), ('medium', 553), ('culture', 547), ('department', 546), ('reopenings', 539), ('movement', 537), ('program', 531), ('job', 516), ('institutional', 512), ('economy', 499), ('relation', 468), ('floyd', 444), ('literature', 421), ('book', 414), ('ethnicity', 411), ('tv', 410), ('medical', 404), ('international', 389), ('fatality', 376), ('nyc', 362), ('health', 360), ('security', 359), ('girl', 342), ('university', 338), ('puzzle', 337), ('misconduct', 333), ('public', 332), ('matter', 329), ('shooting', 323), ('act', 322), ('caucus', 322), ('manhattan', 317), ('force', 316), ('military', 289), ('internet', 288), ('cookbook', 286), ('riot', 286), ('bernard', 285), ('disease', 284)

In [11]:
df

Unnamed: 0,newsdesk,section,material,word_count,is_popular,n_comments,week_day,pub_hour,state,government,...,world,china,play,prevention,warming,movie,concern,stephen,elizabeth,war
0,11.0,17.0,0.0,680,1,186,2,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,15.0,7.0,4.0,931,1,257,2,3,0,0,...,0,0,0,0,0,0,0,0,0,0
2,41.0,22.0,4.0,1057,0,6,2,5,0,0,...,0,0,0,0,0,0,0,0,0,0
3,41.0,22.0,1.0,0,0,2,2,5,0,0,...,0,0,0,0,0,0,0,0,0,0
4,41.0,22.0,4.0,1156,0,25,2,5,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12787,14.0,39.0,4.0,1297,1,143,2,18,0,1,...,0,0,0,0,0,0,0,0,0,0
12788,34.0,17.0,7.0,88,0,33,2,20,0,0,...,0,0,0,0,0,0,0,0,0,0
12789,11.0,17.0,0.0,991,1,1516,2,21,0,0,...,0,0,0,0,0,0,0,0,0,0
12790,39.0,35.0,4.0,1709,1,702,2,23,1,1,...,0,0,0,0,0,0,0,0,0,0


In [12]:
y = df['is_popular']
# NOTE: REMOVING word_count DRASTICALLY IMPROVES ACCURACY
X = df.drop(['is_popular', 'n_comments', 'word_count'], axis=1)

In [13]:
X.shape

(12792, 105)

In [14]:
# PREDICTING WITH KNN

# 80/20 train/test split
X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(X, y, test_size=0.2, shuffle=True)

# create classifiers
knn = sklearn.neighbors.KNeighborsClassifier(n_neighbors=5)

# train classifiers
knn.fit(X_train, y_train)

# predict
y_test_pred = knn.predict(X_test)

accuracy = sklearn.metrics.accuracy_score(y_test, y_test_pred)
print(accuracy)

0.738569753810082


In [15]:
print(X_train.shape)
print(X_test.shape)

print(len(y_train))
print(len(y_test))

print(np.unique(y_train, return_counts=True)[1])
print(np.unique(y_test, return_counts=True)[1])

(10233, 105)
(2559, 105)
10233
2559
[5170 5063]
[1274 1285]


In [16]:
shape = X_train.shape[1]

In [17]:
model = ks.models.Sequential()
model.add(ks.layers.Flatten(input_shape=[shape]))
model.add(ks.layers.Dense(256, activation="relu"))
model.add(ks.layers.Dense(128, activation="relu"))
model.add(ks.layers.Dense(64, activation="relu"))
model.add(ks.layers.Dense(32, activation="relu"))
model.add(ks.layers.Dense(2, activation="softmax"))
model.compile(loss="sparse_categorical_crossentropy", optimizer="adam", metrics=["accuracy"])
model.fit(X_train, y_train, batch_size=128, epochs=30, validation_split=0.1)
test_predictions = np.argmax(model.predict(X_test), axis=1)
test_accuracy = metrics.accuracy_score(y_test, test_predictions)
print(f"The test accuracy is {test_accuracy}")

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
The test accuracy is 0.7346619773348965


In [28]:
# Predicting on the test set:
ts = open_and_preprocess("test.csv")
ts = process_column(ts, 'abstract', 300, get_pw_from_file('train.csv', 'abstract', 300))

labels = ts['is_popular']
ts = ts.drop(['is_popular', 'word_count'], axis=1)

# SHALLOW MODEL PREDICTION
y_test_pred = knn.predict(ts)
accuracy = sklearn.metrics.accuracy_score(labels, y_test_pred)
print(f"The test accuracy for the shallow model on the test set is {accuracy}")

# SEQUENTIAL MODEL PREDICTION
test_predictions = np.argmax(model.predict(ts), axis=1)
test_accuracy = metrics.accuracy_score(labels, test_predictions)
print(f"The test accuracy for the sequential model on the test set is {test_accuracy}")

[('new', 1450), ('coronavirus', 1066), ('president', 961), ('trump', 726), ('pandemic', 716), ('state', 646), ('time', 609), ('people', 572), ('city', 564), ('said', 497), ('year', 490), ('york', 479), ('home', 445), ('country', 417), ('american', 410), ('say', 408), ('like', 384), ('make', 370), ('week', 369), ('world', 368), ('life', 358), ('way', 341), ('health', 321), ('help', 309), ('virus', 309), ('case', 303), ('official', 299), ('student', 298), ('work', 289), ('house', 273), ('day', 262), ('business', 258), ('school', 255), ('woman', 253), ('need', 252), ('company', 250), ('look', 241), ('death', 240), ('public', 239), ('outbreak', 234), ('family', 226), ('child', 225), ('change', 223), ('long', 220), ('crisis', 214), ('government', 214), ('million', 213), ('social', 211), ('biden', 208), ('black', 208), ('month', 196), ('political', 195), ('story', 195), ('worker', 195), ('right', 189), ('white', 189), ('want', 187), ('republican', 184), ('party', 180), ('plan', 178), ('come'