In [1]:
import datetime as dt
import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

import keras as ks
import tensorflow as tf

import sklearn
import sklearn.ensemble
import sklearn.metrics as metrics
import sklearn.model_selection
import sklearn.neighbors
import sklearn.preprocessing
import sklearn.svm
import sklearn.tree
from sklearn.feature_extraction.text import HashingVectorizer, TfidfVectorizer, CountVectorizer

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

# Helper functions

In [2]:
def get_popular_words(corpus, top_n):
    """Return the ``top_n`` most frequent terms in ``corpus``.

    Terms are counted with a ``CountVectorizer`` that drops scikit-learn's
    built-in English stop words. The ``(term, total_count)`` pairs that were
    selected are printed before returning.

    Args:
        corpus: Iterable of text documents.
        top_n: Maximum number of terms to return.

    Returns:
        list: Terms ordered by descending total count. Ties keep the
        vectorizer's own feature order because ``sorted`` is stable.
    """
    vectorizer = CountVectorizer(stop_words='english')
    counts = vectorizer.fit_transform(corpus)

    # get_feature_names() was removed in scikit-learn 1.2; use its
    # replacement when available and fall back on older installs.
    if hasattr(vectorizer, 'get_feature_names_out'):
        vocabulary = vectorizer.get_feature_names_out()
    else:
        vocabulary = vectorizer.get_feature_names()

    # Column sums give one total per term; asarray/ravel flattens the result
    # whether the sparse container returns a matrix or an ndarray.
    totals = np.asarray(counts.sum(axis=0)).ravel()

    top_words = sorted(zip(vocabulary, totals), key=lambda pair: pair[1], reverse=True)[:top_n]
    result = [pair[0] for pair in top_words]
    print(top_words)
    return result

In [3]:
def get_pw_from_file(filename, column, n_top):
    """Read a CSV and return the ``n_top`` most frequent words of ``column``.

    An ``all_text`` column (headline, abstract and keywords joined by single
    spaces) is always added first, so ``column`` may name either an original
    CSV column or ``'all_text'``. The chosen column is cast to ``str``,
    passed through ``lemmatize_column`` and ranked by ``get_popular_words``.

    Args:
        filename: Path of the CSV file to read.
        column: Name of the text column to analyse.
        n_top: Number of words to return.

    Returns:
        The list produced by ``get_popular_words``.
    """
    frame = pd.read_csv(filename)
    frame['all_text'] = frame['headline'] + " " + frame['abstract'] + " " + frame['keywords']

    wordnet = WordNetLemmatizer()
    english_stops = set(stopwords.words('english'))

    frame[column] = frame[column].astype(str)
    frame = lemmatize_column(frame, column, wordnet, english_stops)

    return get_popular_words(frame[column], n_top)

In [4]:
def process_sentence(sentence, lemmatizer, stop_words):
    """Normalise one sentence into a space-joined string of lemmas.

    The sentence is lower-cased and tokenised with ``word_tokenize``;
    repeated tokens are collapsed to their first occurrence; tokens that are
    not purely alphabetic or that appear in ``stop_words`` are dropped; the
    survivors are lemmatised.

    Args:
        sentence: Text to process (must be a ``str``).
        lemmatizer: Object exposing ``lemmatize(word)``.
        stop_words: Container of lower-case words to discard.

    Returns:
        str: Lemmas in first-occurrence order, separated by single spaces.
    """
    sentence = sentence.lower()
    # dict.fromkeys de-duplicates while keeping first-seen order. The previous
    # set() round-trip produced an order that depends on string hashing and
    # therefore changed from one interpreter session to the next.
    tokens = list(dict.fromkeys(word_tokenize(sentence)))
    # NOTE(review): de-duplication happens before lemmatisation, so two
    # surface forms sharing a lemma still emit that lemma twice - confirm
    # whether that is intended.
    words = [lemmatizer.lemmatize(word) for word in tokens if word.isalpha() and word not in stop_words]
    return " ".join(words)

In [5]:
def ord_encode(df, ordinal_features):
    """Replace the listed columns of ``df`` with their ordinal codes.

    A new ``OrdinalEncoder`` is fitted on the frame that is passed in, so the
    codes reflect only the categories present in that frame. The columns are
    overwritten on ``df`` itself, which is also returned.

    Args:
        df: DataFrame holding the columns to encode.
        ordinal_features: List of column names to encode.

    Returns:
        The same DataFrame with the listed columns encoded.
    """
    encoder = sklearn.preprocessing.OrdinalEncoder()
    codes = encoder.fit_transform(df[ordinal_features])
    df[ordinal_features] = codes
    return df

In [6]:
def encode_language_column(df, col_name, popular_words=None):
    """Append one count column per requested word to ``df``.

    ``df[col_name]`` is vectorised with a default ``CountVectorizer`` and the
    counts for ``popular_words`` are concatenated to the right of ``df``
    (whose index is reset to 0..n-1).

    Args:
        df: DataFrame containing the text column.
        col_name: Name of the text column to vectorise.
        popular_words: Words to keep, in output column order. ``None`` or an
            empty sequence adds no columns.

    Returns:
        A new DataFrame: the original columns followed by one integer count
        column per entry of ``popular_words``. A word that never occurs in
        this frame's text gets an all-zero column.
    """
    wanted = [] if popular_words is None else list(popular_words)

    vectorizer = CountVectorizer()
    counts = vectorizer.fit_transform(df[col_name])

    # get_feature_names() was removed in scikit-learn 1.2; fall back to it
    # only on installs that lack the replacement.
    if hasattr(vectorizer, 'get_feature_names_out'):
        vocabulary = vectorizer.get_feature_names_out()
    else:
        vocabulary = vectorizer.get_feature_names()

    # toarray() replaces the `.A` shorthand, which newer SciPy no longer provides.
    all_terms = pd.DataFrame(counts.toarray(), columns=vocabulary)

    # reindex (rather than plain column selection) so that a requested word
    # missing from this frame's vocabulary becomes a zero column instead of a
    # KeyError - e.g. a word learned on the training file that never appears
    # in the test file.
    encoded_col = all_terms.reindex(columns=wanted, fill_value=0)

    return pd.concat([df.reset_index(drop=True), encoded_col.reset_index(drop=True)], axis=1)

In [7]:
def lemmatize_column(df, col_name, lemmatizer, stop_words):
    """Run ``process_sentence`` over every value of ``df[col_name]``.

    The column is overwritten on ``df`` itself, which is also returned.

    Args:
        df: DataFrame holding the text column.
        col_name: Name of the column to rewrite.
        lemmatizer: Passed through to ``process_sentence``.
        stop_words: Passed through to ``process_sentence``.

    Returns:
        The same DataFrame with ``col_name`` replaced by the processed text.
    """
    def _normalise(text):
        return process_sentence(text, lemmatizer, stop_words)

    df[col_name] = df[col_name].map(_normalise)
    return df

In [8]:
def open_and_preprocess(filename):
    """Load an article CSV and derive the base feature columns.

    Adds ``week_day`` and ``pub_hour`` (both parsed from ``pub_date``) and
    ``all_text`` (headline, abstract and keywords joined by single spaces),
    ordinal-encodes ``newsdesk``, ``section`` and ``material`` through
    ``ord_encode``, then drops the identifier, subsection, date and raw text
    columns.

    Args:
        filename: Path of the CSV file to read.

    Returns:
        The preprocessed DataFrame.
    """
    def _as_datetime(raw):
        # Same per-value conversion for both derived date features.
        return pd.Timestamp.to_pydatetime(pd.Timestamp(raw))

    frame = pd.read_csv(filename)

    # Three derived columns, appended in this order.
    pub_dates = frame['pub_date']
    frame['week_day'] = pub_dates.map(lambda raw: _as_datetime(raw).weekday())
    frame['pub_hour'] = pub_dates.map(lambda raw: _as_datetime(raw).hour)
    frame['all_text'] = frame['headline'] + " " + frame['abstract'] + " " + frame['keywords']

    # Categorical columns become ordinal codes.
    frame = ord_encode(frame, ['newsdesk', 'section', 'material'])

    no_longer_needed = ['uniqueID', 'subsection', 'pub_date', 'headline', 'abstract', 'keywords']
    return frame.drop(columns=no_longer_needed)

In [9]:
def process_column(df, column, n_top, popular_words = []):
    """Turn a text column into per-word count columns.

    The column is cast to ``str`` and lemmatised; if no ``popular_words`` are
    supplied, the ``n_top`` most frequent words of the column are computed
    with ``get_popular_words``. One count column per word is then appended by
    ``encode_language_column`` and the text column itself is dropped.

    Args:
        df: DataFrame containing ``column``.
        column: Name of the text column to encode.
        n_top: Number of words to derive when ``popular_words`` is empty.
        popular_words: Words to encode; an empty sequence means "derive them
            from this frame".

    Returns:
        DataFrame with the word-count columns appended and ``column`` removed.
    """
    wordnet = WordNetLemmatizer()
    english_stops = set(stopwords.words('english'))

    df[column] = df[column].astype(str)
    df = lemmatize_column(df, column, wordnet, english_stops)

    if not len(popular_words):
        popular_words = get_popular_words(df[column], n_top)

    encoded = encode_language_column(df, column, popular_words)
    return encoded.drop(columns=[column])

In [10]:
def base_optimize_DT(X_train, y_train, k=10, random_state=None):
    """Grid-search decision-tree hyperparameters with k-fold cross-validation.

    Args:
        X_train: Training feature matrix.
        y_train: Training labels aligned with ``X_train``.
        k: Number of cross-validation folds. Defaults to 10, the value that
            used to be hard-coded.
        random_state: Seed forwarded to ``DecisionTreeClassifier``. The
            default ``None`` keeps the earlier unseeded behaviour.

    Returns:
        dict: The best parameter combination found (``best_params_``). It is
        also printed, as before; returning it lets callers reuse the result
        without copying it from the cell output.
    """
    # Hyperparameters to tune:
    params = {
        'min_samples_split': [2, 5, 10],
        'criterion': ['gini', 'entropy'],
        'max_depth': [5, 10, 20],
        'min_samples_leaf': [1, 2, 5, 10],
    }

    # Grid search over a decision tree classifier, scored on accuracy.
    grid_tree = sklearn.model_selection.GridSearchCV(
        estimator=sklearn.tree.DecisionTreeClassifier(random_state=random_state),
        param_grid=params,
        cv=k,
        return_train_score=True,
        scoring='accuracy',
        refit='accuracy',
    )

    # Train and cross-validate.
    grid_tree.fit(X_train, y_train)

    best_hyperparams = grid_tree.best_params_
    print(best_hyperparams)
    return best_hyperparams

In [11]:
def base_optimize_KNN(X_train, y_train, k=10):
    """Grid-search k-nearest-neighbours hyperparameters with k-fold CV.

    Args:
        X_train: Training feature matrix.
        y_train: Training labels aligned with ``X_train``.
        k: Number of cross-validation folds. Defaults to 10, the value that
            used to be hard-coded.

    Returns:
        dict: The best parameter combination found (``best_params_``). It is
        also printed, as before; returning it lets callers reuse the result
        without copying it from the cell output.
    """
    # Hyperparameters to tune:
    params = {
        'n_neighbors': [3, 5, 8, 10, 15],
        'weights': ['uniform', 'distance'],
    }

    # Grid search over a k-nearest-neighbours classifier, scored on accuracy.
    grid_knn = sklearn.model_selection.GridSearchCV(
        estimator=sklearn.neighbors.KNeighborsClassifier(),
        param_grid=params,
        cv=k,
        return_train_score=True,
        scoring='accuracy',
        refit='accuracy',
    )

    # Train and cross-validate.
    grid_knn.fit(X_train, y_train)

    best_hyperparams = grid_knn.best_params_
    print(best_hyperparams)
    return best_hyperparams

In [12]:
def base_optimize_SVM(X_train, y_train, k=10):
    """Grid-search support-vector-classifier hyperparameters with k-fold CV.

    Args:
        X_train: Training feature matrix (samples x features).
        y_train: Training labels aligned with ``X_train``.
        k: Number of cross-validation folds. Defaults to 10, the value that
            used to be hard-coded.

    Returns:
        dict: The best parameter combination found (``best_params_``). It is
        also printed, as before; returning it lets callers reuse the result
        without copying it from the cell output.
    """
    # Hyperparameters to tune.
    # 'precomputed' is deliberately not searched: that kernel expects a square
    # sample-by-sample kernel matrix rather than a feature matrix, so every
    # fit using it on X_train fails and can never be selected.
    params = {
        'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
        'decision_function_shape': ['ovo', 'ovr'],
    }

    # Grid search over a support vector classifier, scored on accuracy.
    grid_svm = sklearn.model_selection.GridSearchCV(
        estimator=sklearn.svm.SVC(),
        param_grid=params,
        cv=k,
        return_train_score=True,
        scoring='accuracy',
        refit='accuracy',
    )

    # Train and cross-validate.
    grid_svm.fit(X_train, y_train)

    best_hyperparams = grid_svm.best_params_
    print(best_hyperparams)
    return best_hyperparams

# Start here

In [13]:
n_top = 150
text_column_to_change = 'all_text'

# Learn the vocabulary from the training file once and hand the same list to
# both splits, so train and test are encoded with identical word columns in
# identical order (and the word ranking is computed and printed only once).
popular_words = get_pw_from_file('train.csv', text_column_to_change, n_top)

df = open_and_preprocess("train.csv")
df = process_column(df, text_column_to_change, n_top, popular_words)
y_train = df['is_popular']
# NOTE: REMOVING word_count DRASTICALLY IMPROVES ACCURACY
X_train = df.drop(['is_popular', 'n_comments', 'word_count'], axis=1)

ts = open_and_preprocess("test.csv")
ts = process_column(ts, text_column_to_change, n_top, popular_words)
y_test = ts['is_popular']
# Select exactly the training features, in training order. A column missing
# from the test frame raises KeyError here instead of surfacing later as a
# shape mismatch inside a model.
X_test = ts[X_train.columns]

In [None]:
# Hyperparameter search: each helper runs a 10-fold grid search on the
# training data and prints the best parameter set it finds. Only the SVM
# search is enabled here; uncomment a line to re-run that model's search.
# base_optimize_DT(X_train, y_train)
# base_optimize_KNN(X_train, y_train)
base_optimize_SVM(X_train, y_train)

In [26]:
# PREDICTING WITH KNN
# Parameter sets noted for each text input (presumably the grid-search output):
#   only headline: {'n_neighbors': 15, 'weights': 'uniform'}
#   all_text: {'n_neighbors': 15, 'weights': 'distance'}
knn = sklearn.neighbors.KNeighborsClassifier(weights='distance', n_neighbors=15)
knn.fit(X_train, y_train)

y_test_pred = knn.predict(X_test)
accuracy = metrics.accuracy_score(y_true=y_test, y_pred=y_test_pred)
print(accuracy)

0.6823529411764706


In [27]:
# PREDICTING WITH DECISION TREE
# The classifier is bound to `tree_clf` rather than `dt`, because `dt` is the
# alias of the `datetime` module imported at the top of the notebook and must
# not be overwritten.
# random_state pins the tree's internal tie-breaking so the reported accuracy
# is the same on every run.
tree_clf = sklearn.tree.DecisionTreeClassifier(
    criterion='entropy',
    max_depth=10,
    min_samples_leaf=10,
    min_samples_split=2,
    random_state=42,
)
tree_clf.fit(X_train, y_train)
y_test_pred = tree_clf.predict(X_test)
accuracy = sklearn.metrics.accuracy_score(y_test, y_test_pred)
print(accuracy)

# only headline: {'criterion': 'entropy', 'max_depth': 20, 'min_samples_leaf': 10, 'min_samples_split': 5}
# all_text: {'criterion': 'entropy', 'max_depth': 10, 'min_samples_leaf': 10, 'min_samples_split': 2}

0.6645807259073843


In [30]:
# PREDICTING WITH SVM
# Support vector classifier with scikit-learn's default settings.
svm = sklearn.svm.SVC()
svm.fit(X_train, y_train)

y_test_pred = svm.predict(X_test)
accuracy = metrics.accuracy_score(y_true=y_test, y_pred=y_test_pred)
print(accuracy)

0.6795994993742178


In [28]:
# Number of feature columns; used as the network's input width.
shape = len(X_train.columns)

In [29]:
# Fully connected classifier: four ReLU hidden layers narrowing from 256 to 32
# units, then a 2-way softmax output.
# NOTE(review): no random seed is set anywhere before training, so the
# reported accuracy will vary between runs.
model = ks.models.Sequential([
    ks.layers.Flatten(input_shape=[shape]),
    ks.layers.Dense(256, activation="relu"),
    ks.layers.Dense(128, activation="relu"),
    ks.layers.Dense(64, activation="relu"),
    ks.layers.Dense(32, activation="relu"),
    ks.layers.Dense(2, activation="softmax"),
])
model.compile(
    optimizer="adam",
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"],
)
model.fit(
    X_train,
    y_train,
    epochs=30,
    batch_size=128,
    validation_split=0.1,
)

# Predicted class = index of the larger softmax output for each row.
test_predictions = model.predict(X_test).argmax(axis=1)
test_accuracy = metrics.accuracy_score(y_test, test_predictions)
print(f"The test accuracy is {test_accuracy}")

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
The test accuracy is 0.7294117647058823
