In [None]:
import pandas as pd # Library to load tabular data
# Show up to 50 columns when displaying wide DataFrames.
# The fully-qualified key is required: on recent pandas the bare 'max_columns'
# pattern matches more than one option and set_option raises OptionError.
pd.set_option('display.max_columns', 50)

# Loading the data (no changes)

Note that here I'm pretending that 10% of the data is unlabeled: every 10th row is held out as the test set, with its label column removed.

In [None]:
# Read the NY Times comments sample straight from GitHub
DATA_URL = 'https://raw.githubusercontent.com/mkleinbort/resource-datasets/master/ny_times_comments/CommentsApril2017_9000_interesting_columns.csv'
all_data = pd.read_csv(DATA_URL)

# Rows whose index is a multiple of 10 play the role of "unlabeled" data,
# so the label column is dropped from them; everything else is training data.
is_holdout = all_data.index % 10 == 0
data_train = all_data.loc[~is_holdout]
data_test = all_data.loc[is_holdout].drop(columns=['editorsSelection'])

In [None]:
data_train.shape # (rows, columns) of the training split

In [None]:
data_test.shape # (rows, columns) of the held-out split; one column fewer because the label was dropped

# What do we want to learn? (no changes)

In [None]:
y = data_train['editorsSelection'] # Target: did the NY Times editor choose to feature the comment?

# What do we have available to make this prediction?

Here we need to think a little, and figure out what will be available at the time when we want to make our prediction. We have many features including:
- approveDate
- commentBody
- commentType
- createDate
- parentUserDisplayName
- replyCount
- sharing
- timespeople
- trusted
- updateDate
- userDisplayName
- userLocation
- inReplyTo
- sectionName
- newDesk
- articleWordCount
- printPage
- typeOfMaterial


But many of these features are only available AFTER the prediction would have been useful. For example, is it useful to predict whether a comment will be featured by the editor if we MUST know ahead of time how many people will reply to the comment?

For right now, let's assume we only know:
- The articleWordCount
- The typeOfMaterial
- The commentBody
- The commentType

# Model Part 1: Prepare the data

Somehow we need to make X into a table of only numbers. However, X is far from being numeric right now

In [None]:
# Columns we assume are known at prediction time
FEATURES = [
    'articleWordCount',
    'typeOfMaterial',
    'commentType',
    'commentBody',
]

# Keep every row, but only the feature columns
X = data_train[FEATURES]

X.head()

# Text Vectorization

We need to make each row into only numbers. Here is one approach:

In [None]:
import tensorflow as tf
import tensorflow_hub as hub

# Takes a long time: loads the Universal Sentence Encoder (v4) from the TF Hub URL below.
# `embed` is then called with a list of strings to obtain their embeddings.
embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")

In [None]:

# Sanity check: embed two sample sentences and inspect what the encoder returns
embeddings = embed([
    "The quick brown fox jumps over the lazy dog.",
    "I am a sentence for which I would like to get its embedding"])

print(embeddings)

In [None]:
# Numeric features can be used as they are
X_numeric = X.select_dtypes('number')

# Embed every comment; `.numpy()` converts the returned tensor into a plain
# array that pandas can wrap directly.
X_text_values = embed(X['commentBody'].to_list()).numpy()

# The sentence encoder exposes no vocabulary, so name the columns by
# embedding dimension (one column per dimension of the returned array).
text_columns = [f'embedding_{i}' for i in range(X_text_values.shape[1])]
X_text = pd.DataFrame(data=X_text_values, columns=text_columns, index=X.index)

X_text.head()

In [None]:
# Put the numeric and text-embedding features side by side (axis=1 concatenates columns)
X_train = pd.concat([X_numeric, X_text], axis=1)

# Model Part 1 - Continued: Prepare the test data in exactly the same way

It's very easy for things to go wrong here. As a gut feeling, most hard-to-debug issues come from the test data being processed slightly differently from the training data.

In [None]:
# Again, this is not "good" code, just showing how it could be done

# Same steps as for the training data: numeric columns as-is ...
X_numeric_test = data_test.loc[:, FEATURES].select_dtypes('number')

# ... and sentence embeddings for the comment text (`.numpy()` converts the
# returned tensor into a plain array that pandas can wrap directly).
X_text_values_test = embed(data_test['commentBody'].to_list()).numpy()

# The sentence encoder exposes no vocabulary, so name the columns by
# embedding dimension (one column per dimension of the returned array).
text_columns_test = [f'embedding_{i}' for i in range(X_text_values_test.shape[1])]
X_text_test = pd.DataFrame(data=X_text_values_test, columns=text_columns_test, index=data_test.index)


X_test = pd.concat([X_numeric_test, X_text_test], axis=1)

X_test.head()

In [None]:
# Quick check: both tables must expose the same columns in the same order.
# Index.equals also handles a differing number of columns, where an
# element-wise `==` would raise instead of failing the assertion.
assert X_train.columns.equals(X_test.columns), 'train and test feature columns differ'

# Model Part 2: Building and training an ML model

In [None]:
from catboost import CatBoostClassifier
from sklearn.model_selection import GridSearchCV

# One CatBoost configuration, scored with 3-fold cross-validation.
# The empty parameter grid means the search evaluates just this one candidate.
base_classifier = CatBoostClassifier(n_estimators=500, metric_period=100)
model = GridSearchCV(base_classifier, param_grid={}, cv=3)

model.fit(X_train, y)

# Part 3: Estimating model performance and making predictions

In [None]:
y.value_counts(normalize=True) # Share of each class: the baseline to beat (the largest share is the accuracy of always guessing the majority class)

In [None]:
# Cross-validation results as a table.
# This trained only one model, with estimated accuracy of 67% (as seen in column: 'mean_test_score')
pd.DataFrame(model.cv_results_)

This is good in comparison to a random guess, which would only have been 50% accurate.

### Making Predictions

In [None]:
# Predict on the held-out rows and label each prediction with its original row index
prediction_values = model.predict(X_test)
predictions = pd.Series(data=prediction_values, index=X_test.index)

predictions.head()