In [1]:
import pandas as pd # Library to load tabular data
pd.set_option('max_columns', 50)

# Loading the data (no changes)

Note that here I'm pretending like 10% of the data is unlabeled

In [2]:
all_data = pd.read_csv('https://raw.githubusercontent.com/mkleinbort/resource-datasets/master/ny_times_comments/CommentsApril2017_9000_interesting_columns.csv')

data_train = all_data.loc[lambda x: x.index%10 != 0]
data_test = all_data.loc[lambda x: x.index%10 == 0].drop(columns=['editorsSelection'])

In [3]:
data_train.shape

(8100, 22)

In [4]:
data_test.shape

(900, 21)

# What do we want to learn? (no changes)

In [6]:
y = data_train['editorsSelection'] # Did the NY Times Editor choose to feature the comment

# What do we have available to make this prediction?

Here we need to think a little, and figure out what will be available at the time when we want to make our prediction. We have many features including:
- approveDate
- commentBody
- commentType
- createDate
- parentUserDisplayName
- replyCoun
- sharing
- timespeople
- trusted
- updateDate
- userDisplayNam
- userLocation
- inReplyTo
- sectionName
- newDesk
- articleWordCount
- printPage
- typeOfMaterial


But many of these features are only available AFTER the prediction would have been useful. For example, is it useful to predict whether a comment will be featured by the editor if we MUST know ahead of time how many people will reply to the comment?

For right now, let's assume we only know:
- The articleWordCount
- The typeOfMaterial
- The commentBody
- The commentType

# Model Part 1: Prepare the data

Somehow we need to make X into a table of only numbers. However, X is far from being numeric right now

In [7]:
FEATURES = ['articleWordCount','typeOfMaterial','commentType', 'commentBody']

X = data_train.loc[:, FEATURES] # Select all rows, and only these columns

X.head()

Unnamed: 0,articleWordCount,typeOfMaterial,commentType,commentBody
1,828.0,Op-Ed,comment,Mining for coal and oil have caused untold dis...
2,648.0,News,comment,Congratulations NYT for excellent work for you...
3,1226.0,News,comment,"If you want to beat Facebook, just provide vid..."
4,4611.0,News,comment,will trump turn out to be the American Right's...
5,1358.0,Op-Ed,comment,King was not the happy warrior that Common Wis...


# Bag of Words

We need to make each row into only numbers. Here is one approach:

In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

In [17]:
vectorizer = CountVectorizer(stop_words=['and','but','or','them','he','they','for','to','the','these','are','from','of','in','we','is','our','your'],  # Or use a pre-existing list
                             min_df=50 # Minumum amount of times a word has to appear to be included as a column
                            ) 

X_numeric = X.select_dtypes('number')

X_text_values = vectorizer.fit_transform(X['commentBody']).todense()
X_text = pd.DataFrame(data=X_text_values, columns=vectorizer.vocabulary_, index=X.index)

X_text.head()

Unnamed: 0,coal,oil,have,caused,both,environment,events,climate,change,issue,think,disaster,mexico,br,need,energy,even,without,what,happening,nyt,excellent,work,international,efforts,...,opposition,helped,recent,budget,danger,decided,shows,twice,pressure,decisions,propaganda,sides,model,student,generation,adults,previous,hour,fault,century,maintain,reasonable,positions,area,opposed
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0
3,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [18]:
X_train = pd.concat([X_numeric, X_text], axis=1)

# Part Model Part 1 - Continued: Prepare the test data In exactly the same way 

It's very easy for things to go wrong here. As a gut feeling, most of all hard to debug issues come from the test data being processed slightly differently from the training data.

In [22]:
# Again, this is not "good" code, just showing how it could be done


X_numeric_test = data_test.loc[:, FEATURES].select_dtypes('number')

X_text_values_test = vectorizer.transform(data_test['commentBody']).todense() # Use .transform(), not .fit_transform()
X_text_test = pd.DataFrame(data=X_text_values_test, columns=vectorizer.vocabulary_, index=data_test.index)


X_test = pd.concat([X_numeric_test, X_text_test], axis=1)

X_test.head()

Unnamed: 0,articleWordCount,coal,oil,have,caused,both,environment,events,climate,change,issue,think,disaster,mexico,br,need,energy,even,without,what,happening,nyt,excellent,work,international,...,opposition,helped,recent,budget,danger,decided,shows,twice,pressure,decisions,propaganda,sides,model,student,generation,adults,previous,hour,fault,century,maintain,reasonable,positions,area,opposed
0,1230.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
10,1418.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
20,1250.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
30,3770.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
40,1376.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [25]:
assert all(X_train.columns == X_test.columns) # Quick Check

# Model Part 2: Building and training an ML model

In [29]:
from catboost import CatBoostClassifier
from sklearn.model_selection import GridSearchCV

model = GridSearchCV(CatBoostClassifier(n_estimators=500, metric_period=100), param_grid={}, cv=3)

model.fit(X_train, y)

Learning rate set to 0.039968
0:	learn: 0.6901398	total: 23.5ms	remaining: 11.7s
100:	learn: 0.5975183	total: 1.63s	remaining: 6.43s
200:	learn: 0.5694631	total: 3.2s	remaining: 4.76s
300:	learn: 0.5310922	total: 4.74s	remaining: 3.13s
400:	learn: 0.4955140	total: 6.23s	remaining: 1.54s
499:	learn: 0.4663630	total: 7.83s	remaining: 0us
Learning rate set to 0.039968
0:	learn: 0.6895139	total: 18.8ms	remaining: 9.38s
100:	learn: 0.5929121	total: 1.53s	remaining: 6.05s
200:	learn: 0.5642350	total: 3.26s	remaining: 4.84s
300:	learn: 0.5248932	total: 4.85s	remaining: 3.21s
400:	learn: 0.4882168	total: 6.44s	remaining: 1.59s
499:	learn: 0.4564862	total: 8.1s	remaining: 0us
Learning rate set to 0.039968
0:	learn: 0.6888222	total: 26.7ms	remaining: 13.3s
100:	learn: 0.5954403	total: 1.53s	remaining: 6.03s
200:	learn: 0.5649470	total: 3s	remaining: 4.47s
300:	learn: 0.5240601	total: 4.55s	remaining: 3.01s
400:	learn: 0.4860592	total: 6.04s	remaining: 1.49s
499:	learn: 0.4556284	total: 7.53s	rem

GridSearchCV(cv=3, error_score=nan,
             estimator=<catboost.core.CatBoostClassifier object at 0x1276bcac8>,
             iid='deprecated', n_jobs=None, param_grid={},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

# Part 3: Estimating model performance and making predictions

In [30]:
y.value_counts(normalize=True) # Check accuracy of a random guess

True     0.5
False    0.5
Name: editorsSelection, dtype: float64

In [32]:
# This trained only one model, with estimated accuracy of 67% (as seen in column: 'mean_test_score')
pd.DataFrame(model.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
0,8.174456,0.238868,0.104267,0.003121,{},0.668519,0.658889,0.677778,0.668395,0.007712,1


this is good in comparison to a random guess, which would only have been 50% accurate

### Making Predictions

In [33]:
prediction_values = model.predict(X_test)
predictions = pd.Series(prediction_values, X_test.index)

predictions.head()

0     False
10    False
20    False
30    False
40    False
dtype: object