In [22]:
import pandas as pd # Library to load tabular data
# Show up to 50 columns when displaying a DataFrame. The fully-qualified key is
# used because the short form 'max_columns' matches more than one option in
# recent pandas versions and is rejected as ambiguous.
pd.set_option('display.max_columns', 50)

# Loading the data

Note: here I'm pretending that 10% of the data is unlabeled (every 10th row is held out, with its `editorsSelection` label removed).

In [23]:
# Read the full comments table straight from GitHub
all_data = pd.read_csv('https://raw.githubusercontent.com/mkleinbort/resource-datasets/master/ny_times_comments/CommentsApril2017_9000_interesting_columns.csv')

# Rows whose index is a multiple of 10 play the role of the "unlabeled" data
is_held_out = all_data.index % 10 == 0

data_train = all_data.loc[~is_held_out]
# The held-out rows lose their label column, as if it had never been observed
data_test = all_data.loc[is_held_out].drop(columns=['editorsSelection'])

In [24]:
# (rows, columns) of the training split, which still contains the label column
data_train.shape

(8100, 22)

In [25]:
# (rows, columns) of the held-out split; one column fewer because 'editorsSelection' was dropped
data_test.shape

(900, 21)

# What do we want to learn?

In [26]:
# Target to predict: did the NY Times editor choose to feature the comment?
y = data_train['editorsSelection']

# What do we have available to make this prediction?

Here we need to think a little, and figure out what will be available at the time when we want to make our prediction. We have many features including:
- approveDate
- commentBody
- commentType
- createDate
- parentUserDisplayName
- replyCount
- sharing
- timespeople
- trusted
- updateDate
- userDisplayName
- userLocation
- inReplyTo
- sectionName
- newDesk
- articleWordCount
- printPage
- typeOfMaterial


But many of these features are only available AFTER the prediction would have been useful. For example, is it useful to predict whether a comment will be featured by the editor if we MUST know ahead of time how many people will reply to the comment?

For right now, let's assume we only know:
- The articleWordCount
- The typeOfMaterial
- The commentBody
- The commentType

# Model Part 1: Prepare the data

Somehow we need to turn X into a table that contains only numbers. However, X is far from being numeric right now.

In [32]:
# Columns we assume are already known at the moment a prediction is needed
FEATURES = ['articleWordCount','typeOfMaterial','commentType', 'commentBody']

X = data_train[FEATURES] # Keep every row, restricted to the feature columns

X.head()

Unnamed: 0,articleWordCount,typeOfMaterial,commentType,commentBody
1,828.0,Op-Ed,comment,Mining for coal and oil have caused untold dis...
2,648.0,News,comment,Congratulations NYT for excellent work for you...
3,1226.0,News,comment,"If you want to beat Facebook, just provide vid..."
4,4611.0,News,comment,will trump turn out to be the American Right's...
5,1358.0,Op-Ed,comment,King was not the happy warrior that Common Wis...


We need to make each row into only numbers. Here is one approach:

In [40]:
# Note: This is very slow code, and not "The way to do it", but it's easy to read

def turn_row_into_numbers(row):
    """Convert one row of raw features into a purely numeric pandas Series.

    Parameters
    ----------
    row : mapping-like (e.g. a pandas Series)
        Must provide 'articleWordCount', 'commentBody', 'typeOfMaterial'
        and 'commentType'.

    Returns
    -------
    pandas.Series with
        - 'articleWordCount': copied through unchanged,
        - 'commentWordCount': number of whitespace-separated tokens in
          'commentBody' (0 when the body is missing),
        - 'typeOfMaterial: <value>' and 'commentType: <value>': one-hot
          markers set to 1 for the category found in this row. A missing
          category sets no marker at all.
    """
    answer = {} # Start a new dictionary

    answer['articleWordCount'] = row['articleWordCount']

    # A missing body is read by pandas as NaN (a float), which has no .split()
    body = row['commentBody']
    answer['commentWordCount'] = 0 if pd.isna(body) else len(body.split())

    # One-hot encode the categorical columns; skip missing values so that we
    # neither crash on str + float nor create a bogus "...: nan" column
    for column in ('typeOfMaterial', 'commentType'):
        value = row[column]
        if pd.isna(value):
            continue
        answer[column + ': ' + value] = 1

    return pd.Series(answer) # This will make the full output a dataframe

X_train = (X
 .apply(turn_row_into_numbers, axis=1)  # Axis=1 means "apply this function to each row"
 .fillna(0) # Each row only sets its own category columns, so every other category column comes out as NaN; fill those with 0
)

In [41]:
# Peek at the numeric training table built in the previous cell
X_train.head()

Unnamed: 0,articleWordCount,commentType: comment,commentType: reporterReply,commentType: userReply,commentWordCount,typeOfMaterial: Blog,typeOfMaterial: Brief,typeOfMaterial: Editorial,typeOfMaterial: News,typeOfMaterial: News Analysis,typeOfMaterial: Obituary (Obit),typeOfMaterial: Op-Ed,typeOfMaterial: Review,typeOfMaterial: briefing
1,828.0,1.0,0.0,0.0,50.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,648.0,1.0,0.0,0.0,149.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,1226.0,1.0,0.0,0.0,16.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,4611.0,1.0,0.0,0.0,27.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
5,1358.0,1.0,0.0,0.0,22.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


# Model Part 1 - Continued: Prepare the test data in exactly the same way

It's very easy for things to go wrong here. As a rule of thumb, most hard-to-debug issues come from the test data being processed slightly differently from the training data.

In [45]:
# Again, this is not "good" code, just showing how it could be done

X_test = (data_test
             .loc[:, FEATURES] # Select the same features
             .apply(turn_row_into_numbers, axis=1)  # Apply the same transformations
             # reindex (rather than .loc) so that a category seen in training but
             # absent from the test rows becomes an all-NaN column instead of
             # raising a KeyError; categories seen only in the test rows are dropped
             .reindex(columns=X_train.columns) # Keep the same columns, in the same order
             .fillna(0) # Fill in missing numbers the same way
         )

X_test.head()

Unnamed: 0,articleWordCount,commentType: comment,commentType: reporterReply,commentType: userReply,commentWordCount,typeOfMaterial: Blog,typeOfMaterial: Brief,typeOfMaterial: Editorial,typeOfMaterial: News,typeOfMaterial: News Analysis,typeOfMaterial: Obituary (Obit),typeOfMaterial: Op-Ed,typeOfMaterial: Review,typeOfMaterial: briefing
0,1230.0,0.0,0.0,1.0,16.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
10,1418.0,0.0,0.0,1.0,26.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
20,1250.0,1.0,0.0,0.0,16.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
30,3770.0,0.0,0.0,1.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
40,1376.0,1.0,0.0,0.0,6.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


# Model Part 2: Building and training an ML model

In [46]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# Fix the seed so the forest, and therefore the cross-validated scores, are reproducible
RANDOM_STATE = 42

# param_grid is empty, so the grid search fits a single configuration; it is used
# here only as a convenient way to get a 5-fold cross-validated score.
model = GridSearchCV(
    RandomForestClassifier(n_estimators=200, random_state=RANDOM_STATE),
    param_grid={},
    cv=5,
)

model.fit(X_train, y)

GridSearchCV(cv=5, error_score=nan,
             estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                              class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              max_samples=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=200, n_jobs=None,
                                              oob_score=False,
                                              rando

# Model Part 3: Estimating model performance and making predictions

In [51]:
# Class balance of the target: the share of the larger class is the accuracy
# that always guessing that class would achieve (the baseline to beat)
y.value_counts(normalize=True)

True     0.5
False    0.5
Name: editorsSelection, dtype: float64

In [52]:
# The parameter grid was empty, so only one model configuration was trained;
# its estimated accuracy is about 66% (see column 'mean_test_score', the mean over the CV folds)
pd.DataFrame(model.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,1.479813,0.070108,0.092845,0.007396,{},0.662346,0.656173,0.671605,0.666049,0.640741,0.659383,0.010587,1


This is good in comparison to a random guess, which would only have been 50% accurate.

### Making Predictions

In [55]:
# Predict a label for every held-out row
prediction_values = model.predict(X_test)
# Re-attach the original row labels so each prediction can be traced back to its comment
predictions = pd.Series(data=prediction_values, index=X_test.index)

predictions.head()

0     False
10    False
20    False
30    False
40    False
dtype: bool