In [1]:
reset -fs

In [2]:
import warnings
warnings.simplefilter('ignore')

In [3]:
from joblib import load
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.ensemble import AdaBoostClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

  from numpy.core.umath_tests import inner1d


In [4]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Seed passed as `random_state` to every train_test_split call in this notebook.
RANDOM_STATE = 28

# Predicting New York Times Section Names

In [29]:
import random

# Print the byline with the authors in a random order; the order is unseeded,
# so it changes on every run (presumably so nobody is always listed first).
names = ['Paul Kim', 'Philip Trinh', 'Nina Hua', 'Jacques Sham', 'Evan Liu']
authors = random.sample(names, k=len(names))
print('By: {}'.format(', '.join(authors)))

By: Paul Kim, Nina Hua, Philip Trinh, Evan Liu, Jacques Sham


Talk about the data that we are working with

# Goal:
Our goal in this project was to predict the section that an article appears under (e.g., Sports, Politics, Media).


Here's a look at some data we were working with:

In [6]:
import numpy as np
import pandas as pd

# `articles` is the path to the April 2018 article metadata; `arts` is the
# loaded frame. The cell ends by previewing its first five rows.
articles = 'Data/ArticlesApril2018.csv'
arts = pd.read_csv(filepath_or_buffer=articles)
arts.head(5)

Unnamed: 0,articleID,articleWordCount,byline,documentType,headline,keywords,multimedia,newDesk,printPage,pubDate,sectionName,snippet,source,typeOfMaterial,webURL
0,5adf6684068401528a2aa69b,781,By JOHN BRANCH,article,Former N.F.L. Cheerleaders’ Settlement Offer: ...,"['Workplace Hazards and Violations', 'Football...",68,Sports,0,2018-04-24 17:16:49,Pro Football,"“I understand that they could meet with us, pa...",The New York Times,News,https://www.nytimes.com/2018/04/24/sports/foot...
1,5adf653f068401528a2aa697,656,By LISA FRIEDMAN,article,E.P.A. to Unveil a New Rule. Its Effect: Less ...,"['Environmental Protection Agency', 'Pruitt, S...",68,Climate,0,2018-04-24 17:11:21,Unknown,The agency plans to publish a new regulation T...,The New York Times,News,https://www.nytimes.com/2018/04/24/climate/epa...
2,5adf4626068401528a2aa628,2427,By PETE WELLS,article,"The New Noma, Explained","['Restaurants', 'Noma (Copenhagen, Restaurant)...",66,Dining,0,2018-04-24 14:58:44,Unknown,What’s it like to eat at the second incarnatio...,The New York Times,News,https://www.nytimes.com/2018/04/24/dining/noma...
3,5adf40d2068401528a2aa619,626,By JULIE HIRSCHFELD DAVIS and PETER BAKER,article,Unknown,"['Macron, Emmanuel (1977- )', 'Trump, Donald J...",68,Washington,0,2018-04-24 14:35:57,Europe,President Trump welcomed President Emmanuel Ma...,The New York Times,News,https://www.nytimes.com/2018/04/24/world/europ...
4,5adf3d64068401528a2aa60f,815,By IAN AUSTEN and DAN BILEFSKY,article,Unknown,"['Toronto, Ontario, Attack (April, 2018)', 'Mu...",68,Foreign,0,2018-04-24 14:21:21,Canada,"Alek Minassian, 25, a resident of Toronto’s Ri...",The New York Times,News,https://www.nytimes.com/2018/04/24/world/canad...


In [7]:
# `comments` is the path to the April 2018 reader comments; `comm` is the
# loaded frame. The cell ends by previewing its first five rows.
comments = 'Data/CommentsApril2018.csv'
comm = pd.read_csv(filepath_or_buffer=comments)

comm.head(5)

Unnamed: 0,approveDate,articleID,articleWordCount,commentBody,commentID,commentSequence,commentTitle,commentType,createDate,depth,...,status,timespeople,trusted,typeOfMaterial,updateDate,userDisplayName,userID,userLocation,userTitle,userURL
0,1524594282,5adf6684068401528a2aa69b,781.0,How could the league possibly refuse this offe...,26853969.0,26853969.0,<br/>,comment,1524594011,1.0,...,approved,1,0,News,1524594282,Christopher Rillo,46566740.0,San Francisco,,
1,1524594252,5adf6684068401528a2aa69b,781.0,"So then the execs can be like ""yeah...we will ...",26853699.0,26853699.0,<br/>,comment,1524593146,1.0,...,approved,1,0,News,1524594252,Matt Brand,64324866.0,"Williamsburg, Brooklyn",,
2,1524594250,5adf6684068401528a2aa69b,781.0,I would not want to play chess against these c...,26853677.0,26853677.0,<br/>,comment,1524593032,1.0,...,approved,1,0,News,1524594250,Joseph,78105093.0,"Fayetteville, AR",,
3,1524593431,5adf6684068401528a2aa69b,781.0,Could the cheerleaders join the Actors' Equity...,26853784.0,26853784.0,<br/>,comment,1524593426,1.0,...,approved,0,0,News,1524593431,Stephen,81939618.0,"Phoenix, AZ",,
4,1524595048,5adf653f068401528a2aa697,656.0,Seeking conclusions which support preconceived...,26854236.0,26854236.0,<br/>,comment,1524595043,1.0,...,approved,1,0,News,1524595048,Paul Zorsky,58642997.0,Texas,,


In [34]:
# Show a bounded preview instead of the whole comments frame: displaying the
# full DataFrame bloats the saved notebook and buries the narrative.
comm.head(10)

Unnamed: 0,approveDate,articleID,articleWordCount,commentBody,commentID,commentSequence,commentTitle,commentType,createDate,depth,...,status,timespeople,trusted,typeOfMaterial,updateDate,userDisplayName,userID,userLocation,userTitle,userURL
0,1524594282,5adf6684068401528a2aa69b,781.0,How could the league possibly refuse this offe...,26853969.0,26853969.0,<br/>,comment,1524594011,1.0,...,approved,1,0,News,1524594282,Christopher Rillo,46566740.0,San Francisco,,
1,1524594252,5adf6684068401528a2aa69b,781.0,"So then the execs can be like ""yeah...we will ...",26853699.0,26853699.0,<br/>,comment,1524593146,1.0,...,approved,1,0,News,1524594252,Matt Brand,64324866.0,"Williamsburg, Brooklyn",,
2,1524594250,5adf6684068401528a2aa69b,781.0,I would not want to play chess against these c...,26853677.0,26853677.0,<br/>,comment,1524593032,1.0,...,approved,1,0,News,1524594250,Joseph,78105093.0,"Fayetteville, AR",,
3,1524593431,5adf6684068401528a2aa69b,781.0,Could the cheerleaders join the Actors' Equity...,26853784.0,26853784.0,<br/>,comment,1524593426,1.0,...,approved,0,0,News,1524593431,Stephen,81939618.0,"Phoenix, AZ",,
4,1524595048,5adf653f068401528a2aa697,656.0,Seeking conclusions which support preconceived...,26854236.0,26854236.0,<br/>,comment,1524595043,1.0,...,approved,1,0,News,1524595048,Paul Zorsky,58642997.0,Texas,,
5,1524594689,5adf653f068401528a2aa697,656.0,Pruitt: First eliminate the scientists and no...,26854140.0,26854140.0,<br/>,comment,1524594684,1.0,...,approved,1,0,News,1524594689,Glennmr,449122.0,Planet Earth,,
6,1524594621,5adf653f068401528a2aa697,656.0,This is a shameful attack on the environment a...,26854122.0,26854122.0,<br/>,comment,1524594612,1.0,...,approved,1,1,News,1524594621,Suzanne Moniz,63591393.0,Providence,,
7,1524594485,5adf653f068401528a2aa697,656.0,“It gives people the opportunity in real time ...,26854086.0,26854086.0,<br/>,comment,1524594482,1.0,...,approved,1,0,News,1524594485,NYC Physician,67346507.0,Manhattan,,
8,1524593688,5adf653f068401528a2aa697,656.0,Given recent revelations about Mr. Pruitt's mo...,26853868.0,26853868.0,<br/>,comment,1524593682,1.0,...,approved,0,0,News,1524593688,EBD,73902638.0,"Aiken, SC",,
9,1524594585,5adf4626068401528a2aa628,2427.0,I loved this review without wanting to actuall...,26854114.0,26854114.0,<br/>,comment,1524594583,1.0,...,approved,1,0,News,1524594585,SLCmama,21817156.0,Los Angeles,,


For pre-processing, we removed rows whose section name was 'Unknown' and kept only rows whose comment type was 'comment' (dropping 'userReply' rows).

These were the initial efforts to minimize noise - later, we implemented pre-processing to handle 'Unknown's. More on this later.

# Modeling Methodology

1. Try modeling using 1 month of data (the datasets are large), select the best model, then train on 9 months of data.
2. Use commentBody to predict the section name.
3. Use newDesk and typeOfMaterial to predict.
4. Pre-processing: Removed the 'Unknown' sections and the 'userReply' comment types.
5. Performed Grid Search on all models.

Q: Train-test split on the 9 months?

The models that we tried:
1. Perceptron
2. Naive Bayes
3. SVM
4. Boosting with all of the above

Q: What kind of vectorizers? (Count and TF-IDF)

# Let's take a look at some models!

Perceptron: a linear classifier (binary in its basic form). It computes a weighted sum of the inputs and feeds it into an activation function, which maps that sum to the predicted class.

In [30]:
# Comments File
comments_file = 'Data/CommentsApril2018.csv'

# Read in File
comments = pd.read_csv(comments_file)

# Keep rows whose commentType is "comment" and whose article has a section
# name other than "Unknown".
# NOTE(review): unlike the full-data cell, missing (NaN) section names are not
# excluded here - confirm the one-month file has none.
has_section = comments.sectionName != "Unknown"
is_comment = comments.commentType == "comment"
filt = has_section & is_comment
data = comments.loc[filt, 'commentBody']
sections = comments.loc[filt, 'sectionName']

# Train/Test Split (default split proportions, fixed seed)
train_data, test_data, train_target, test_target = train_test_split(
    data,
    sections,
    random_state=RANDOM_STATE,
)

Question on methodology?

## Perceptron

In [9]:
# Load the perceptron model saved with joblib (presumably a fitted Pipeline with a
# 'clf' step - TODO confirm). joblib.load unpickles, so only load trusted files.
per_model = load('Models/perceptron_small.joblib') 

In [10]:
# Score the loaded model on the held-out split from the cell above.
# NOTE(review): only a fair estimate if the model was trained on the matching
# train split (same filter and RANDOM_STATE) - confirm.
per_model.score(test_data, test_target)

0.7141440682167468

In [12]:
# TF-IDF over word tokens with English stop words removed; undecodable bytes
# are skipped rather than raising.
vectorizer = TfidfVectorizer(input='content',
                            analyzer='word',
                            stop_words='english',
                            decode_error='ignore')

# Boost the classifier taken from the loaded pipeline's 'clf' step.
# random_state is fixed so the boosted score is reproducible across runs,
# matching the seeded train/test split.
booster_per_model = Pipeline([('vec', vectorizer),
                    ('boo', AdaBoostClassifier(per_model.get_params()['clf'],
                                               algorithm='SAMME',
                                               random_state=RANDOM_STATE))])

booster_per_model.fit(train_data, train_target)

# Accuracy of the boosted pipeline on the held-out split.
booster_per_model.score(test_data, test_target)

0.5282831788210798

## Naive Bayes

In [13]:
# Load the naive Bayes model saved with joblib (presumably a fitted Pipeline with a
# 'clf' step - TODO confirm). joblib.load unpickles, so only load trusted files.
nb_model = load('Models/naive_bayes_small.joblib') 

In [14]:
# Score the loaded naive Bayes model on the same held-out split.
nb_model.score(test_data, test_target)

0.7232164989341133

In [15]:
# Raw token counts (not TF-IDF) with English stop words removed.
vectorizer = CountVectorizer(stop_words='english', decode_error='ignore')

# Boost the classifier taken from the loaded pipeline's 'clf' step.
# NOTE(review): no `algorithm` argument is passed here, unlike the other
# boosted pipelines in this notebook - confirm that is intended.
boosted_nb = AdaBoostClassifier(nb_model.get_params()['clf'])
booster_NB_model = Pipeline(steps=[('vec', vectorizer), ('boo', boosted_nb)])

booster_NB_model.fit(train_data, train_target)

# Accuracy of the boosted pipeline on the held-out split.
booster_NB_model.score(test_data, test_target)

0.5712161023251202

## SVM

In [16]:
# Load the SVM model saved with joblib (presumably a fitted Pipeline with a
# 'clf' step - TODO confirm). joblib.load unpickles, so only load trusted files.
lsvc_model = load('Models/svm_small.joblib')

In [17]:
# Score the loaded SVM model on the same held-out split.
lsvc_model.score(test_data, test_target)

0.769718903376134

In [18]:
# TF-IDF over word tokens with English stop words removed; undecodable bytes
# are skipped rather than raising.
vectorizer = TfidfVectorizer(input='content',
                            analyzer='word',
                            stop_words='english',
                            decode_error='ignore')

# Boost the classifier taken from the loaded pipeline's 'clf' step.
# random_state is fixed so the boosted score is reproducible across runs,
# matching the seeded train/test split.
booster_LSVC_model = Pipeline([('vec', vectorizer),
                               ('boo', AdaBoostClassifier(lsvc_model.get_params()['clf'],
                                                          algorithm='SAMME',
                                                          random_state=RANDOM_STATE))])

booster_LSVC_model.fit(train_data, train_target)

# Accuracy of the boosted pipeline on the held-out split.
booster_LSVC_model.score(test_data, test_target)

0.7625799415001735

Q: Is there a reason we chose Count vs. TF-IDF vectorizers for the different models?

## Which model performed the best?

### Yeah, SVM!
Let's take a look at how it performs on all the comment data.

In [23]:
# Full Comments File
comments_file = 'Data/combined_comments.csv'

# Read in File, discarding the 'Unnamed: 0' column
comments = pd.read_csv(comments_file).drop(columns='Unnamed: 0')

# Keep rows that are comments and whose section name is neither missing
# nor "Unknown"
filt = (
    comments.sectionName.notna()
    & (comments.sectionName != "Unknown")
    & (comments.commentType == "comment")
)
data = comments.loc[filt, 'commentBody']
sections = comments.loc[filt, 'sectionName']

# Train/Test Split (default split proportions, fixed seed)
train_data, test_data, train_target, test_target = train_test_split(
    data,
    sections,
    random_state=RANDOM_STATE,
)

In [24]:
# Load the SVM model saved for the full comment data (presumably a fitted Pipeline
# with a 'clf' step - TODO confirm). joblib.load unpickles, so only load trusted files.
lsvc_full_model = load('Models/svm_full.joblib')

In [25]:
# Score the loaded full-data SVM on the held-out split of the combined comments.
# NOTE(review): only a fair estimate if the model was trained on the matching
# train split (same filter and RANDOM_STATE) - confirm.
lsvc_full_model.score(test_data, test_target)

0.7364559223902017

In [26]:
# TF-IDF over word tokens with English stop words removed; undecodable bytes
# are skipped rather than raising.
vectorizer = TfidfVectorizer(input='content',
                            analyzer='word',
                            stop_words='english',
                            decode_error='ignore')

# Boost the classifier taken from the loaded full-data pipeline's 'clf' step.
# random_state is fixed so the boosted score is reproducible across runs,
# matching the seeded train/test split.
booster_LSVC_full_model = Pipeline([('vec', vectorizer),
                               ('boo', AdaBoostClassifier(lsvc_full_model.get_params()['clf'],
                                                          algorithm='SAMME',
                                                          random_state=RANDOM_STATE))])

booster_LSVC_full_model.fit(train_data, train_target)

# Accuracy of the boosted pipeline on the held-out split.
booster_LSVC_full_model.score(test_data, test_target)

0.7234060514819166

# SVM cont., the best model trained on newDesk + material_label

why assume SVM will be best for newDesk + material_label as well?

In [None]:
# Full File
everything = 'Data/combined_everything.csv'

# Read in File
full_data = pd.read_csv(everything)

# Keep comment rows whose section name is not "Unknown". reset_index() renumbers
# the rows from 0 and keeps the previous row labels in a new 'index' column.
filt = (full_data.commentType == "comment") & (full_data.sectionName != "Unknown")
full_filt = full_data.loc[filt].copy().reset_index()

In [None]:
# Build one-hot features for `newDesk` and `typeOfMaterial`, then assemble the
# model inputs (`data`) and targets (`sections`).

# Map each newDesk category to an integer label and store it on the frame
newDesk_le = LabelEncoder()
newDesk_labels = newDesk_le.fit_transform(full_filt['newDesk'])
full_filt['newDesk_label'] = newDesk_labels

# One-hot encode the integer labels; columns are named after the encoder's classes
# (assumes the one-hot column order matches newDesk_le.classes_ - TODO confirm)
newDesk_ohe = OneHotEncoder()
newDesk_feature_arr = newDesk_ohe.fit_transform(full_filt[['newDesk_label']]).toarray()
newDesk_feature_labels = list(newDesk_le.classes_)
newDesk_features = pd.DataFrame(newDesk_feature_arr, columns=newDesk_feature_labels)

# Map each typeOfMaterial category to an integer label and store it on the frame
material_le = LabelEncoder()
material_labels = material_le.fit_transform(full_filt['typeOfMaterial'])
full_filt['material_label'] = material_labels

# One-hot encode typeOfMaterial the same way
material_ohe = OneHotEncoder()
material_feature_arr = material_ohe.fit_transform(full_filt[['material_label']]).toarray()
material_feature_labels = list(material_le.classes_)
material_features = pd.DataFrame(material_feature_arr, columns=material_feature_labels)

# Subsetting comment data frame and concatenating with new features.
# The column-wise concat aligns on the row index; the feature frames use the
# default 0..n-1 index, so full_filt must also be indexed 0..n-1.
# NOTE(review): if a category name occurs in both newDesk and typeOfMaterial,
# full_df_ohe ends up with duplicate column names - confirm this cannot happen.
full_filt_sub = full_filt.loc[:,['commentBody', 'sectionName']]
full_df_ohe = pd.concat([full_filt_sub, newDesk_features, material_features], axis=1)

# Split to Data and Labels
# NOTE(review): only the newDesk one-hot columns are selected as features; the
# typeOfMaterial columns are concatenated above but not used here - confirm
# this matches what the saved model was trained on.
data = full_df_ohe.loc[:,newDesk_feature_labels]
sections = full_df_ohe.sectionName

### What about the 'Unknown' section name?

In [None]:
# Train/Test Split
train_data_no_unknown, test_data_no_unknown, train_target_no_unknown, test_target_no_unknown = train_test_split(data, sections, random_state=RANDOM_STATE)
train_target_no_unknown = train_target_no_unknown.astype(str)
test_target_no_unknown = test_target_no_unknown.astype(str)

In [None]:
# Load the saved model for the one-hot encoded features (presumably a linear SVM,
# going by the file name - TODO confirm). joblib.load unpickles, so only load trusted files.
lsvc_ohe_full_model = load('Models/ohe_linearsvm_full.joblib')

In [None]:
# Score the loaded model on the held-out one-hot features.
# NOTE(review): the feature columns must match the layout the model was
# trained on - confirm.
lsvc_ohe_full_model.score(test_data_no_unknown, test_target_no_unknown)

LDA (Latent Dirichlet Allocation) is an unsupervised learning technique that is often used in topic modeling; it was introduced by David Blei, Andrew Ng, and Michael Jordan in 2003.
Q: Did we use LDA as the 'ground truth'?
Q: What does the prediction_results table tell us?
Didn't quite work

In [None]:
# Load and display the saved prediction results (contents not visible here;
# presumably a table of predictions - TODO confirm).
# joblib.load unpickles, so only load trusted files.
prediction_results = load('Models/prediction_results.joblib')
prediction_results

Accuracy over # of data points

In [None]:
# Read the accuracy measurements, then summarise 'acc' as mean and standard
# deviation for each value of 'size'. `data` is the summary with 'size'
# moved back from the index into a regular column.
acc = pd.read_csv('Data/acc_datasize1.csv').drop(columns='Unnamed: 0')
df_summ = acc.groupby('size').agg({'acc': ['mean', 'std']})
data = pd.DataFrame(df_summ).reset_index()


In [None]:
# Mean accuracy against data size, with a shaded band of +/- one standard
# deviation. Uses an explicit Axes and labels both axes so the figure can be
# read on its own.
fig, ax = plt.subplots()
mean_acc = data.acc['mean']
std_acc = data.acc['std']
ax.plot(data['size'], mean_acc, color='black')
ax.fill_between(data['size'], mean_acc + std_acc, mean_acc - std_acc, alpha=.5)
ax.set_title('mean of accuracy over data size')
ax.set_xlabel('number of data points')
ax.set_ylabel('accuracy')
ax.margins(x=0, y=-.025)  # zoom
# The explicit y-limits below take precedence over the y margin set above;
# the trailing semicolon suppresses the returned limits in the cell output.
ax.set_ylim(.56, .72);

what was this?

# Conclusion

1. SVM was the best model (Why?)
2. Comments did end up being the best attribute for predicting section name
3. The big problem: Unknowns
4. And next time...
    Combine vectorized data for comments with other features
    Unsupervised learning on all features to see clusters
    PCA transformation on features
    Try to classify those unknowns