# Visualization of Articles

In [13]:
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt
import plotly as plty

import missingno as msno

from wordcloud import WordCloud
from pandas_profiling import ProfileReport


# Load Either Full Data or Sample

In [18]:
# Choose between the small sample pickles and the full datasets.
# `load_sample` is not assigned in any visible cell of this notebook, so a fresh
# kernel would raise NameError here. Fall back to the sample only when no
# earlier cell has set the flag; an existing value is left untouched.
# NOTE(review): defaulting to the sample is an assumption - confirm.
try:
    load_sample
except NameError:
    load_sample = True

# Every processed pickle is read from the same directory; keep it in one place.
# NOTE(review): the last recorded run of this cell failed with
# FileNotFoundError for './/data//processed//df_test.pkl', while the report
# cells write to '..//reports//...'. Check whether this directory should also
# be relative to the parent folder.
DATA_DIR = './/data//processed//'

# Sample files carry a '_sample' suffix on the file stem; full files carry none.
file_suffix = '_sample' if load_sample else ''

# Tag naming the variant that was loaded ('_sample' or '_full').
EDA_note = '_sample' if load_sample else '_full'

# NOTE: unpickling can execute arbitrary code; only load pickles that were
# produced by this project.
df_test = pd.read_pickle(f'{DATA_DIR}df_test{file_suffix}.pkl')
df_train = pd.read_pickle(f'{DATA_DIR}df_train{file_suffix}.pkl')
df_comments = pd.read_pickle(f'{DATA_DIR}df_comments{file_suffix}.pkl')
df_articles = pd.read_pickle(f'{DATA_DIR}df_articles_verbose{file_suffix}.pkl')


# Drop columns that carry no information: the first four are constant and
# userTitle is 99% empty. Reassign rather than mutate in place so the cleaned
# frame is an explicit result of this step.
df_comments = df_comments.drop(
    columns=['status', 'trusted', 'recommendedFlag', 'isAnonymous', 'userTitle']
)


# Parse the date columns into pandas datetimes: one on the articles frame,
# three on the comments frame.
df_articles['pub_date'] = pd.to_datetime(df_articles['pub_date'])

for date_col in ('createDate', 'updateDate', 'approveDate'):
    df_comments[date_col] = pd.to_datetime(df_comments[date_col])


FileNotFoundError: [Errno 2] No such file or directory: './/data//processed//df_test.pkl'

# Generate Pandas Profiles for EDA

In [7]:
# Build one profiling report per dataframe. The three report objects keep
# their own names so they can still be inspected afterwards.
profile_test, profile_train, profile_articles = [
    ProfileReport(frame, title=report_title)
    for frame, report_title in (
        (df_test, "New York Times - Test Dataframe"),
        (df_train, "New York Times - Train Dataframe"),
        (df_articles, "New York Times - Articles Dataframe"),
    )
]

# Write each report out as an HTML file, in the same order they were built.
for report, html_path in (
    (profile_test, '..//reports//figures//EDA_Test.html'),
    (profile_train, '..//reports//figures//EDA_Train.html'),
    (profile_articles, '..//reports//figures//EDA_Articles.html'),
):
    report.to_file(html_path)


Summarize dataset: 100%|██████████| 30/30 [00:06<00:00,  4.73it/s, Completed]                     
Generate report structure: 100%|██████████| 1/1 [00:05<00:00,  5.31s/it]
Render HTML: 100%|██████████| 1/1 [00:00<00:00,  1.06it/s]
Export report to file: 100%|██████████| 1/1 [00:00<00:00,  9.41it/s]
Summarize dataset: 100%|██████████| 37/37 [00:04<00:00,  8.74it/s, Completed]                     
Generate report structure: 100%|██████████| 1/1 [00:04<00:00,  4.78s/it]
Render HTML: 100%|██████████| 1/1 [00:01<00:00,  1.24s/it]
Export report to file: 100%|██████████| 1/1 [00:00<00:00, 21.82it/s]
Summarize dataset: 100%|██████████| 51/51 [00:06<00:00,  7.62it/s, Completed]                            
Generate report structure: 100%|██████████| 1/1 [00:05<00:00,  5.74s/it]
Render HTML: 100%|██████████| 1/1 [00:02<00:00,  2.16s/it]
Export report to file: 100%|██████████| 1/1 [00:00<00:00, 28.75it/s]


In [8]:
# The comments frame is LARGE even though it is only a sample, so the report
# is requested with minimal=True before being written to HTML.
profile_comments = ProfileReport(
    df_comments,
    title="New York Times - Comments Dataframe",
    minimal=True,
)
profile_comments.to_file('..//reports//figures//EDA_Comments.html')

Summarize dataset: 100%|██████████| 25/25 [00:01<00:00, 13.02it/s, Completed]                              
Generate report structure: 100%|██████████| 1/1 [00:05<00:00,  5.33s/it]
Render HTML: 100%|██████████| 1/1 [00:00<00:00,  1.21it/s]
Export report to file: 100%|██████████| 1/1 [00:00<00:00, 15.18it/s]


In [10]:
# Show the first article transposed, so each column reads as its own row.
df_articles.iloc[:1].T

Unnamed: 0,3608
newsdesk,Business
section,Business Day
subsection,Economy
material,News
headline,"Layoffs Are Just Starting, and the Forecasts A..."
abstract,Shutdowns in the U.S. retail and hospitality b...
keywords,"['Coronavirus (2019-nCoV)', 'Layoffs and Job R..."
word_count,1474
pub_date,2020-03-17 22:28:06+00:00
n_comments,441


In [11]:
# Column names, non-null counts and dtypes of the articles frame.
df_articles.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 839 entries, 3608 to 5454
Data columns (total 19 columns):
 #   Column               Non-Null Count  Dtype              
---  ------               --------------  -----              
 0   newsdesk             839 non-null    object             
 1   section              839 non-null    object             
 2   subsection           310 non-null    object             
 3   material             839 non-null    object             
 4   headline             839 non-null    object             
 5   abstract             838 non-null    object             
 6   keywords             839 non-null    object             
 7   word_count           839 non-null    int64              
 8   pub_date             839 non-null    datetime64[ns, UTC]
 9   n_comments           839 non-null    int64              
 10  uniqueID             839 non-null    object             
 11  uri                  839 non-null    object             
 12  print_section     

In [None]:
# Missing-value matrix (missingno) for the test frame.
msno.matrix(df_test)

In [None]:
# Column names, non-null counts and dtypes of the train frame.
df_train.info()

In [None]:
# Missing-value matrix (missingno) for the train frame.
msno.matrix(df_train)

In [None]:
# Column summary of the articles frame again (same call as the earlier
# df_articles.info() cell), placed next to its missing-value matrix.
df_articles.info()

In [None]:
# Missing-value matrix (missingno) for the articles frame.
msno.matrix(df_articles)

In [None]:
# Seaborn pairplot of the articles frame to eyeball pairwise relationships.
sns.pairplot(df_articles)

In [None]:
# First article row, shown in full for a closer look at its fields.
df_articles.iloc[0]

# Comments

In [None]:
# Profiling report for the comments frame (requested without minimal=True),
# exported as an HTML file.
profile = ProfileReport(
    df_comments,
    title="New York Times Comments",
)
profile.to_file('..//reports//figures//pp_Comments.html')

In [None]:
# Column names, non-null counts and dtypes of the comments frame.
df_comments.info()

In [None]:
# 'status' is one of the constant columns removed right after loading, so a
# top-to-bottom run would raise KeyError here. Only inspect it when the column
# is still present; otherwise the cell shows nothing.
df_comments['status'].unique() if 'status' in df_comments.columns else None

In [None]:
# Number of distinct commentID values (missing values counted as one value).
df_comments['commentID'].nunique(dropna=False)

In [None]:
# Number of distinct commentSequence values (missing values counted as one value).
df_comments['commentSequence'].nunique(dropna=False)

In [None]:
# Number of distinct userID values (missing values counted as one value).
df_comments['userID'].nunique(dropna=False)

There are repeat commenters: some userID values appear on more than one comment.

In [None]:
# NOTE(review): the original cell indexed a column named 'value_', which looks
# like a truncated `value_counts` call, not a real column. Given the
# surrounding userID cells, this presumably meant to count comments per user -
# confirm. Only the ten most frequent users are shown.
df_comments['userID'].value_counts().head(10)

In [None]:
# Missing-value matrix (missingno) for the comments frame.
msno.matrix(df_comments)

In [None]:
# A pairplot of these comment columns was inconsequential, so it is left disabled.
#sns.pairplot(df_comments[['recommendations','replyCount','depth','trusted','recommendedFlag']])

In [None]:
# Inspect a single comment row (position 15) with all of its fields.
df_comments.iloc[15]

In [None]:
# Select the row(s) whose commentID equals this specific ID.
df_comments.loc[df_comments['commentID'] == 104388469]

In [None]:
# Text of the comment at position 15.
df_comments['commentBody'].iloc[15]

In [None]:
# Build the word cloud from the text of the comment at position 15 BEFORE
# drawing it; the original order referenced `wordcloud` before it existed,
# which raises NameError on a fresh kernel.
wordcloud = WordCloud().generate(df_comments.iloc[15]['commentBody'])

# Open the figure first, then render the cloud into it without axes.
plt.figure()
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off');

# Comment Investigation

To Investigate
-   Comments
    - commentType: one of three categories (comment, userReply or reporterReply)
    - commentBody: this is the comment text; there is no classification of the article here
        - IDEA : Sentiment Analysis here
    - userID : There are some repeats here
    - recommendations :
    - replyCount : 
    - editorsSelection : Almost entirely False
    - depth
- Model Ideas - Text Classification
    - Naive Bayes for classifier
    - Logistic Regression
    - SVM
    - Sentiment Analysis
    - Keyphrase extraction
- Summarizer - Get the gist of the comment