In [1]:
import pandas as pd
import sklearn
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from wordcloud import WordCloud,STOPWORDS

In [2]:
nyt = pd.read_csv('nyt_ftpg_1996_2006.csv', encoding='ISO-8859-1')

In [3]:
nyt.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31034 entries, 0 to 31033
Data columns (total 18 columns):
Article_ID                31034 non-null int64
Date                      31034 non-null object
Article_Sequence          31034 non-null object
Title                     31034 non-null object
Summary                   31034 non-null object
Topic_6digit              31034 non-null int64
Topic_4digit              31034 non-null int64
Topic_2digit              31034 non-null int64
War on Terror             31034 non-null int64
Katrina                   31034 non-null int64
Israel/Palestine          31034 non-null int64
Immigration               31034 non-null int64
Presidential Elections    31034 non-null int64
Clinton Impeachment       31034 non-null int64
Enron                     31034 non-null int64
Darfur                    31034 non-null int64
Race/Ethnicity            31034 non-null int64
Schiavo                   31034 non-null int64
dtypes: int64(14), object(4)
memory usage

In [4]:
nyt.head()

Unnamed: 0,Article_ID,Date,Article_Sequence,Title,Summary,Topic_6digit,Topic_4digit,Topic_2digit,War on Terror,Katrina,Israel/Palestine,Immigration,Presidential Elections,Clinton Impeachment,Enron,Darfur,Race/Ethnicity,Schiavo
0,1,1/1/1996,a,Nation's Smaller Jails Struggle To Cope With S...,Jails overwhelmed with hardened criminals,120500,1205,12,0,0,0,0,0,0,0,0,0,0
1,2,1/1/1996,b,Dancing (and Kissing) In the New Year,new years activities,280000,2800,28,0,0,0,0,0,0,0,0,0,0
2,3,1/1/1996,c,Forbes's Silver Bullet for the Nation's Malaise,Steve Forbes running for President,201201,2012,20,0,0,0,0,1,0,0,0,0,0
3,4,1/1/1996,d,"Up at Last, Bridge to Bosnia Is Swaying Gatewa...",U.S. military constructs bridge to help their ...,160200,1602,16,0,0,0,0,0,0,0,0,0,0
4,5,1/1/1996,e,2 SIDES IN SENATE DISAGREE ON PLAN TO END FURL...,Democrats and Republicans can't agree on plan ...,201206,2012,20,0,0,0,0,0,0,0,0,0,0


In [5]:
from nltk.corpus import stopwords
import string

In [6]:
title1 = nyt['Title'][0]

In [7]:
title1

"Nation's Smaller Jails Struggle To Cope With Surge in Inmates "

In [8]:
def text_process(mess):
    """
    1. remove punc
    2. remove stop words
    3. return list of clean text words
    """

    nopunc = [char for char in mess if char not in string.punctuation]
    
    nopunc = ''.join(nopunc)

    return [word for word in nopunc.split() if word.lower() not in stopwords.words('english')]

In [9]:
text_process(title1)

['Nations', 'Smaller', 'Jails', 'Struggle', 'Cope', 'Surge', 'Inmates']

In [10]:
nyt_processed = nyt[['Title','Topic_2digit']]

In [11]:
nyt_processed.head()

Unnamed: 0,Title,Topic_2digit
0,Nation's Smaller Jails Struggle To Cope With S...,12
1,Dancing (and Kissing) In the New Year,28
2,Forbes's Silver Bullet for the Nation's Malaise,20
3,"Up at Last, Bridge to Bosnia Is Swaying Gatewa...",16
4,2 SIDES IN SENATE DISAGREE ON PLAN TO END FURL...,20


In [12]:
nyt_processed['Title'].head().apply(text_process)

0    [Nations, Smaller, Jails, Struggle, Cope, Surg...
1                        [Dancing, Kissing, New, Year]
2          [Forbess, Silver, Bullet, Nations, Malaise]
3        [Last, Bridge, Bosnia, Swaying, Gateway, GIs]
4    [2, SIDES, SENATE, DISAGREE, PLAN, END, FURLOU...
Name: Title, dtype: object

In [14]:
from sklearn.feature_extraction.text import CountVectorizer

In [15]:
bow_transformer = CountVectorizer(analyzer=text_process).fit(nyt_processed['Title'])

In [16]:
print(len(bow_transformer.vocabulary_))

25226


In [17]:
nyt_bow = bow_transformer.transform(nyt_processed['Title'])

In [18]:
print('Shape of Sparse Matrix: ', nyt_bow.shape)

Shape of Sparse Matrix:  (31034, 25226)


In [19]:
nyt_bow.nnz

184136

In [20]:
sparsity = (100.0 * nyt_bow.nnz / (nyt_bow.shape[0] * nyt_bow.shape[1]))
print('sparsity: {}'.format((sparsity)))

sparsity: 0.023520825370154735


In [21]:
from sklearn.feature_extraction.text import TfidfTransformer

In [22]:
tfidf_transformer = TfidfTransformer().fit(nyt_bow)

In [23]:
nyt_tfidf = tfidf_transformer.transform(nyt_bow)

In [24]:
from sklearn.naive_bayes import MultinomialNB

In [25]:
nyt_model = MultinomialNB().fit(nyt_tfidf,nyt_processed['Topic_2digit'])

In [26]:
all_pred = nyt_model.predict(nyt_tfidf)

In [27]:
from sklearn.cross_validation import train_test_split



In [28]:
nyt_train,nyt_test,topic_train,topic_test = train_test_split(nyt_processed['Title'], nyt_processed['Topic_2digit'], test_size=0.3)

In [29]:
from sklearn.pipeline import Pipeline

In [30]:
from sklearn.ensemble import RandomForestClassifier

In [31]:
pipeline = Pipeline([
    ('bow',CountVectorizer(analyzer=text_process)),
    ('tfidf', TfidfTransformer()),
    ('classifier', MultinomialNB())
])

In [32]:
pipeline.fit(nyt_train, topic_train)

Pipeline(memory=None,
     steps=[('bow', CountVectorizer(analyzer=<function text_process at 0x000000000D542A60>,
        binary=False, decode_error='strict', dtype=<class 'numpy.int64'>,
        encoding='utf-8', input='content', lowercase=True, max_df=1.0,
        max_features=None, min_df=1, ngram_range=(1, 1), preprocesso...f=False, use_idf=True)), ('classifier', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

In [36]:
predictions = pipeline.predict(nyt_test)

In [50]:
a = pd.Series(predictions)
    

In [52]:
a.value_counts()

19    5192
16    1807
20    1699
3      193
12     162
29     146
1       38
15      37
6       19
28       4
2        4
17       4
24       4
10       2
dtype: int64

In [34]:
from sklearn.metrics import classification_report

In [35]:
print(classification_report(topic_test,predictions))

             precision    recall  f1-score   support

          1       0.92      0.12      0.21       301
          2       0.75      0.01      0.02       278
          3       0.88      0.31      0.46       547
          4       0.00      0.00      0.00        45
          5       0.00      0.00      0.00       214
          6       0.95      0.07      0.13       250
          7       0.00      0.00      0.00       104
          8       0.00      0.00      0.00        89
         10       1.00      0.01      0.02       191
         12       0.79      0.20      0.32       637
         13       0.00      0.00      0.00        83
         14       0.00      0.00      0.00       120
         15       0.78      0.08      0.15       349
         16       0.51      0.68      0.59      1353
         17       1.00      0.02      0.04       203
         18       0.00      0.00      0.00        72
         19       0.32      0.88      0.47      1911
         20       0.56      0.78      0.65   

  'precision', 'predicted', average, warn_for)


In [51]:
pipeline2 = Pipeline([
    ('bow',CountVectorizer(analyzer=text_process)),
    ('tfidf', TfidfTransformer()),
    ('classifier', RandomForestClassifier())
])

In [52]:
pipeline2.fit(nyt_train, topic_train)

Pipeline(memory=None,
     steps=[('bow', CountVectorizer(analyzer=<function text_process at 0x000000000D5942F0>,
        binary=False, decode_error='strict', dtype=<class 'numpy.int64'>,
        encoding='utf-8', input='content', lowercase=True, max_df=1.0,
        max_features=None, min_df=1, ngram_range=(1, 1), preprocesso...n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))])

In [53]:
predictions2 = pipeline.predict(nyt_test)

In [54]:
print(classification_report(topic_test,predictions2))

             precision    recall  f1-score   support

          1       0.83      0.12      0.21       298
          2       1.00      0.02      0.04       272
          3       0.83      0.29      0.43       538
          4       0.00      0.00      0.00        59
          5       1.00      0.00      0.01       222
          6       0.91      0.04      0.07       268
          7       0.00      0.00      0.00       102
          8       0.00      0.00      0.00        91
         10       1.00      0.01      0.01       189
         12       0.77      0.20      0.32       636
         13       0.00      0.00      0.00        72
         14       0.00      0.00      0.00       103
         15       0.82      0.06      0.12       368
         16       0.50      0.68      0.58      1345
         17       1.00      0.01      0.02       218
         18       0.00      0.00      0.00        71
         19       0.33      0.89      0.48      1946
         20       0.52      0.76      0.62   

  'precision', 'predicted', average, warn_for)


In [55]:
nyt.describe()

Unnamed: 0,Article_ID,Topic_6digit,Topic_4digit,Topic_2digit,War on Terror,Katrina,Israel/Palestine,Immigration,Presidential Elections,Clinton Impeachment,Enron,Darfur,Race/Ethnicity,Schiavo
count,31034.0,31034.0,31034.0,31034.0,31034.0,31034.0,31034.0,31034.0,31034.0,31034.0,31034.0,31034.0,31034.0,31034.0
mean,15517.5,164152.89866,1641.543855,16.279532,0.136205,0.005478,0.034253,0.012341,0.052459,0.014307,0.005993,0.000838,0.014951,0.000741
std,8958.888463,96256.828146,962.52327,9.661223,0.343012,0.073811,0.181881,0.110406,0.222954,0.118755,0.077186,0.028933,0.12136,0.027214
min,1.0,10000.0,100.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,7759.25,120800.0,1208.0,12.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,15517.5,170600.0,1706.0,17.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,23275.75,201000.0,2010.0,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,31034.0,990000.0,9900.0,99.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [56]:
nyt_train.describe()

count                                    21723
unique                                   21308
top       CRISIS IN THE BALKANS: THE OVERVIEW;
freq                                        30
Name: Title, dtype: object

In [57]:
topic_train.describe()

count    21723.000000
mean        16.252359
std          9.491754
min          1.000000
25%         12.000000
50%         17.000000
75%         20.000000
max         99.000000
Name: Topic_2digit, dtype: float64

In [65]:
topic_test.value_counts()

19    1946
16    1345
20    1181
12     636
3      538
29     369
15     368
1      298
2      272
6      268
28     240
5      222
17     218
24     201
10     189
26     165
14     103
7      102
31      98
8       91
30      81
21      79
13      72
18      71
99      62
4       59
27      37
Name: Topic_2digit, dtype: int64

In [66]:
nyt_test.value_counts()

BASEBALL;                                                                                      14
CRISIS IN THE BALKANS: THE OVERVIEW;                                                           14
INTERNATIONAL BUSINESS;                                                                        13
TESTING OF A PRESIDENT: THE OVERVIEW;                                                          11
CONFLICT IN THE BALKANS: THE OVERVIEW;                                                          6
TESTING OF A PRESIDENT: THE PRESIDENT;                                                          5
 SALT LAKE 2002                                                                                 5
THE MARKETS: Market Place;                                                                      5
CRISIS IN THE BALKANS;                                                                          4
NUCLEAR ANXIETY: THE OVERVIEW;                                                                  4
TESTING OF A PRESIDE

Versão Com Stopwords

In [8]:
def text_process(mess):
    """
    1. remove punc
    2. remove stop words
    3. return list of clean text words
    """

    nopunc = [char for char in mess if char not in string.punctuation]
    
    nopunc = ''.join(nopunc)

    return [word for word in nopunc.split() ]

In [9]:
stopwords.words('english')

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 'her',
 'hers',
 'herself',
 'it',
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each',
 'few',
 'more',
 'most',
 'other',
 'some',
 'such',
 'no',
 'nor',
 '

In [29]:
nyt_processed['Topic_2digit'].value_counts()

19    6354
16    4479
20    3958
12    2088
3     1799
29    1273
15    1249
1      964
2      914
6      912
28     769
5      749
17     719
24     715
10     594
26     573
14     410
7      354
31     329
8      299
13     273
21     269
30     268
18     254
99     172
4      168
27     129
Name: Topic_2digit, dtype: int64