In [1]:
import nltk
import numpy as np
import pandas as pd
from textblob import TextBlob

In [2]:
# Load the cleaned articles and add a combined "headline + lead" text column.
# NOTE: pickle files can execute code when loaded -- only read trusted files.
df = pd.read_pickle('data/re_merge/clean.pickle')

# Build the new column and move the old index into a regular column in one
# chained expression instead of mutating the frame in place.
df = df.assign(hl_lead=df['hl'] + ' ' + df['lead']).reset_index()

df.head(3)

Unnamed: 0,index,nyt_id,org,hl,lead,date_pub,hl_lead
0,3,4fc0a09745c1498b0d3ba216,none,marijuana smoking is reported safe; hemp leave...,a panaman judge recently sentenced an american...,1926-11-21,marijuana smoking is reported safe; hemp leave...
1,9,4fc1d8e345c1498b0d4ccb9f,none,use of marijuana spreading in west; poisonous ...,although as appalling in its effects on the hu...,1934-09-16,use of marijuana spreading in west; poisonous ...
2,12,4fc1ebab45c1498b0d528e5b,the associated press,rhode island to end weed as drug source; state...,"providence, r.i., jan. 19. -rhode island autho...",1935-01-20,rhode island to end weed as drug source; state...


In [25]:
# export the combined headline + lead text as a list of strings
def df_to_li(dataframe, col_to_li, n_rows=5):
    """Return the leading values of one DataFrame column as a plain list.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame that contains the column to export.
    col_to_li : str
        Name of the column to export.
    n_rows : int or None, optional
        Number of leading rows to keep. Defaults to 5, which is what the
        previously hard-coded ``.head()`` call returned. Pass ``None`` to
        export every row.

    Returns
    -------
    list
        The selected column values, in row order.
    """
    column = dataframe[[col_to_li]]
    if n_rows is not None:
        column = column.head(n_rows)
    # A single-column frame converts to a list of 1-element rows; unwrap them.
    return [row[0] for row in column.values.tolist()]

# Pull the combined headline + lead text out of the frame and display it.
li = df_to_li(df, 'hl_lead')
li

["marijuana smoking is reported safe; hemp leaves, classed in some states with drugs, tested by canal zone committee. experiment made on 17 panaman judge recently gave sailor a year's sentence for possessing the weed. a panaman judge recently sentenced an american seaman, hamilton main, to a year of penal confinement for smoking and having in his possession cigarettes made of the leaves of the cannabis indica, known also as marijuana, canjac and by various other names, and often incorrectly referred to as hashish.",
 'use of marijuana spreading in west; poisonous weed is being sold quite freely in pool halls and beer gardens. children said to buy it narcotic bureau officials say law gives no authority to stop traffic. although as appalling in its effects on the human mind and body as narcotics, the consumption of marijuana appears to be proceeding, virtually unchecked in colorado and other western states with a large spanish-american population.',
 'rhode island to end weed as drug sou

In [28]:
# stem and tokenize words

def stem_li(li):
    """Tokenize every article and reduce each token to its Porter stem.

    Parameters
    ----------
    li : list of str
        One string per article.

    Returns
    -------
    list of list
        For each article, in input order, the stemmed tokens produced from
        ``TextBlob(article).words``.
    """
    porter = nltk.stem.porter.PorterStemmer()
    return [
        [porter.stem(token) for token in TextBlob(text).words]
        for text in li
    ]

# Stem the exported articles and display the token lists.
li_stemmed = stem_li(li)
li_stemmed

[[u'marijuana',
  u'smoke',
  u'is',
  u'report',
  u'safe',
  u'hemp',
  u'leav',
  u'class',
  u'in',
  u'some',
  u'state',
  u'with',
  u'drug',
  u'test',
  u'by',
  u'canal',
  u'zone',
  u'committe',
  u'experi',
  u'made',
  u'on',
  u'17',
  u'panaman',
  u'judg',
  u'recent',
  u'gave',
  u'sailor',
  u'a',
  u'year',
  u"'s",
  u'sentenc',
  u'for',
  u'possess',
  u'the',
  u'weed',
  u'a',
  u'panaman',
  u'judg',
  u'recent',
  u'sentenc',
  u'an',
  u'american',
  u'seaman',
  u'hamilton',
  u'main',
  u'to',
  u'a',
  u'year',
  u'of',
  u'penal',
  u'confin',
  u'for',
  u'smoke',
  u'and',
  u'have',
  u'in',
  u'hi',
  u'possess',
  u'cigarett',
  u'made',
  u'of',
  u'the',
  u'leav',
  u'of',
  u'the',
  u'cannabi',
  u'indica',
  u'known',
  u'also',
  u'as',
  u'marijuana',
  u'canjac',
  u'and',
  u'by',
  u'variou',
  u'other',
  u'name',
  u'and',
  u'often',
  u'incorrectli',
  u'refer',
  u'to',
  u'as',
  u'hashish'],
 [u'use',
  u'of',
  u'marijuana',
  u'

In [31]:
# remove stopwords
from nltk.corpus import stopwords

def remove_stopwords(li, stop_words=None):
    """Drop stop words from every tokenized article.

    Parameters
    ----------
    li : list of list of str
        One token list per article.
    stop_words : iterable of str, optional
        Words to remove. When omitted, NLTK's English stop-word list
        (``stopwords.words('english')``) is used, as before.

    Returns
    -------
    list of list of str
        The articles in input order, each with its stop words removed and
        the remaining tokens in their original order.
    """
    if stop_words is None:
        stop_words = stopwords.words('english')
    # Build the lookup once: the word list used to be fetched again for every
    # single token, and a set gives constant-time membership tests.
    stop_set = frozenset(stop_words)
    return [
        [word for word in article if word not in stop_set]
        for article in li
    ]
# Filter stop words out of the stemmed articles and display the result.
li_sw = remove_stopwords(li_stemmed)
li_sw

[[u'marijuana',
  u'smoke',
  u'report',
  u'safe',
  u'hemp',
  u'leav',
  u'class',
  u'state',
  u'drug',
  u'test',
  u'canal',
  u'zone',
  u'committe',
  u'experi',
  u'made',
  u'17',
  u'panaman',
  u'judg',
  u'recent',
  u'gave',
  u'sailor',
  u'year',
  u"'s",
  u'sentenc',
  u'possess',
  u'weed',
  u'panaman',
  u'judg',
  u'recent',
  u'sentenc',
  u'american',
  u'seaman',
  u'hamilton',
  u'main',
  u'year',
  u'penal',
  u'confin',
  u'smoke',
  u'hi',
  u'possess',
  u'cigarett',
  u'made',
  u'leav',
  u'cannabi',
  u'indica',
  u'known',
  u'also',
  u'marijuana',
  u'canjac',
  u'variou',
  u'name',
  u'often',
  u'incorrectli',
  u'refer',
  u'hashish'],
 [u'use',
  u'marijuana',
  u'spread',
  u'west',
  u'poison',
  u'weed',
  u'sold',
  u'quit',
  u'freeli',
  u'pool',
  u'hall',
  u'beer',
  u'garden',
  u'children',
  u'said',
  u'buy',
  u'narcot',
  u'bureau',
  u'offici',
  u'say',
  u'law',
  u'give',
  u'author',
  u'stop',
  u'traffic',
  u'although',


In [30]:
# join words
def join_words(li):
    """Join each article's tokens back into one space-separated string.

    Parameters
    ----------
    li : list of list of str
        One token list per article.

    Returns
    -------
    list of str
        One string per article, in input order.
    """
    # Iterate over the argument: the loop used to read the notebook-level
    # ``li_sw`` variable, so whatever was passed in was silently ignored.
    return [str(' '.join(article)) for article in li]
# Rebuild one string per article from the filtered tokens and display them.
documents = join_words(li_sw)
documents

["marijuana smoke report safe hemp leav class state drug test canal zone committe experi made 17 panaman judg recent gave sailor year 's sentenc possess weed panaman judg recent sentenc american seaman hamilton main year penal confin smoke hi possess cigarett made leav cannabi indica known also marijuana canjac variou name often incorrectli refer hashish",
 'use marijuana spread west poison weed sold quit freeli pool hall beer garden children said buy narcot bureau offici say law give author stop traffic although appal effect human mind bodi narcot consumpt marijuana appear proceed virtual uncheck colorado western state larg spanish-american popul',
 'rhode island end weed drug sourc state plan drive erad marijuana plant wide traffic hashish provid r.i jan 19 rhode island author plan spring drive erad marijuana mexico weed long ha sourc larg suppli danger narcot drug known hashish',
 'polic studi marijuana kill grow crop enabl policemen familiar themselv appear marijuana pot narcot wee

In [33]:
from sklearn.feature_extraction.text import CountVectorizer

def get_sparse_matrix(documents, vectorizer=None):
    """Convert documents into a bag-of-words count matrix.

    Parameters
    ----------
    documents : list of str
        One string per document.
    vectorizer : CountVectorizer, optional
        Instance to fit on ``documents``. Pass one in when the fitted
        vocabulary is needed afterwards (e.g. to label columns). When
        omitted, a fresh ``CountVectorizer(ngram_range=(1, 2))`` (unigrams
        and bigrams) is created, as before.

    Returns
    -------
    sparse matrix
        Term counts with one row per document and one column per
        vocabulary entry.
    """
    if vectorizer is None:
        vectorizer = CountVectorizer(ngram_range=(1, 2))
    # fit_transform builds the vocabulary and converts the text to counts in
    # one call, replacing the separate fit() / transform() steps.
    return vectorizer.fit_transform(documents)

# Keep the fitted vectorizer at notebook level: get_tf_df reads `vectorizer`
# to label its columns, and a vectorizer created inside the function would be
# lost when the function returns.
vectorizer = CountVectorizer(ngram_range=(1, 2))
sparse_matrix = get_sparse_matrix(documents, vectorizer); sparse_matrix

<5x335 sparse matrix of type '<type 'numpy.int64'>'
	with 363 stored elements in Compressed Sparse Row format>

In [51]:
def get_tf_df(sparse_matrix, feature_names=None):
    """Expand a sparse term-count matrix into a labelled DataFrame.

    Parameters
    ----------
    sparse_matrix : sparse matrix
        Term counts; must provide ``toarray()``.
    feature_names : sequence of str, optional
        Column labels, one per matrix column. When omitted they are read
        from a notebook-level ``vectorizer`` variable, which must already
        exist and be fitted -- the caller is responsible for that.

    Returns
    -------
    pandas.DataFrame
        Dense term-frequency table with one row per matrix row and one
        column per term.
    """
    if feature_names is None:
        # Newer scikit-learn exposes get_feature_names_out(); older releases
        # only have get_feature_names(). Use whichever is available.
        getter = getattr(vectorizer, 'get_feature_names_out', None)
        if getter is None:
            getter = vectorizer.get_feature_names
        feature_names = getter()
    dense_counts = sparse_matrix.toarray()
    return pd.DataFrame(dense_counts, columns=feature_names)
# Build the dense term-frequency table and preview its first rows.
df_tf = get_tf_df(sparse_matrix)
df_tf.head(3)

Unnamed: 0,000,000 000,000 bonfir,17,17 panaman,19,19 rhode,also,also marijuana,although,...,western state,wide,wide traffic,year,year penal,year sentenc,yesterday,yesterday presenc,zone,zone committe
0,0,0,0,1,1,0,0,1,1,0,...,0,0,0,2,1,1,0,0,1,1
1,0,0,0,0,0,0,0,0,0,1,...,1,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,1,1,0,0,0,...,0,1,1,0,0,0,0,0,0,0


In [36]:
# IDF: inverse document frequency of each term across the corpus
def calc_idf(dataframe):
    """Compute the inverse document frequency of every term.

    For each column the result is ``log2(N / n_t)``, where ``N`` is the
    number of rows (documents) and ``n_t`` is the number of rows in which
    that column is non-zero.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Term-frequency table: one row per document, one column per term.

    Returns
    -------
    pandas.Series
        IDF weight per term, indexed by the frame's column labels.

    Notes
    -----
    A column that is zero in every row has ``n_t == 0``; the division is
    then by zero, so that term does not get a finite weight.
    """
    # float() keeps this a true division even under Python 2 semantics.
    num_docs = float(dataframe.shape[0])
    # astype(bool) turns counts into presence flags, so the column sum is the
    # number of documents that contain the term.
    doc_freq = dataframe.astype(bool).sum(axis=0)
    return np.log2(num_docs / doc_freq)
# Compute the per-term IDF weights and display them.
idf = calc_idf(df_tf)
idf

000                  2.321928
000 000              2.321928
000 bonfir           2.321928
17                   2.321928
17 panaman           2.321928
19                   2.321928
19 rhode             2.321928
also                 2.321928
also marijuana       2.321928
although             2.321928
although appal       2.321928
american             1.321928
american popul       2.321928
american seaman      2.321928
appal                2.321928
appal effect         2.321928
appear               1.321928
appear marijuana     2.321928
appear proceed       2.321928
assembl              2.321928
assembl room         2.321928
author               1.321928
author plan          2.321928
author stop          2.321928
bagatel              2.321928
bagatel machin       2.321928
beer                 2.321928
beer garden          2.321928
bodi                 2.321928
bodi narcot          2.321928
                       ...   
valentin             2.321928
valentin captain     2.321928
variou    

In [50]:
# TF-IDF: scale every term-count column by that term's IDF weight.
# The Series index (terms) is matched against the DataFrame columns, so each
# row is multiplied element-wise -- the same idea as numpy broadcasting a 1-D
# array across the rows of a 2-D array, e.g.
#   np.array([[2, 2, 2], [4, 4, 3]]) * np.array([1, 2, 4])
df_tf.mul(idf, axis='columns')

Unnamed: 0,000,000 000,000 bonfir,17,17 panaman,19,19 rhode,also,also marijuana,although,...,western state,wide,wide traffic,year,year penal,year sentenc,yesterday,yesterday presenc,zone,zone committe
0,0.0,0.0,0.0,2.321928,2.321928,0.0,0.0,2.321928,2.321928,0.0,...,0.0,0.0,0.0,4.643856,2.321928,2.321928,0.0,0.0,2.321928,2.321928
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.321928,...,2.321928,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,2.321928,2.321928,0.0,0.0,0.0,...,0.0,2.321928,2.321928,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,4.643856,2.321928,2.321928,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,2.321928,2.321928,0.0,0.0
